[Qemu-devel] [PATCH-RFC 01/13] virtio: export virtqueue structure

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] [PATCH-RFC 01/13] virtio: export virtqueue structure
       [not found] <cover.1263230079.git.mst@redhat.com>
@ 2010-01-11 17:16 ` Michael S. Tsirkin
  2010-01-12 22:32   ` [Qemu-devel] " Anthony Liguori
  2010-01-11 17:17 ` [Qemu-devel] [PATCH-RFC 02/13] kvm: add API to set ioeventfd Michael S. Tsirkin
                   ` (11 subsequent siblings)
  12 siblings, 1 reply; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:16 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

vhost needs physical addresses for
ring so expose that structure.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio.c |   18 ------------------
 hw/virtio.h |   17 +++++++++++++++++
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/hw/virtio.c b/hw/virtio.c
index 3c609ce..8e3c9ad 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -49,24 +49,6 @@ typedef struct VRingUsed
     VRingUsedElem ring[0];
 } VRingUsed;
 
-typedef struct VRing
-{
-    unsigned int num;
-    target_phys_addr_t desc;
-    target_phys_addr_t avail;
-    target_phys_addr_t used;
-} VRing;
-
-struct VirtQueue
-{
-    VRing vring;
-    target_phys_addr_t pa;
-    uint16_t last_avail_idx;
-    int inuse;
-    uint16_t vector;
-    void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
-};
-
 #define VIRTIO_PCI_QUEUE_MAX        16
 
 /* virt queue functions */
diff --git a/hw/virtio.h b/hw/virtio.h
index 3994cc9..ca840e1 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -180,5 +180,22 @@ void virtio_net_exit(VirtIODevice *vdev);
 	DEFINE_PROP_BIT("indirect_desc", _state, _field, \
 			VIRTIO_RING_F_INDIRECT_DESC, true)
 
+typedef struct VRing
+{
+    unsigned int num;
+    target_phys_addr_t desc;
+    target_phys_addr_t avail;
+    target_phys_addr_t used;
+} VRing;
+
+struct VirtQueue
+{
+    VRing vring;
+    target_phys_addr_t pa;
+    uint16_t last_avail_idx;
+    int inuse;
+    uint16_t vector;
+    void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
+};
 
 #endif
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 02/13] kvm: add API to set ioeventfd
       [not found] <cover.1263230079.git.mst@redhat.com>
  2010-01-11 17:16 ` [Qemu-devel] [PATCH-RFC 01/13] virtio: export virtqueue structure Michael S. Tsirkin
@ 2010-01-11 17:17 ` Michael S. Tsirkin
  2010-01-12 22:35   ` [Qemu-devel] " Anthony Liguori
  2010-01-11 17:17 ` [Qemu-devel] [PATCH-RFC 03/13] virtio: add iofd/irqfd support Michael S. Tsirkin
                   ` (10 subsequent siblings)
  12 siblings, 1 reply; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:17 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 kvm-all.c |   24 ++++++++++++++++++++++++
 kvm.h     |    3 +++
 2 files changed, 27 insertions(+), 0 deletions(-)

diff --git a/kvm-all.c b/kvm-all.c
index a312654..aa00119 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1113,3 +1113,27 @@ void kvm_remove_all_breakpoints(CPUState *current_env)
 {
 }
 #endif /* !KVM_CAP_SET_GUEST_DEBUG */
+
+#ifdef KVM_IOEVENTFD
+int kvm_set_ioeventfd(uint16_t addr, uint16_t data, int fd, bool assigned)
+{
+    struct kvm_ioeventfd kick = {
+        .datamatch = data,
+        .addr = addr,
+        .len = 2,
+        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
+        .fd = fd,
+    };
+    if (!assigned)
+        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
+    int r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
+    if (r < 0)
+        return r;
+    return 0;
+}
+#else
+int kvm_set_ioeventfd(uint16_t data, uint16_t addr, int fd, bool assigned)
+{
+    return -ENOSYS;
+}
+#endif
diff --git a/kvm.h b/kvm.h
index 672d511..2d723b8 100644
--- a/kvm.h
+++ b/kvm.h
@@ -14,6 +14,7 @@
 #ifndef QEMU_KVM_H
 #define QEMU_KVM_H
 
+#include <stdbool.h>
 #include "config.h"
 #include "qemu-queue.h"
 
@@ -131,4 +132,6 @@ static inline void cpu_synchronize_state(CPUState *env)
     }
 }
 
+int kvm_set_ioeventfd(uint16_t addr, uint16_t data, int fd, bool assigned);
+
 #endif
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 03/13] virtio: add iofd/irqfd support
       [not found] <cover.1263230079.git.mst@redhat.com>
  2010-01-11 17:16 ` [Qemu-devel] [PATCH-RFC 01/13] virtio: export virtqueue structure Michael S. Tsirkin
  2010-01-11 17:17 ` [Qemu-devel] [PATCH-RFC 02/13] kvm: add API to set ioeventfd Michael S. Tsirkin
@ 2010-01-11 17:17 ` Michael S. Tsirkin
  2010-01-12 22:36   ` [Qemu-devel] " Anthony Liguori
  2010-01-11 17:17 ` [Qemu-devel] [PATCH-RFC 04/13] virtio-pci: fill in irqfd/queufd support Michael S. Tsirkin
                   ` (9 subsequent siblings)
  12 siblings, 1 reply; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:17 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

Add binding API to set iofd/irqfd support.
Will be used by vhost.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio.c |   13 ++++++++++---
 hw/virtio.h |    4 ++++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/hw/virtio.c b/hw/virtio.c
index 8e3c9ad..b9ec863 100644
--- a/hw/virtio.c
+++ b/hw/virtio.c
@@ -572,6 +572,12 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
     return &vdev->vq[i];
 }
 
+void virtio_irq(VirtIODevice *vdev, VirtQueue *vq)
+{
+    vdev->isr |= 0x01;
+    virtio_notify_vector(vdev, vq->vector);
+}
+
 void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
 {
     /* Make sure used ring is updated before we check NO_INTERRUPT. */
@@ -582,8 +588,7 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
          (vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx)))
         return;
 
-    vdev->isr |= 0x01;
-    virtio_notify_vector(vdev, vq->vector);
+    virtio_irq(vdev, vq);
 }
 
 void virtio_notify_config(VirtIODevice *vdev)
@@ -696,8 +701,10 @@ VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
     vdev->queue_sel = 0;
     vdev->config_vector = VIRTIO_NO_VECTOR;
     vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
-    for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++)
+    for(i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
         vdev->vq[i].vector = VIRTIO_NO_VECTOR;
+        vdev->vq[i].vdev = vdev;
+    }
 
     vdev->name = name;
     vdev->config_len = config_size;
diff --git a/hw/virtio.h b/hw/virtio.h
index ca840e1..193b3f9 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -88,6 +88,8 @@ typedef struct {
     int (*load_config)(void * opaque, QEMUFile *f);
     int (*load_queue)(void * opaque, int n, QEMUFile *f);
     unsigned (*get_features)(void * opaque);
+    int (*set_irqfd)(void * opaque, int n, int fd, bool assigned);
+    int (*set_queuefd)(void * opaque, int n, int fd, bool assigned);
 } VirtIOBindings;
 
 #define VIRTIO_PCI_QUEUE_MAX 16
@@ -196,6 +198,8 @@ struct VirtQueue
     int inuse;
     uint16_t vector;
     void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
+    VirtIODevice *vdev;
 };
 
+void virtio_irq(VirtIODevice *vdev, VirtQueue *vq);
 #endif
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 04/13] virtio-pci: fill in irqfd/queufd support
       [not found] <cover.1263230079.git.mst@redhat.com>
                   ` (2 preceding siblings ...)
  2010-01-11 17:17 ` [Qemu-devel] [PATCH-RFC 03/13] virtio: add iofd/irqfd support Michael S. Tsirkin
@ 2010-01-11 17:17 ` Michael S. Tsirkin
  2010-01-11 17:19 ` [Qemu-devel] [PATCH-RFC 05/13] syborg_virtio: add irqfd/eventfd support Michael S. Tsirkin
                   ` (8 subsequent siblings)
  12 siblings, 0 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:17 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

Support irqfd/queuefd. The last one only with kvm, that's
okay because vhost relies on kvm anyway.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio-pci.c |   26 ++++++++++++++++++++++++++
 1 files changed, 26 insertions(+), 0 deletions(-)

diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 6d0f9dd..5a3be6b 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -23,6 +23,7 @@
 #include "msix.h"
 #include "net.h"
 #include "loader.h"
+#include "kvm.h"
 
 /* from Linux's linux/virtio_pci.h */
 
@@ -388,6 +389,29 @@ static unsigned virtio_pci_get_features(void *opaque)
     return proxy->host_features;
 }
 
+static void virtio_pci_irqfd_read(void *opaque)
+{
+    VirtQueue *vq = opaque;
+    virtio_irq(vq->vdev, vq);
+}
+
+static int virtio_pci_irqfd(void * opaque, int n, int fd, bool assign)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    VirtQueue *vq = &proxy->vdev->vq[n];
+
+    qemu_set_fd_handler(fd, assign ? virtio_pci_irqfd_read : NULL, NULL, vq);
+    return 0;
+}
+
+static int virtio_pci_queuefd(void * opaque, int n, int fd, bool assign)
+{
+    VirtIOPCIProxy *proxy = opaque;
+    return kvm_set_ioeventfd(proxy->addr + VIRTIO_PCI_QUEUE_NOTIFY,
+                             n, fd, assign);
+}
+
+
 static const VirtIOBindings virtio_pci_bindings = {
     .notify = virtio_pci_notify,
     .save_config = virtio_pci_save_config,
@@ -395,6 +419,8 @@ static const VirtIOBindings virtio_pci_bindings = {
     .save_queue = virtio_pci_save_queue,
     .load_queue = virtio_pci_load_queue,
     .get_features = virtio_pci_get_features,
+    .set_irqfd = virtio_pci_irqfd,
+    .set_queuefd = virtio_pci_queuefd,
 };
 
 static void virtio_init_pci(VirtIOPCIProxy *proxy, VirtIODevice *vdev,
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 05/13] syborg_virtio: add irqfd/eventfd support
       [not found] <cover.1263230079.git.mst@redhat.com>
                   ` (3 preceding siblings ...)
  2010-01-11 17:17 ` [Qemu-devel] [PATCH-RFC 04/13] virtio-pci: fill in irqfd/queufd support Michael S. Tsirkin
@ 2010-01-11 17:19 ` Michael S. Tsirkin
  2010-01-11 17:20 ` [Qemu-devel] [PATCH-RFC 06/13] s390-virtio: fill in irqfd support Michael S. Tsirkin
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:19 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

No idea if it's right .. and syborg does not support vectors so we won't
get performance gains out of it either - so quite likely it's best to
just keep this patch out of qemu.  But just to show what's possible.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/syborg_virtio.c |   25 +++++++++++++++++++++++++
 1 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/hw/syborg_virtio.c b/hw/syborg_virtio.c
index 65239a0..ef6364f 100644
--- a/hw/syborg_virtio.c
+++ b/hw/syborg_virtio.c
@@ -249,9 +249,34 @@ static unsigned syborg_virtio_get_features(void *opaque)
     return proxy->host_features;
 }
 
+static void syborg_virtio_irqfd_read(void *opaque)
+{
+    VirtQueue *vq = opaque;
+    virtio_irq(vq->vdev, vq);
+}
+
+static int syborg_virtio_irqfd(void * opaque, int n, int fd, bool assign)
+{
+    SyborgVirtIOProxy *proxy = opaque;
+    VirtQueue *vq = &proxy->vdev->vq[n];
+
+    qemu_set_fd_handler(fd, assign ? syborg_virtio_irqfd_read : NULL, NULL, vq);
+    return 0;
+}
+
+static int syborg_virtio_queuefd(void * opaque, int n, int fd, bool assign)
+{
+    SyborgVirtIOProxy *proxy = opaque;
+    return kvm_set_ioeventfd(proxy->busdev.mmio[0].addr +
+                             SYBORG_VIRTIO_QUEUE_NOTIFY << 2,
+                             n, fd, assign);
+}
+
 static VirtIOBindings syborg_virtio_bindings = {
     .notify = syborg_virtio_update_irq,
     .get_features = syborg_virtio_get_features,
+    .set_irqfd = syborg_virtio_irqfd,
+    .set_queuefd = syborg_virtio_queuefd,
 };
 
 static int syborg_virtio_init(SyborgVirtIOProxy *proxy, VirtIODevice *vdev)
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 06/13] s390-virtio: fill in irqfd support
       [not found] <cover.1263230079.git.mst@redhat.com>
                   ` (4 preceding siblings ...)
  2010-01-11 17:19 ` [Qemu-devel] [PATCH-RFC 05/13] syborg_virtio: add irqfd/eventfd support Michael S. Tsirkin
@ 2010-01-11 17:20 ` Michael S. Tsirkin
  2010-01-11 17:20 ` [Qemu-devel] [PATCH-RFC 07/13] virtio: move typedef to qemu-common Michael S. Tsirkin
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:20 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

Don't know how to make queuefd work: probably need support for binding
hypercalls to eventfd in kvm?  Again, just a demonstration, probably not
for commit, because s390 does not have vector support.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/s390-virtio-bus.c |   16 ++++++++++++++++
 1 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/hw/s390-virtio-bus.c b/hw/s390-virtio-bus.c
index 980e7eb..b663972 100644
--- a/hw/s390-virtio-bus.c
+++ b/hw/s390-virtio-bus.c
@@ -315,11 +315,27 @@ static unsigned virtio_s390_get_features(void *opaque)
     return dev->host_features;
 }
 
+static void virtio_s390_irqfd_read(void *opaque)
+{
+    VirtQueue *vq = opaque;
+    virtio_irq(vq->vdev, vq);
+}
+
+static int virtio_s390_irqfd(void * opaque, int n, int fd, bool assign)
+{
+    VirtIOS390Device *dev = opaque;
+    VirtQueue *vq = &dev->vdev->vq[n];
+
+    qemu_set_fd_handler(fd, assign ? virtio_s390_irqfd_read : NULL, NULL, vq);
+    return 0;
+}
+
 /**************** S390 Virtio Bus Device Descriptions *******************/
 
 static const VirtIOBindings virtio_s390_bindings = {
     .notify = virtio_s390_notify,
     .get_features = virtio_s390_get_features,
+    .set_irqfd = virtio_s390_irqfd,
 };
 
 static VirtIOS390DeviceInfo s390_virtio_net = {
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 07/13] virtio: move typedef to qemu-common
       [not found] <cover.1263230079.git.mst@redhat.com>
                   ` (5 preceding siblings ...)
  2010-01-11 17:20 ` [Qemu-devel] [PATCH-RFC 06/13] s390-virtio: fill in irqfd support Michael S. Tsirkin
@ 2010-01-11 17:20 ` Michael S. Tsirkin
  2010-01-11 17:20 ` [Qemu-devel] [PATCH-RFC 08/13] net/tap: add interface to get device fd Michael S. Tsirkin
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:20 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

make it possible to use type without header include

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio.h   |    1 -
 qemu-common.h |    1 +
 2 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/hw/virtio.h b/hw/virtio.h
index 193b3f9..6a7e3ec 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -67,7 +67,6 @@ static inline target_phys_addr_t vring_align(target_phys_addr_t addr,
 }
 
 typedef struct VirtQueue VirtQueue;
-typedef struct VirtIODevice VirtIODevice;
 
 #define VIRTQUEUE_MAX_SIZE 1024
 
diff --git a/qemu-common.h b/qemu-common.h
index 8630f8c..8bb1f83 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -217,6 +217,7 @@ typedef struct uWireSlave uWireSlave;
 typedef struct I2SCodec I2SCodec;
 typedef struct DeviceState DeviceState;
 typedef struct SSIBus SSIBus;
+typedef struct VirtIODevice VirtIODevice;
 
 /* CPU save/load.  */
 void cpu_save(QEMUFile *f, void *opaque);
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 08/13] net/tap: add interface to get device fd
       [not found] <cover.1263230079.git.mst@redhat.com>
                   ` (6 preceding siblings ...)
  2010-01-11 17:20 ` [Qemu-devel] [PATCH-RFC 07/13] virtio: move typedef to qemu-common Michael S. Tsirkin
@ 2010-01-11 17:20 ` Michael S. Tsirkin
  2010-01-11 17:22 ` [Qemu-devel] [PATCH-RFC 09/13] tap: add vhost/vhostfd options Michael S. Tsirkin
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:20 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

Will be used by vhost to attach/detach to backend.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 net/tap.c |    7 +++++++
 net/tap.h |    2 ++
 2 files changed, 9 insertions(+), 0 deletions(-)

diff --git a/net/tap.c b/net/tap.c
index d3492de..7e9ca79 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -269,6 +269,13 @@ static void tap_poll(VLANClientState *nc, bool enable)
     tap_write_poll(s, enable);
 }
 
+int tap_get_fd(VLANClientState *nc)
+{
+    TAPState *s = DO_UPCAST(TAPState, nc, nc);
+    assert(nc->info->type == NET_CLIENT_TYPE_TAP);
+    return s->fd;
+}
+
 /* fd support */
 
 static NetClientInfo net_tap_info = {
diff --git a/net/tap.h b/net/tap.h
index 538a562..a244b28 100644
--- a/net/tap.h
+++ b/net/tap.h
@@ -48,4 +48,6 @@ int tap_probe_vnet_hdr(int fd);
 int tap_probe_has_ufo(int fd);
 void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo);
 
+int tap_get_fd(VLANClientState *vc);
+
 #endif /* QEMU_NET_TAP_H */
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 09/13] tap: add vhost/vhostfd options
       [not found] <cover.1263230079.git.mst@redhat.com>
                   ` (7 preceding siblings ...)
  2010-01-11 17:20 ` [Qemu-devel] [PATCH-RFC 08/13] net/tap: add interface to get device fd Michael S. Tsirkin
@ 2010-01-11 17:22 ` Michael S. Tsirkin
  2010-01-12 22:39   ` [Qemu-devel] " Anthony Liguori
  2010-01-12 22:42   ` Anthony Liguori
  2010-01-11 17:22 ` [Qemu-devel] [PATCH-RFC 10/13] tap: add API to retrieve vhost net header Michael S. Tsirkin
                   ` (3 subsequent siblings)
  12 siblings, 2 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:22 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

Looks like order got mixed up: vhost_net header
is added by a follow-up patch. Will be fixed
in the next revision.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 net.c           |    8 ++++++++
 net/tap.c       |   29 +++++++++++++++++++++++++++++
 qemu-options.hx |    4 +++-
 3 files changed, 40 insertions(+), 1 deletions(-)

diff --git a/net.c b/net.c
index 6ef93e6..b942d03 100644
--- a/net.c
+++ b/net.c
@@ -976,6 +976,14 @@ static struct {
                 .name = "vnet_hdr",
                 .type = QEMU_OPT_BOOL,
                 .help = "enable the IFF_VNET_HDR flag on the tap interface"
+            }, {
+                .name = "vhost",
+                .type = QEMU_OPT_BOOL,
+                .help = "enable vhost-net network accelerator",
+            }, {
+                .name = "vhostfd",
+                .type = QEMU_OPT_STRING,
+                .help = "file descriptor of an already opened vhost net device",
             },
 #endif /* _WIN32 */
             { /* end of list */ }
diff --git a/net/tap.c b/net/tap.c
index 7e9ca79..d9f2e41 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -41,6 +41,8 @@
 
 #include "net/tap-linux.h"
 
+#include "hw/vhost_net.h"
+
 /* Maximum GSO packet size (64k) plus plenty of room for
  * the ethernet and virtio_net headers
  */
@@ -57,6 +59,7 @@ typedef struct TAPState {
     unsigned int has_vnet_hdr : 1;
     unsigned int using_vnet_hdr : 1;
     unsigned int has_ufo: 1;
+    struct vhost_net *vhost_net;
 } TAPState;
 
 static int launch_script(const char *setup_script, const char *ifname, int fd);
@@ -252,6 +255,10 @@ static void tap_cleanup(VLANClientState *nc)
 {
     TAPState *s = DO_UPCAST(TAPState, nc, nc);
 
+    if (s->vhost_net) {
+        vhost_net_cleanup(s->vhost_net);
+    }
+
     qemu_purge_queued_packets(nc);
 
     if (s->down_script[0])
@@ -307,6 +314,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan,
     s->has_ufo = tap_probe_has_ufo(s->fd);
     tap_set_offload(&s->nc, 0, 0, 0, 0, 0);
     tap_read_poll(s, 1);
+    s->vhost_net = NULL;
     return s;
 }
 
@@ -456,6 +464,27 @@ int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan
         }
     }
 
+    if (qemu_opt_get_bool(opts, "vhost", 0)) {
+        int vhostfd, r;
+        if (qemu_opt_get(opts, "vhostfd")) {
+            r = net_handle_fd_param(mon, qemu_opt_get(opts, "vhostfd"));
+            if (r == -1) {
+                return -1;
+            }
+            vhostfd = r;
+        } else {
+            vhostfd = -1;
+        }
+        s->vhost_net = vhost_net_init(&s->nc, vhostfd);
+        if (!s->vhost_net) {
+            qemu_error("vhost-net requested but could not be initialized\n");
+            return -1;
+        }
+    } else if (qemu_opt_get(opts, "vhostfd")) {
+        qemu_error("vhostfd= is not valid without vhost\n");
+        return -1;
+    }
+
     if (vlan) {
         vlan->nb_host_devs++;
     }
diff --git a/qemu-options.hx b/qemu-options.hx
index ecd50eb..ef185dd 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -813,7 +813,7 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
     "-net tap[,vlan=n][,name=str],ifname=name\n"
     "                connect the host TAP network interface to VLAN 'n'\n"
 #else
-    "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off]\n"
+    "-net tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,sndbuf=nbytes][,vnet_hdr=on|off][,vhost=on|off][,vhostfd=h]\n"
     "                connect the host TAP network interface to VLAN 'n' and use the\n"
     "                network scripts 'file' (default=%s)\n"
     "                and 'dfile' (default=%s);\n"
@@ -823,6 +823,8 @@ DEF("net", HAS_ARG, QEMU_OPTION_net,
     "                default of 'sndbuf=1048576' can be disabled using 'sndbuf=0'\n"
     "                use vnet_hdr=off to avoid enabling the IFF_VNET_HDR tap flag; use\n"
     "                vnet_hdr=on to make the lack of IFF_VNET_HDR support an error condition\n"
+    "                use vhost=on to enable experimental in kernel accelerator\n"
+    "                use 'vhostfd=h' to connect to an already opened vhost net device\n"
 #endif
     "-net socket[,vlan=n][,name=str][,fd=h][,listen=[host]:port][,connect=host:port]\n"
     "                connect the vlan 'n' to another VLAN using a socket connection\n"
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 10/13] tap: add API to retrieve vhost net header
       [not found] <cover.1263230079.git.mst@redhat.com>
                   ` (8 preceding siblings ...)
  2010-01-11 17:22 ` [Qemu-devel] [PATCH-RFC 09/13] tap: add vhost/vhostfd options Michael S. Tsirkin
@ 2010-01-11 17:22 ` Michael S. Tsirkin
  2010-01-11 17:23 ` [Qemu-devel] [PATCH-RFC 12/13] virtio: add status change callback Michael S. Tsirkin
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:22 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

same comment as patch 09.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 net/tap.c |    7 +++++++
 net/tap.h |    3 +++
 2 files changed, 10 insertions(+), 0 deletions(-)

diff --git a/net/tap.c b/net/tap.c
index d9f2e41..166cf05 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -491,3 +491,10 @@ int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan
 
     return 0;
 }
+
+struct vhost_net *tap_get_vhost_net(VLANClientState *nc)
+{
+    TAPState *s = DO_UPCAST(TAPState, nc, nc);
+    assert(nc->info->type == NET_CLIENT_TYPE_TAP);
+    return s->vhost_net;
+}
diff --git a/net/tap.h b/net/tap.h
index a244b28..b8cec83 100644
--- a/net/tap.h
+++ b/net/tap.h
@@ -50,4 +50,7 @@ void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo);
 
 int tap_get_fd(VLANClientState *vc);
 
+struct vhost_net;
+struct vhost_net *tap_get_vhost_net(VLANClientState *vc);
+
 #endif /* QEMU_NET_TAP_H */
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 12/13] virtio: add status change callback
       [not found] <cover.1263230079.git.mst@redhat.com>
                   ` (9 preceding siblings ...)
  2010-01-11 17:22 ` [Qemu-devel] [PATCH-RFC 10/13] tap: add API to retrieve vhost net header Michael S. Tsirkin
@ 2010-01-11 17:23 ` Michael S. Tsirkin
  2010-01-11 17:23 ` [Qemu-devel] [PATCH-RFC 13/13] virtio-net: connect to vhost net backend Michael S. Tsirkin
  2010-01-11 17:23 ` [Qemu-devel] [PATCH-RFC 11/13] vhost net support Michael S. Tsirkin
  12 siblings, 0 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:23 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

vhost net backend needs to be notified when
frontend status changes. Add a callback.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/s390-virtio-bus.c |    3 +++
 hw/syborg_virtio.c   |    2 ++
 hw/virtio-pci.c      |    6 ++++++
 hw/virtio.h          |    1 +
 4 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/hw/s390-virtio-bus.c b/hw/s390-virtio-bus.c
index b663972..1f8a04f 100644
--- a/hw/s390-virtio-bus.c
+++ b/hw/s390-virtio-bus.c
@@ -243,6 +243,9 @@ void s390_virtio_device_update_status(VirtIOS390Device *dev)
     uint32_t features;
 
     vdev->status = ldub_phys(dev->dev_offs + VIRTIO_DEV_OFFS_STATUS);
+    if (vdev->set_status) {
+        vdev->set_status(vdev);
+    }
 
     /* Update guest supported feature bitmap */
 
diff --git a/hw/syborg_virtio.c b/hw/syborg_virtio.c
index ef6364f..264c1f7 100644
--- a/hw/syborg_virtio.c
+++ b/hw/syborg_virtio.c
@@ -152,6 +152,8 @@ static void syborg_virtio_writel(void *opaque, target_phys_addr_t offset,
         vdev->status = value & 0xFF;
         if (vdev->status == 0)
             virtio_reset(vdev);
+        if (vdev->set_status)
+            vdev->set_status(vdev);
         break;
     case SYBORG_VIRTIO_INT_ENABLE:
         s->int_enable = value;
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 5a3be6b..fa0a7a0 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -209,6 +209,9 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
             virtio_reset(proxy->vdev);
             msix_unuse_all_vectors(&proxy->pci_dev);
         }
+        if (vdev->set_status) {
+            vdev->set_status(vdev);
+        }
         break;
     case VIRTIO_MSI_CONFIG_VECTOR:
         msix_vector_unuse(&proxy->pci_dev, vdev->config_vector);
@@ -376,6 +379,9 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
     if (PCI_COMMAND == address) {
         if (!(val & PCI_COMMAND_MASTER)) {
             proxy->vdev->status &= !VIRTIO_CONFIG_S_DRIVER_OK;
+            if (proxy->vdev->set_status) {
+                proxy->vdev->set_status(proxy->vdev);
+            }
         }
     }
 
diff --git a/hw/virtio.h b/hw/virtio.h
index 6a7e3ec..76734ac 100644
--- a/hw/virtio.h
+++ b/hw/virtio.h
@@ -112,6 +112,7 @@ struct VirtIODevice
     void (*get_config)(VirtIODevice *vdev, uint8_t *config);
     void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
     void (*reset)(VirtIODevice *vdev);
+    void (*set_status)(VirtIODevice *vdev);
     VirtQueue *vq;
     const VirtIOBindings *binding;
     void *binding_opaque;
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
       [not found] <cover.1263230079.git.mst@redhat.com>
                   ` (10 preceding siblings ...)
  2010-01-11 17:23 ` [Qemu-devel] [PATCH-RFC 12/13] virtio: add status change callback Michael S. Tsirkin
@ 2010-01-11 17:23 ` Michael S. Tsirkin
  2010-01-25 19:58   ` [Qemu-devel] " Anthony Liguori
  2010-01-11 17:23 ` [Qemu-devel] [PATCH-RFC 11/13] vhost net support Michael S. Tsirkin
  12 siblings, 1 reply; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:23 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

start/stop backend on driver start/stop

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
 1 files changed, 40 insertions(+), 0 deletions(-)

diff --git a/hw/virtio-net.c b/hw/virtio-net.c
index c2a389f..99169e1 100644
--- a/hw/virtio-net.c
+++ b/hw/virtio-net.c
@@ -17,6 +17,7 @@
 #include "net/tap.h"
 #include "qemu-timer.h"
 #include "virtio-net.h"
+#include "vhost_net.h"
 
 #define VIRTIO_NET_VM_VERSION    11
 
@@ -47,6 +48,7 @@ typedef struct VirtIONet
     uint8_t nomulti;
     uint8_t nouni;
     uint8_t nobcast;
+    uint8_t vhost_started;
     struct {
         int in_use;
         int first_multi;
@@ -114,6 +116,10 @@ static void virtio_net_reset(VirtIODevice *vdev)
     n->nomulti = 0;
     n->nouni = 0;
     n->nobcast = 0;
+    if (n->vhost_started) {
+        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
+        n->vhost_started = 0;
+    }
 
     /* Flush any MAC and VLAN filter table state */
     n->mac_table.in_use = 0;
@@ -820,6 +826,36 @@ static NetClientInfo net_virtio_info = {
     .link_status_changed = virtio_net_set_link_status,
 };
 
+static void virtio_net_set_status(struct VirtIODevice *vdev)
+{
+    VirtIONet *n = to_virtio_net(vdev);
+    if (!n->nic->nc.peer) {
+        return;
+    }
+    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
+        return;
+    }
+
+    if (!tap_get_vhost_net(n->nic->nc.peer)) {
+        return;
+    }
+    if (!!n->vhost_started == !!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
+        return;
+    }
+    if (vdev->status & VIRTIO_CONFIG_S_DRIVER_OK) {
+        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
+        if (r < 0) {
+            fprintf(stderr, "unable to start vhost net: "
+                    "falling back on userspace virtio\n");
+        } else {
+            n->vhost_started = 1;
+        }
+    } else {
+        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
+        n->vhost_started = 0;
+    }
+}
+
 VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
 {
     VirtIONet *n;
@@ -835,6 +871,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
     n->vdev.set_features = virtio_net_set_features;
     n->vdev.bad_features = virtio_net_bad_features;
     n->vdev.reset = virtio_net_reset;
+    n->vdev.set_status = virtio_net_set_status;
     n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx);
     n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx);
     n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
@@ -864,6 +901,9 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
 void virtio_net_exit(VirtIODevice *vdev)
 {
     VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev);
+    if (n->vhost_started) {
+        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
+    }
 
     qemu_purge_queued_packets(&n->nic->nc);
 
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] [PATCH-RFC 11/13] vhost net support
       [not found] <cover.1263230079.git.mst@redhat.com>
                   ` (11 preceding siblings ...)
  2010-01-11 17:23 ` [Qemu-devel] [PATCH-RFC 13/13] virtio-net: connect to vhost net backend Michael S. Tsirkin
@ 2010-01-11 17:23 ` Michael S. Tsirkin
  2010-01-12 22:45   ` [Qemu-devel] " Anthony Liguori
  12 siblings, 1 reply; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-11 17:23 UTC (permalink / raw)
  To: Anthony Liguori, qemu-devel

This adds vhost net support in qemu. Will be tied to tap device and
virtio later. Raw backend is currently missing, will be worked
on/submitted separately.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 Makefile.target |    1 +
 hw/vhost.c      |  349 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/vhost.h      |   33 +++++
 hw/vhost_net.c  |  145 +++++++++++++++++++++++
 hw/vhost_net.h  |   20 +++
 5 files changed, 548 insertions(+), 0 deletions(-)
 create mode 100644 hw/vhost.c
 create mode 100644 hw/vhost.h
 create mode 100644 hw/vhost_net.c
 create mode 100644 hw/vhost_net.h

diff --git a/Makefile.target b/Makefile.target
index 7c1f30c..61b7148 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -157,6 +157,7 @@ obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o
 # virtio has to be here due to weird dependency between PCI and virtio-net.
 # need to fix this properly
 obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-console.o virtio-pci.o
+obj-y += vhost_net.o vhost.o
 obj-$(CONFIG_KVM) += kvm.o kvm-all.o
 obj-$(CONFIG_ISA_MMIO) += isa_mmio.o
 LIBS+=-lz
diff --git a/hw/vhost.c b/hw/vhost.c
new file mode 100644
index 0000000..d23d94c
--- /dev/null
+++ b/hw/vhost.c
@@ -0,0 +1,349 @@
+#include "linux/vhost.h"
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+#include "vhost.h"
+#include "hw/hw.h"
+/* For range_get_last */
+#include "pci.h"
+
+static void vhost_dev_unassign_memory(struct vhost_dev *dev,
+				      struct vhost_memory *mem,
+				      uint64_t start_addr,
+				      uint64_t size)
+{
+	int from, to;
+	for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
+		struct vhost_memory_region *reg = mem->regions + to;
+		uint64_t reglast;
+		uint64_t memlast;
+		uint64_t change;
+
+		/* clone old region */
+		memcpy(reg, dev->mem->regions + from, sizeof *reg);
+
+		/* No overlap is simple */
+		if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
+				    start_addr, size)) {
+			continue;
+		}
+		reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
+		memlast = range_get_last(start_addr, size);
+
+		/* Remove whole region */
+		if (start_addr <= reg->guest_phys_addr && memlast >= reglast) {
+			--to;
+			continue;
+		}
+
+		/* Shrink region */
+		if (memlast >= reglast) {
+			reg->memory_size = start_addr - reg->guest_phys_addr;
+			continue;
+		}
+
+		/* Shift region */
+		if (start_addr <= reg->guest_phys_addr) {
+			change = memlast + 1 - reg->guest_phys_addr;
+			reg->memory_size -= change;
+			reg->guest_phys_addr += change;
+			reg->userspace_addr += change;
+			continue;
+		}
+
+		/* Split region: shrink first part, shift second part. */
+		memcpy(reg + 1, reg, sizeof *reg);
+		reg[0].memory_size = start_addr - reg->guest_phys_addr;
+		change = memlast + 1 - reg->guest_phys_addr;
+		reg[1].memory_size -= change;
+		reg[1].guest_phys_addr += change;
+		reg[1].userspace_addr += change;
+		++to;
+	}
+	mem->nregions = to;
+}
+
+/* Called after unassign, so no regions overlap the given range. */
+static void vhost_dev_assign_memory(struct vhost_dev *dev,
+				    struct vhost_memory *mem,
+				    uint64_t start_addr,
+				    uint64_t size,
+				    uint64_t uaddr)
+{
+	int from, to;
+	struct vhost_memory_region *merged = NULL;
+	for (from = 0, to = 0; from < dev->mem->nregions; ++from, ++to) {
+		struct vhost_memory_region *reg = mem->regions + to;
+		uint64_t prlast, urlast;
+		uint64_t pmlast, umlast;
+		uint64_t s, e, u;
+
+		/* clone old region */
+		memcpy(reg, dev->mem->regions + from, sizeof *reg);
+		prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
+		pmlast = range_get_last(start_addr, size);
+		urlast = range_get_last(reg->userspace_addr, reg->memory_size);
+		umlast = range_get_last(uaddr, size);
+
+		/* Not an adjecent region - do not merge. */
+		if ((prlast + 1 != start_addr || urlast + 1 != uaddr) &&
+		    (pmlast + 1 != reg->guest_phys_addr ||
+		     umlast + 1 != reg->userspace_addr)) {
+			continue;
+		}
+
+		if (!merged) {
+			--to;
+		} else {
+			merged = reg;
+		}
+		u = MIN(uaddr, reg->userspace_addr);
+		s = MIN(start_addr, reg->guest_phys_addr);
+		e = MAX(pmlast, prlast);
+		uaddr = merged->userspace_addr = u;
+		start_addr = merged->guest_phys_addr = s;
+		size = merged->memory_size = e - s + 1;
+	}
+
+	if (!merged) {
+		struct vhost_memory_region *reg = mem->regions + to;
+		reg->memory_size = size;
+		reg->guest_phys_addr = start_addr;
+		reg->userspace_addr = uaddr;
+		++to;
+	}
+	mem->nregions = to;
+}
+
+static void vhost_client_set_memory(CPUPhysMemoryClient *client,
+				    target_phys_addr_t start_addr,
+				    ram_addr_t size,
+				    ram_addr_t phys_offset)
+{
+	struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
+	ram_addr_t flags = phys_offset & ~TARGET_PAGE_MASK;
+	int s = offsetof(struct vhost_memory, regions) +
+		(dev->mem->nregions + 1)* sizeof dev->mem->regions[0];
+	struct vhost_memory *mem = qemu_malloc(s);
+	memcpy(mem, dev->mem, s);
+
+	/* First, remove old mapping for this memory, if any. */
+	vhost_dev_unassign_memory(dev, mem, start_addr, size);
+	if (flags == IO_MEM_RAM) {
+		/* Add given mapping, merging adjacent regions if any */
+		vhost_dev_assign_memory(dev, mem, start_addr, size,
+				(uintptr_t)qemu_get_ram_ptr(phys_offset));
+	}
+	qemu_free(dev->mem);
+	dev->mem = mem;
+}
+
+static int vhost_client_sync_dirty_bitmap(struct CPUPhysMemoryClient *client,
+					target_phys_addr_t start_addr,
+					target_phys_addr_t end_addr)
+{
+	/* TODO: migration */
+	return 0;
+}
+
+static int vhost_client_migration_log(struct CPUPhysMemoryClient *client,
+				      int enable)
+{
+	/* TODO: migration */
+	return 0;
+}
+
+static int vhost_virtqueue_init(struct vhost_dev *dev,
+				struct VirtIODevice *vdev,
+				struct vhost_virtqueue *vq,
+				struct VirtQueue *q,
+				unsigned idx)
+{
+	target_phys_addr_t s, l;
+	int r;
+	struct vhost_vring_addr addr = {
+		.index = idx,
+	};
+	struct vhost_vring_file file = {
+		.index = idx,
+	};
+	struct vhost_vring_state size = {
+		.index = idx,
+	};
+
+	size.num = q->vring.num;
+	r = ioctl(dev->control, VHOST_SET_VRING_NUM, &size);
+	if (r) {
+		return -errno;
+	}
+
+	file.fd = vq->kick = eventfd(0, 0);
+	r = ioctl(dev->control, VHOST_SET_VRING_KICK, &file);
+	if (r) {
+		return -errno;
+	}
+
+	file.fd = vq->call = eventfd(0, 0);
+	r = ioctl(dev->control, VHOST_SET_VRING_CALL, &file);
+	if (r) {
+		r = -errno;
+		goto fail_call;
+	}
+
+	s = l = sizeof(struct vring_desc) * q->vring.num;
+	vq->desc = cpu_physical_memory_map(q->vring.desc, &l, 0);
+	if (!vq->desc || l != s) {
+		r = -ENOMEM;
+		goto fail_alloc;
+	}
+	s = l = offsetof(struct vring_avail, ring) +
+		sizeof(u_int64_t) * q->vring.num;
+	vq->avail = cpu_physical_memory_map(q->vring.avail, &l, 0);
+	if (!vq->avail || l != s) {
+		r = -ENOMEM;
+		goto fail_alloc;
+	}
+	s = l = offsetof(struct vring_used, ring) +
+		sizeof(struct vring_used_elem) * q->vring.num;
+	vq->used = cpu_physical_memory_map(q->vring.used, &l, 1);
+	if (!vq->used || l != s) {
+		r = -ENOMEM;
+		goto fail_alloc;
+	}
+
+	addr.desc_user_addr = (u_int64_t)(unsigned long)vq->desc;
+	addr.avail_user_addr = (u_int64_t)(unsigned long)vq->avail;
+	addr.used_user_addr = (u_int64_t)(unsigned long)vq->used;
+	r = ioctl(dev->control, VHOST_SET_VRING_ADDR, &addr);
+	if (r < 0) {
+		r = -errno;
+		goto fail_alloc;
+	}
+	if (!vdev->binding->set_irqfd || !vdev->binding->set_queuefd) {
+		fprintf(stderr, "binding does not support irqfd/queuefd\n");
+		r = -ENOSYS;
+		goto fail_alloc;
+	}
+        r = vdev->binding->set_irqfd(vdev->binding_opaque, idx, vq->call, true);
+	if (r < 0) {
+		goto fail_alloc;
+	}
+
+        r = vdev->binding->set_queuefd(vdev->binding_opaque, idx, vq->kick, true);
+	if (r < 0) {
+		goto fail_queuefd;
+	}
+	return 0;
+
+fail_queuefd:
+        vdev->binding->set_irqfd(vdev->binding_opaque, idx, vq->call, false);
+fail_alloc:
+	close(vq->call);
+fail_call:
+	close(vq->kick);
+	return r;
+}
+
+static void vhost_virtqueue_cleanup(struct vhost_dev *dev,
+				    struct VirtIODevice *vdev,
+				    struct vhost_virtqueue *vq,
+				    struct VirtQueue *q,
+				    unsigned idx)
+{
+	int r;
+	r = vdev->binding->set_irqfd(vdev->binding_opaque, idx, vq->call, false);
+	if (r < 0) {
+		fprintf(stderr, "VQ cleanup failed: %d\n", r);
+	}
+
+	r = vdev->binding->set_queuefd(vdev->binding_opaque, idx, vq->kick, false);
+	if (r < 0) {
+		fprintf(stderr, "VQ cleanup failed: %d\n", r);
+	}
+}
+
+int vhost_dev_init(struct vhost_dev *hdev, int devfd)
+{
+	uint64_t features;
+	int r;
+	if (devfd >= 0) {
+		hdev->control = devfd;
+	} else {
+		hdev->control = open("/dev/vhost-net", O_RDWR);
+		if (hdev->control < 0)
+			return -errno;
+	}
+	r = ioctl(hdev->control, VHOST_SET_OWNER, NULL);
+	if (r < 0)
+		goto fail;
+
+	r = ioctl(hdev->control, VHOST_GET_FEATURES, &features);
+	if (r < 0)
+		goto fail;
+	hdev->features = features;
+	
+	hdev->client.set_memory = vhost_client_set_memory;
+	hdev->client.sync_dirty_bitmap = vhost_client_sync_dirty_bitmap;
+	hdev->client.migration_log = vhost_client_migration_log;
+	hdev->mem = qemu_mallocz(offsetof(struct vhost_memory, regions));
+	cpu_register_phys_memory_client(&hdev->client);
+	return 0;
+fail:
+	r = -errno;
+	close(hdev->control);
+	return r;
+}
+
+void vhost_dev_cleanup(struct vhost_dev *hdev)
+{
+	cpu_unregister_phys_memory_client(&hdev->client);
+	qemu_free(hdev->mem);
+	close(hdev->control);
+}
+
+int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+	int i, r;
+
+	r = ioctl(hdev->control, VHOST_SET_FEATURES, &hdev->acked_features);
+	if (r < 0)
+		goto fail;
+	r = ioctl(hdev->control, VHOST_SET_MEM_TABLE, hdev->mem);
+	if (r < 0)
+		goto fail;
+
+	for (i = 0; i < hdev->nvqs; ++i) {
+		r = vhost_virtqueue_init(hdev,
+		   			 vdev,
+					 hdev->vqs + i,
+					 vdev->vq + i,
+					 i);
+		if (r < 0)
+			goto fail_vq;
+	}
+
+	return 0;
+fail_vq:
+	while (--i >= 0) {
+		vhost_virtqueue_cleanup(hdev,
+					vdev,
+					hdev->vqs + i,
+					vdev->vq + i,
+					i);
+	}
+fail:
+	return r;
+}
+
+void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
+{
+	int i;
+
+	for (i = 0; i < hdev->nvqs; ++i) {
+		vhost_virtqueue_cleanup(hdev,
+					vdev,
+					hdev->vqs + i,
+					vdev->vq + i,
+					i);
+	}
+}
+
diff --git a/hw/vhost.h b/hw/vhost.h
new file mode 100644
index 0000000..9f82b42
--- /dev/null
+++ b/hw/vhost.h
@@ -0,0 +1,33 @@
+#ifndef VHOST_H
+#define VHOST_H
+
+#include "hw/hw.h"
+#include "hw/virtio.h"
+
+/* Generic structures common for any vhost based device. */
+struct vhost_virtqueue {
+	int kick;
+	int call;
+	void *desc;
+	void *avail;
+	void *used;
+};
+
+struct vhost_memory;
+struct vhost_dev {
+	CPUPhysMemoryClient client;
+	int control;
+	struct vhost_memory *mem;
+	struct vhost_virtqueue *vqs;
+	int nvqs;
+	unsigned long long features;
+	unsigned long long acked_features;
+	unsigned long long backend_features;
+};
+
+int vhost_dev_init(struct vhost_dev *hdev, int devfd);
+void vhost_dev_cleanup(struct vhost_dev *hdev);
+int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev);
+void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev);
+
+#endif
diff --git a/hw/vhost_net.c b/hw/vhost_net.c
new file mode 100644
index 0000000..e2c97c0
--- /dev/null
+++ b/hw/vhost_net.c
@@ -0,0 +1,145 @@
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+#include <linux/kvm.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/vhost.h>
+#include <linux/virtio_ring.h>
+#include <netpacket/packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <netinet/in.h>
+
+#include <stdio.h>
+
+#include "net.h"
+#include "net/tap.h"
+
+#include "virtio-net.h"
+#include "vhost.h"
+#include "vhost_net.h"
+
+struct vhost_net {
+	struct vhost_dev dev;
+	struct vhost_virtqueue vqs[2];
+	int backend;
+	VLANClientState *vc;
+};
+
+unsigned vhost_net_get_features(struct vhost_net *net, unsigned features)
+{
+	/* Clear features not supported by host kernel. */
+	if (!(net->dev.features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY)))
+		features &= ~(1 << VIRTIO_F_NOTIFY_ON_EMPTY);
+	if (!(net->dev.features & (1 << VIRTIO_RING_F_INDIRECT_DESC)))
+		features &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
+	if (!(net->dev.features & (1 << VIRTIO_NET_F_MRG_RXBUF)))
+		features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
+	return features;
+}
+
+void vhost_net_ack_features(struct vhost_net *net, unsigned features)
+{
+	net->dev.acked_features = net->dev.backend_features;
+	if (features & (1 << VIRTIO_F_NOTIFY_ON_EMPTY))
+		net->dev.acked_features |= (1 << VIRTIO_F_NOTIFY_ON_EMPTY);
+	if (features & (1 << VIRTIO_RING_F_INDIRECT_DESC))
+		net->dev.acked_features |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
+}
+
+static int vhost_net_get_fd(VLANClientState *backend)
+{
+	switch (backend->info->type) {
+	case NET_CLIENT_TYPE_TAP:
+		return tap_get_fd(backend);
+	default:
+		fprintf(stderr, "vhost-net requires tap backend\n");
+		return -EBADFD;
+	}
+}
+
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd)
+{
+	int r;
+	struct vhost_net *net = qemu_malloc(sizeof *net);
+	if (!backend) {
+		fprintf(stderr, "vhost-net requires backend to be setup\n");
+		goto fail;
+	}
+	r = vhost_net_get_fd(backend);
+	if (r < 0)
+		goto fail;
+	net->vc = backend;
+	net->dev.backend_features = tap_has_vnet_hdr(backend) ? 0 :
+		(1 << VHOST_NET_F_VIRTIO_NET_HDR);
+	net->backend = r;
+
+	r = vhost_dev_init(&net->dev, devfd);
+	if (r < 0)
+		goto fail;
+	if (~net->dev.features & net->dev.backend_features) {
+		fprintf(stderr, "vhost lacks feature mask %llu for backend\n",
+			~net->dev.features & net->dev.backend_features);
+		vhost_dev_cleanup(&net->dev);
+		goto fail;
+	}
+
+	/* Set sane init value. Override when guest acks. */
+	vhost_net_ack_features(net, 0);
+	return net;
+fail:
+	qemu_free(net);
+	return NULL;
+}
+
+int vhost_net_start(struct vhost_net *net,
+		    VirtIODevice *dev)
+{
+	struct vhost_vring_file file = { };
+	int r;
+
+	net->dev.nvqs = 2;
+	net->dev.vqs = net->vqs;
+	r = vhost_dev_start(&net->dev, dev);
+	if (r < 0)
+		return r;
+
+	net->vc->info->poll(net->vc, false);
+	qemu_set_fd_handler(net->backend, NULL, NULL, NULL);
+	file.fd = net->backend;
+	for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
+		r = ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
+		if (r < 0) {
+			r = -errno;
+			goto fail;
+		}
+	}
+	return 0;
+fail:
+	file.fd = -1;
+	while (--file.index >= 0) {
+		ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
+	}
+	net->vc->info->poll(net->vc, true);
+	vhost_dev_stop(&net->dev, dev);
+	return r;
+}
+
+void vhost_net_stop(struct vhost_net *net,
+		    VirtIODevice *dev)
+{
+	struct vhost_vring_file file = { .fd = -1 };
+
+	for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
+		ioctl(net->dev.control, VHOST_NET_SET_BACKEND, &file);
+	}
+	net->vc->info->poll(net->vc, true);
+	vhost_dev_stop(&net->dev, dev);
+}
+
+void vhost_net_cleanup(struct vhost_net *net)
+{
+	vhost_dev_cleanup(&net->dev);
+	qemu_free(net);
+}
+/* TODO: log */
diff --git a/hw/vhost_net.h b/hw/vhost_net.h
new file mode 100644
index 0000000..21f0277
--- /dev/null
+++ b/hw/vhost_net.h
@@ -0,0 +1,20 @@
+#ifndef VHOST_NET_H
+#define VHOST_NET_H
+
+#include "net.h"
+
+struct vhost_net;
+
+struct vhost_net *vhost_net_init(VLANClientState *backend, int devfd);
+
+int vhost_net_start(struct vhost_net *net,
+		    VirtIODevice *dev);
+void vhost_net_stop(struct vhost_net *net,
+		    VirtIODevice *dev);
+
+void vhost_net_cleanup(struct vhost_net *net);
+
+unsigned vhost_net_get_features(struct vhost_net *net, unsigned features);
+void vhost_net_ack_features(struct vhost_net *net, unsigned features);
+
+#endif
-- 
1.6.6.rc1.43.gf55cc

^ permalink raw reply related	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 01/13] virtio: export virtqueue structure
  2010-01-11 17:16 ` [Qemu-devel] [PATCH-RFC 01/13] virtio: export virtqueue structure Michael S. Tsirkin
@ 2010-01-12 22:32   ` Anthony Liguori
  2010-01-25 19:10     ` Michael S. Tsirkin
  0 siblings, 1 reply; 41+ messages in thread
From: Anthony Liguori @ 2010-01-12 22:32 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/11/2010 11:16 AM, Michael S. Tsirkin wrote:
> vhost needs physical addresses for
> ring so expose that structure.
>
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>    

I think accessor functions might make more sense.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 02/13] kvm: add API to set ioeventfd
  2010-01-11 17:17 ` [Qemu-devel] [PATCH-RFC 02/13] kvm: add API to set ioeventfd Michael S. Tsirkin
@ 2010-01-12 22:35   ` Anthony Liguori
  2010-01-25 19:20     ` Michael S. Tsirkin
  0 siblings, 1 reply; 41+ messages in thread
From: Anthony Liguori @ 2010-01-12 22:35 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/11/2010 11:17 AM, Michael S. Tsirkin wrote:
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> ---
>   kvm-all.c |   24 ++++++++++++++++++++++++
>   kvm.h     |    3 +++
>   2 files changed, 27 insertions(+), 0 deletions(-)
>
> diff --git a/kvm-all.c b/kvm-all.c
> index a312654..aa00119 100644
> --- a/kvm-all.c
> +++ b/kvm-all.c
> @@ -1113,3 +1113,27 @@ void kvm_remove_all_breakpoints(CPUState *current_env)
>   {
>   }
>   #endif /* !KVM_CAP_SET_GUEST_DEBUG */
> +
> +#ifdef KVM_IOEVENTFD
> +int kvm_set_ioeventfd(uint16_t addr, uint16_t data, int fd, bool assigned)
> +{
> +    struct kvm_ioeventfd kick = {
> +        .datamatch = data,
> +        .addr = addr,
> +        .len = 2,
> +        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
> +        .fd = fd,
> +    };
> +    if (!assigned)
> +        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
> +    int r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD,&kick);
> +    if (r<  0)
> +        return r;
> +    return 0;
> +}
>    

I think we really ought to try to avoid having global state used here.  
That means either we need to pass a CPUState to this function or we need 
to have a ioeventfd be allocated as a structure that is then passed 
around that can store a copy of the kvm_state fetched through CPUState.

I think the later is best.  We really don't want ioeventfd scattered 
throughout the code.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 03/13] virtio: add iofd/irqfd support
  2010-01-11 17:17 ` [Qemu-devel] [PATCH-RFC 03/13] virtio: add iofd/irqfd support Michael S. Tsirkin
@ 2010-01-12 22:36   ` Anthony Liguori
  2010-01-13 10:50     ` Michael S. Tsirkin
  0 siblings, 1 reply; 41+ messages in thread
From: Anthony Liguori @ 2010-01-12 22:36 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/11/2010 11:17 AM, Michael S. Tsirkin wrote:
> Add binding API to set iofd/irqfd support.
> Will be used by vhost.
>
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> ---
>   hw/virtio.c |   13 ++++++++++---
>   hw/virtio.h |    4 ++++
>   2 files changed, 14 insertions(+), 3 deletions(-)
>
> diff --git a/hw/virtio.c b/hw/virtio.c
> index 8e3c9ad..b9ec863 100644
> --- a/hw/virtio.c
> +++ b/hw/virtio.c
> @@ -572,6 +572,12 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
>       return&vdev->vq[i];
>   }
>
> +void virtio_irq(VirtIODevice *vdev, VirtQueue *vq)
> +{
> +    vdev->isr |= 0x01;
> +    virtio_notify_vector(vdev, vq->vector);
> +}
> +
>   void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
>   {
>       /* Make sure used ring is updated before we check NO_INTERRUPT. */
> @@ -582,8 +588,7 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
>            (vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx)))
>           return;
>
> -    vdev->isr |= 0x01;
> -    virtio_notify_vector(vdev, vq->vector);
> +    virtio_irq(vdev, vq);
>   }
>
>   void virtio_notify_config(VirtIODevice *vdev)
> @@ -696,8 +701,10 @@ VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
>       vdev->queue_sel = 0;
>       vdev->config_vector = VIRTIO_NO_VECTOR;
>       vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
> -    for(i = 0; i<  VIRTIO_PCI_QUEUE_MAX; i++)
> +    for(i = 0; i<  VIRTIO_PCI_QUEUE_MAX; i++) {
>           vdev->vq[i].vector = VIRTIO_NO_VECTOR;
> +        vdev->vq[i].vdev = vdev;
> +    }
>
>       vdev->name = name;
>       vdev->config_len = config_size;
> diff --git a/hw/virtio.h b/hw/virtio.h
> index ca840e1..193b3f9 100644
> --- a/hw/virtio.h
> +++ b/hw/virtio.h
> @@ -88,6 +88,8 @@ typedef struct {
>       int (*load_config)(void * opaque, QEMUFile *f);
>       int (*load_queue)(void * opaque, int n, QEMUFile *f);
>       unsigned (*get_features)(void * opaque);
> +    int (*set_irqfd)(void * opaque, int n, int fd, bool assigned);
> +    int (*set_queuefd)(void * opaque, int n, int fd, bool assigned);
>    

I'd like to see us abstract things like irqfd a bit more.  It should be 
a first class object with an method to raise/lower.  In fact, you can 
probably just use qemu_irq.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 09/13] tap: add vhost/vhostfd options
  2010-01-11 17:22 ` [Qemu-devel] [PATCH-RFC 09/13] tap: add vhost/vhostfd options Michael S. Tsirkin
@ 2010-01-12 22:39   ` Anthony Liguori
  2010-01-13 10:59     ` Michael S. Tsirkin
  2010-01-12 22:42   ` Anthony Liguori
  1 sibling, 1 reply; 41+ messages in thread
From: Anthony Liguori @ 2010-01-12 22:39 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/11/2010 11:22 AM, Michael S. Tsirkin wrote:
> Looks like order got mixed up: vhost_net header
> is added by a follow-up patch. Will be fixed
> in the next revision.
>
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> ---
>   net.c           |    8 ++++++++
>   net/tap.c       |   29 +++++++++++++++++++++++++++++
>   qemu-options.hx |    4 +++-
>   3 files changed, 40 insertions(+), 1 deletions(-)
>
> diff --git a/net.c b/net.c
> index 6ef93e6..b942d03 100644
> --- a/net.c
> +++ b/net.c
> @@ -976,6 +976,14 @@ static struct {
>                   .name = "vnet_hdr",
>                   .type = QEMU_OPT_BOOL,
>                   .help = "enable the IFF_VNET_HDR flag on the tap interface"
> +            }, {
> +                .name = "vhost",
> +                .type = QEMU_OPT_BOOL,
> +                .help = "enable vhost-net network accelerator",
> +            }, {
> +                .name = "vhostfd",
> +                .type = QEMU_OPT_STRING,
> +                .help = "file descriptor of an already opened vhost net device",
>               },
>    

Semantically, I think making vhost it's own backend makes more sense 
from a user perspective.

I don't think it's a significant code change.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 09/13] tap: add vhost/vhostfd options
  2010-01-11 17:22 ` [Qemu-devel] [PATCH-RFC 09/13] tap: add vhost/vhostfd options Michael S. Tsirkin
  2010-01-12 22:39   ` [Qemu-devel] " Anthony Liguori
@ 2010-01-12 22:42   ` Anthony Liguori
  1 sibling, 0 replies; 41+ messages in thread
From: Anthony Liguori @ 2010-01-12 22:42 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/11/2010 11:22 AM, Michael S. Tsirkin wrote:
> Looks like order got mixed up: vhost_net header
> is added by a follow-up patch. Will be fixed
> in the next revision.
>
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> ---
>   net.c           |    8 ++++++++
>   net/tap.c       |   29 +++++++++++++++++++++++++++++
>   qemu-options.hx |    4 +++-
>   3 files changed, 40 insertions(+), 1 deletions(-)
>
> diff --git a/net.c b/net.c
> index 6ef93e6..b942d03 100644
> --- a/net.c
> +++ b/net.c
> @@ -976,6 +976,14 @@ static struct {
>                   .name = "vnet_hdr",
>                   .type = QEMU_OPT_BOOL,
>                   .help = "enable the IFF_VNET_HDR flag on the tap interface"
> +            }, {
> +                .name = "vhost",
> +                .type = QEMU_OPT_BOOL,
> +                .help = "enable vhost-net network accelerator",
> +            }, {
> +                .name = "vhostfd",
> +                .type = QEMU_OPT_STRING,
> +                .help = "file descriptor of an already opened vhost net device",
>               },
>   #endif /* _WIN32 */
>               { /* end of list */ }
> diff --git a/net/tap.c b/net/tap.c
> index 7e9ca79..d9f2e41 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -41,6 +41,8 @@
>
>   #include "net/tap-linux.h"
>
> +#include "hw/vhost_net.h"
> +
>   /* Maximum GSO packet size (64k) plus plenty of room for
>    * the ethernet and virtio_net headers
>    */
> @@ -57,6 +59,7 @@ typedef struct TAPState {
>       unsigned int has_vnet_hdr : 1;
>       unsigned int using_vnet_hdr : 1;
>       unsigned int has_ufo: 1;
> +    struct vhost_net *vhost_net;
>   } TAPState;
>
>   static int launch_script(const char *setup_script, const char *ifname, int fd);
> @@ -252,6 +255,10 @@ static void tap_cleanup(VLANClientState *nc)
>   {
>       TAPState *s = DO_UPCAST(TAPState, nc, nc);
>
> +    if (s->vhost_net) {
> +        vhost_net_cleanup(s->vhost_net);
> +    }
> +
>       qemu_purge_queued_packets(nc);
>
>       if (s->down_script[0])
> @@ -307,6 +314,7 @@ static TAPState *net_tap_fd_init(VLANState *vlan,
>       s->has_ufo = tap_probe_has_ufo(s->fd);
>       tap_set_offload(&s->nc, 0, 0, 0, 0, 0);
>       tap_read_poll(s, 1);
> +    s->vhost_net = NULL;
>       return s;
>   }
>
> @@ -456,6 +464,27 @@ int net_init_tap(QemuOpts *opts, Monitor *mon, const char *name, VLANState *vlan
>           }
>       }
>
> +    if (qemu_opt_get_bool(opts, "vhost", 0)) {
> +        int vhostfd, r;
> +        if (qemu_opt_get(opts, "vhostfd")) {
> +            r = net_handle_fd_param(mon, qemu_opt_get(opts, "vhostfd"));
> +            if (r == -1) {
> +                return -1;
> +            }
> +            vhostfd = r;
> +        } else {
> +            vhostfd = -1;
> +        }
> +        s->vhost_net = vhost_net_init(&s->nc, vhostfd);
>    

Hrm, I don't see a patch that introduces this function?  Am I missing 
something obvious?

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 11/13] vhost net support
  2010-01-11 17:23 ` [Qemu-devel] [PATCH-RFC 11/13] vhost net support Michael S. Tsirkin
@ 2010-01-12 22:45   ` Anthony Liguori
  0 siblings, 0 replies; 41+ messages in thread
From: Anthony Liguori @ 2010-01-12 22:45 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/11/2010 11:23 AM, Michael S. Tsirkin wrote:
> This adds vhost net support in qemu. Will be tied to tap device and
> virtio later. Raw backend is currently missing, will be worked
> on/submitted separately.
>
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>    

Hit send too quick.. here it is.

> ---
>   Makefile.target |    1 +
>   hw/vhost.c      |  349 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
>   hw/vhost.h      |   33 +++++
>   hw/vhost_net.c  |  145 +++++++++++++++++++++++
>   hw/vhost_net.h  |   20 +++
>   5 files changed, 548 insertions(+), 0 deletions(-)
>   create mode 100644 hw/vhost.c
>   create mode 100644 hw/vhost.h
>   create mode 100644 hw/vhost_net.c
>   create mode 100644 hw/vhost_net.h
>
> diff --git a/Makefile.target b/Makefile.target
> index 7c1f30c..61b7148 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -157,6 +157,7 @@ obj-y = vl.o async.o monitor.o pci.o pci_host.o pcie_host.o machine.o gdbstub.o
>   # virtio has to be here due to weird dependency between PCI and virtio-net.
>   # need to fix this properly
>   obj-y += virtio-blk.o virtio-balloon.o virtio-net.o virtio-console.o virtio-pci.o
> +obj-y += vhost_net.o vhost.o
>   obj-$(CONFIG_KVM) += kvm.o kvm-all.o
>   obj-$(CONFIG_ISA_MMIO) += isa_mmio.o
>   LIBS+=-lz
> diff --git a/hw/vhost.c b/hw/vhost.c
> new file mode 100644
> index 0000000..d23d94c
> --- /dev/null
> +++ b/hw/vhost.c
> @@ -0,0 +1,349 @@
> +#include "linux/vhost.h"
> +#include<sys/ioctl.h>
> +#include<sys/eventfd.h>
> +#include "vhost.h"
> +#include "hw/hw.h"
> +/* For range_get_last */
> +#include "pci.h"
> +
> +static void vhost_dev_unassign_memory(struct vhost_dev *dev,
> +				      struct vhost_memory *mem,
> +				      uint64_t start_addr,
> +				      uint64_t size)
> +{
> +	int from, to;
> +	for (from = 0, to = 0; from<  dev->mem->nregions; ++from, ++to) {
> +		struct vhost_memory_region *reg = mem->regions + to;
> +		uint64_t reglast;
> +		uint64_t memlast;
> +		uint64_t change;
> +
> +		/* clone old region */
> +		memcpy(reg, dev->mem->regions + from, sizeof *reg);
> +
> +		/* No overlap is simple */
> +		if (!ranges_overlap(reg->guest_phys_addr, reg->memory_size,
> +				    start_addr, size)) {
> +			continue;
> +		}
> +		reglast = range_get_last(reg->guest_phys_addr, reg->memory_size);
> +		memlast = range_get_last(start_addr, size);
> +
> +		/* Remove whole region */
> +		if (start_addr<= reg->guest_phys_addr&&  memlast>= reglast) {
> +			--to;
> +			continue;
> +		}
> +
> +		/* Shrink region */
> +		if (memlast>= reglast) {
> +			reg->memory_size = start_addr - reg->guest_phys_addr;
> +			continue;
> +		}
> +
> +		/* Shift region */
> +		if (start_addr<= reg->guest_phys_addr) {
> +			change = memlast + 1 - reg->guest_phys_addr;
> +			reg->memory_size -= change;
> +			reg->guest_phys_addr += change;
> +			reg->userspace_addr += change;
> +			continue;
> +		}
> +
> +		/* Split region: shrink first part, shift second part. */
> +		memcpy(reg + 1, reg, sizeof *reg);
> +		reg[0].memory_size = start_addr - reg->guest_phys_addr;
> +		change = memlast + 1 - reg->guest_phys_addr;
> +		reg[1].memory_size -= change;
> +		reg[1].guest_phys_addr += change;
> +		reg[1].userspace_addr += change;
> +		++to;
> +	}
> +	mem->nregions = to;
> +}
> +
> +/* Called after unassign, so no regions overlap the given range. */
> +static void vhost_dev_assign_memory(struct vhost_dev *dev,
> +				    struct vhost_memory *mem,
> +				    uint64_t start_addr,
> +				    uint64_t size,
> +				    uint64_t uaddr)
> +{
> +	int from, to;
> +	struct vhost_memory_region *merged = NULL;
> +	for (from = 0, to = 0; from<  dev->mem->nregions; ++from, ++to) {
> +		struct vhost_memory_region *reg = mem->regions + to;
> +		uint64_t prlast, urlast;
> +		uint64_t pmlast, umlast;
> +		uint64_t s, e, u;
> +
> +		/* clone old region */
> +		memcpy(reg, dev->mem->regions + from, sizeof *reg);
> +		prlast = range_get_last(reg->guest_phys_addr, reg->memory_size);
> +		pmlast = range_get_last(start_addr, size);
> +		urlast = range_get_last(reg->userspace_addr, reg->memory_size);
> +		umlast = range_get_last(uaddr, size);
> +
> +		/* Not an adjecent region - do not merge. */
> +		if ((prlast + 1 != start_addr || urlast + 1 != uaddr)&&
> +		    (pmlast + 1 != reg->guest_phys_addr ||
> +		     umlast + 1 != reg->userspace_addr)) {
> +			continue;
> +		}
> +
> +		if (!merged) {
> +			--to;
> +		} else {
> +			merged = reg;
> +		}
> +		u = MIN(uaddr, reg->userspace_addr);
> +		s = MIN(start_addr, reg->guest_phys_addr);
> +		e = MAX(pmlast, prlast);
> +		uaddr = merged->userspace_addr = u;
> +		start_addr = merged->guest_phys_addr = s;
> +		size = merged->memory_size = e - s + 1;
> +	}
> +
> +	if (!merged) {
> +		struct vhost_memory_region *reg = mem->regions + to;
> +		reg->memory_size = size;
> +		reg->guest_phys_addr = start_addr;
> +		reg->userspace_addr = uaddr;
> +		++to;
> +	}
> +	mem->nregions = to;
> +}
> +
> +static void vhost_client_set_memory(CPUPhysMemoryClient *client,
> +				    target_phys_addr_t start_addr,
> +				    ram_addr_t size,
> +				    ram_addr_t phys_offset)
> +{
> +	struct vhost_dev *dev = container_of(client, struct vhost_dev, client);
> +	ram_addr_t flags = phys_offset&  ~TARGET_PAGE_MASK;
> +	int s = offsetof(struct vhost_memory, regions) +
> +		(dev->mem->nregions + 1)* sizeof dev->mem->regions[0];
> +	struct vhost_memory *mem = qemu_malloc(s);
> +	memcpy(mem, dev->mem, s);
> +
> +	/* First, remove old mapping for this memory, if any. */
> +	vhost_dev_unassign_memory(dev, mem, start_addr, size);
> +	if (flags == IO_MEM_RAM) {
> +		/* Add given mapping, merging adjacent regions if any */
> +		vhost_dev_assign_memory(dev, mem, start_addr, size,
> +				(uintptr_t)qemu_get_ram_ptr(phys_offset));
> +	}
> +	qemu_free(dev->mem);
> +	dev->mem = mem;
> +}
>    

This is gnarly and basically duplicated with kvm-all.c.  Dunno if code 
sharing is practical though.diff --git a/hw/vhost.h b/hw/vhost.h

> new file mode 100644
> index 0000000..9f82b42
> --- /dev/null
> +++ b/hw/vhost.h
> @@ -0,0 +1,33 @@
> +#ifndef VHOST_H
> +#define VHOST_H
> +
> +#include "hw/hw.h"
> +#include "hw/virtio.h"
> +
> +/* Generic structures common for any vhost based device. */
> +struct vhost_virtqueue {
> +	int kick;
> +	int call;
> +	void *desc;
> +	void *avail;
> +	void *used;
> +};
>    

Please follow CodingStyle.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 03/13] virtio: add iofd/irqfd support
  2010-01-12 22:36   ` [Qemu-devel] " Anthony Liguori
@ 2010-01-13 10:50     ` Michael S. Tsirkin
  0 siblings, 0 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-13 10:50 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

On Tue, Jan 12, 2010 at 04:36:45PM -0600, Anthony Liguori wrote:
> On 01/11/2010 11:17 AM, Michael S. Tsirkin wrote:
>> Add binding API to set iofd/irqfd support.
>> Will be used by vhost.
>>
>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>> ---
>>   hw/virtio.c |   13 ++++++++++---
>>   hw/virtio.h |    4 ++++
>>   2 files changed, 14 insertions(+), 3 deletions(-)
>>
>> diff --git a/hw/virtio.c b/hw/virtio.c
>> index 8e3c9ad..b9ec863 100644
>> --- a/hw/virtio.c
>> +++ b/hw/virtio.c
>> @@ -572,6 +572,12 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size,
>>       return&vdev->vq[i];
>>   }
>>
>> +void virtio_irq(VirtIODevice *vdev, VirtQueue *vq)
>> +{
>> +    vdev->isr |= 0x01;
>> +    virtio_notify_vector(vdev, vq->vector);
>> +}
>> +
>>   void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
>>   {
>>       /* Make sure used ring is updated before we check NO_INTERRUPT. */
>> @@ -582,8 +588,7 @@ void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
>>            (vq->inuse || vring_avail_idx(vq) != vq->last_avail_idx)))
>>           return;
>>
>> -    vdev->isr |= 0x01;
>> -    virtio_notify_vector(vdev, vq->vector);
>> +    virtio_irq(vdev, vq);
>>   }
>>
>>   void virtio_notify_config(VirtIODevice *vdev)
>> @@ -696,8 +701,10 @@ VirtIODevice *virtio_common_init(const char *name, uint16_t device_id,
>>       vdev->queue_sel = 0;
>>       vdev->config_vector = VIRTIO_NO_VECTOR;
>>       vdev->vq = qemu_mallocz(sizeof(VirtQueue) * VIRTIO_PCI_QUEUE_MAX);
>> -    for(i = 0; i<  VIRTIO_PCI_QUEUE_MAX; i++)
>> +    for(i = 0; i<  VIRTIO_PCI_QUEUE_MAX; i++) {
>>           vdev->vq[i].vector = VIRTIO_NO_VECTOR;
>> +        vdev->vq[i].vdev = vdev;
>> +    }
>>
>>       vdev->name = name;
>>       vdev->config_len = config_size;
>> diff --git a/hw/virtio.h b/hw/virtio.h
>> index ca840e1..193b3f9 100644
>> --- a/hw/virtio.h
>> +++ b/hw/virtio.h
>> @@ -88,6 +88,8 @@ typedef struct {
>>       int (*load_config)(void * opaque, QEMUFile *f);
>>       int (*load_queue)(void * opaque, int n, QEMUFile *f);
>>       unsigned (*get_features)(void * opaque);
>> +    int (*set_irqfd)(void * opaque, int n, int fd, bool assigned);
>> +    int (*set_queuefd)(void * opaque, int n, int fd, bool assigned);
>>    
>
> I'd like to see us abstract things like irqfd a bit more.  It should be  
> a first class object with an method to raise/lower.  In fact, you can  
> probably just use qemu_irq.
>
> Regards,
>
> Anthony Liguori

With a single provider, I fail to see the point of extra abstraction
layers, and I have no idea how a good abstraction would look. Let's see
when we have multiple implementations, then we can abstract them as
appropriate.  Also, irqfd does not have methods to raise/lower so
qemu_irq is far from a good fit.

-- 
MST

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 09/13] tap: add vhost/vhostfd options
  2010-01-12 22:39   ` [Qemu-devel] " Anthony Liguori
@ 2010-01-13 10:59     ` Michael S. Tsirkin
  0 siblings, 0 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-13 10:59 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

On Tue, Jan 12, 2010 at 04:39:52PM -0600, Anthony Liguori wrote:
> On 01/11/2010 11:22 AM, Michael S. Tsirkin wrote:
>> Looks like order got mixed up: vhost_net header
>> is added by a follow-up patch. Will be fixed
>> in the next revision.
>>
>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>> ---
>>   net.c           |    8 ++++++++
>>   net/tap.c       |   29 +++++++++++++++++++++++++++++
>>   qemu-options.hx |    4 +++-
>>   3 files changed, 40 insertions(+), 1 deletions(-)
>>
>> diff --git a/net.c b/net.c
>> index 6ef93e6..b942d03 100644
>> --- a/net.c
>> +++ b/net.c
>> @@ -976,6 +976,14 @@ static struct {
>>                   .name = "vnet_hdr",
>>                   .type = QEMU_OPT_BOOL,
>>                   .help = "enable the IFF_VNET_HDR flag on the tap interface"
>> +            }, {
>> +                .name = "vhost",
>> +                .type = QEMU_OPT_BOOL,
>> +                .help = "enable vhost-net network accelerator",
>> +            }, {
>> +                .name = "vhostfd",
>> +                .type = QEMU_OPT_STRING,
>> +                .help = "file descriptor of an already opened vhost net device",
>>               },
>>    
>
> Semantically, I think making vhost it's own backend makes more sense  
> from a user perspective.


It doesn't. Users mostly do not care that vhost is used: they just get
fast virtio and that's all.  Users do care about e.g. tap because they
need setup scripts, understand bridging etc.


Do you propose -net tap be replaced with -net vhost?  This means vhost will
need to get tap flags if it is attached to tap and raw flags if attached
to raw, etc.

A separate backend that only works with virtio frontend is also
ugly. OTOH an option that has effect only with virtio frontend
is pretty usual, vnet_hdr is one such example.


>
> I don't think it's a significant code change.
>
> Regards,
>
> Anthony Liguori

-- 
MST

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 01/13] virtio: export virtqueue structure
  2010-01-12 22:32   ` [Qemu-devel] " Anthony Liguori
@ 2010-01-25 19:10     ` Michael S. Tsirkin
  2010-01-25 19:23       ` Anthony Liguori
  0 siblings, 1 reply; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-25 19:10 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

On Tue, Jan 12, 2010 at 04:32:44PM -0600, Anthony Liguori wrote:
> On 01/11/2010 11:16 AM, Michael S. Tsirkin wrote:
>> vhost needs physical addresses for
>> ring so expose that structure.
>>
>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>    
>
> I think accessor functions might make more sense.
>
> Regards,
>
> Anthony Liguori

Well, take a look:

typedef struct VRing
{
    unsigned int num;
    target_phys_addr_t desc;
    target_phys_addr_t avail;
    target_phys_addr_t used;
} VRing;

struct VirtQueue
{
    VRing vring;
    target_phys_addr_t pa;
    uint16_t last_avail_idx;
    int inuse;
    uint16_t vector;
    void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
    VirtIODevice *vdev;
    EventNotifier guest_notifier;
    EventNotifier host_notifier;
}

(Notifiers are added by patches I will shortly post).
We need at least:

    unsigned int num;
    target_phys_addr_t desc;
    target_phys_addr_t avail;
    target_phys_addr_t used;
    VRing vring;
    target_phys_addr_t pa;
    uint16_t vector;
    VirtIODevice *vdev;
    EventNotifier guest_notifier;
    EventNotifier host_notifier;

We do not need:
    uint16_t last_avail_idx;
    int inuse;
    void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);

(last_avail_idx will be needed if we ever want to
 move a running guest from kernel to userspace
 or back).
IOW, most of VirtQueue needs to be exposed.
So - do we really want accessors?

-- 
MST

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 02/13] kvm: add API to set ioeventfd
  2010-01-12 22:35   ` [Qemu-devel] " Anthony Liguori
@ 2010-01-25 19:20     ` Michael S. Tsirkin
  2010-01-25 19:28       ` Anthony Liguori
  0 siblings, 1 reply; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-25 19:20 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

On Tue, Jan 12, 2010 at 04:35:17PM -0600, Anthony Liguori wrote:
> On 01/11/2010 11:17 AM, Michael S. Tsirkin wrote:
>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>> ---
>>   kvm-all.c |   24 ++++++++++++++++++++++++
>>   kvm.h     |    3 +++
>>   2 files changed, 27 insertions(+), 0 deletions(-)
>>
>> diff --git a/kvm-all.c b/kvm-all.c
>> index a312654..aa00119 100644
>> --- a/kvm-all.c
>> +++ b/kvm-all.c
>> @@ -1113,3 +1113,27 @@ void kvm_remove_all_breakpoints(CPUState *current_env)
>>   {
>>   }
>>   #endif /* !KVM_CAP_SET_GUEST_DEBUG */
>> +
>> +#ifdef KVM_IOEVENTFD
>> +int kvm_set_ioeventfd(uint16_t addr, uint16_t data, int fd, bool assigned)
>> +{
>> +    struct kvm_ioeventfd kick = {
>> +        .datamatch = data,
>> +        .addr = addr,
>> +        .len = 2,
>> +        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
>> +        .fd = fd,
>> +    };
>> +    if (!assigned)
>> +        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
>> +    int r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD,&kick);
>> +    if (r<  0)
>> +        return r;
>> +    return 0;
>> +}
>>    
>
> I think we really ought to try to avoid having global state used here.   
> That means either we need to pass a CPUState to this function or we need  
> to have a ioeventfd be allocated as a structure that is then passed  
> around that can store a copy of the kvm_state fetched through CPUState.
>
> I think the later is best.  We really don't want ioeventfd scattered  
> throughout the code.
>
> Regards,
>
> Anthony Liguori


I don't really understand this comment.
I have no idea where to get CPUState in
a device and how to get kvm state from there.

And all other kvm-all functions use kvm_state
already, let's fix them first if we want to
get rid of global state?

Avi, what's your take on this patch?

-- 
MST

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 01/13] virtio: export virtqueue structure
  2010-01-25 19:10     ` Michael S. Tsirkin
@ 2010-01-25 19:23       ` Anthony Liguori
  0 siblings, 0 replies; 41+ messages in thread
From: Anthony Liguori @ 2010-01-25 19:23 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/25/2010 01:10 PM, Michael S. Tsirkin wrote:
> On Tue, Jan 12, 2010 at 04:32:44PM -0600, Anthony Liguori wrote:
>    
>> On 01/11/2010 11:16 AM, Michael S. Tsirkin wrote:
>>      
>>> vhost needs physical addresses for
>>> ring so expose that structure.
>>>
>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>>
>>>        
>> I think accessor functions might make more sense.
>>
>> Regards,
>>
>> Anthony Liguori
>>      
> Well, take a look:
>
> typedef struct VRing
> {
>      unsigned int num;
>      target_phys_addr_t desc;
>      target_phys_addr_t avail;
>      target_phys_addr_t used;
> } VRing;
>
> struct VirtQueue
> {
>      VRing vring;
>      target_phys_addr_t pa;
>      uint16_t last_avail_idx;
>      int inuse;
>      uint16_t vector;
>      void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
>      VirtIODevice *vdev;
>      EventNotifier guest_notifier;
>      EventNotifier host_notifier;
> }
>
> (Notifiers are added by patches I will shortly post).
> We need at least:
>
>      unsigned int num;
>      target_phys_addr_t desc;
>      target_phys_addr_t avail;
>      target_phys_addr_t used;
>      VRing vring;
>      target_phys_addr_t pa;
>    

You need num and the physical address of the ring.  The location of 
desc/available/used are all fixed offsets rom the start of the ring (by 
spec).

>      uint16_t vector;
>      VirtIODevice *vdev;
>      EventNotifier guest_notifier;
>      EventNotifier host_notifier;
>    

guest_notifier and host_notifier appear to be new.  I don't see them in 
your previous patchset.  I can't see why you need vdev.  The backlink is 
there only for handle_output because handle_output is passed a 
VirtQueue.  I also don't see you using vector in this patch series.

So at least from what you posted in the RFC, you just need to know the 
ring pa and num which would be totally reasonable to have an accessor for.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 02/13] kvm: add API to set ioeventfd
  2010-01-25 19:28       ` Anthony Liguori
@ 2010-01-25 19:28         ` Michael S. Tsirkin
  2010-01-25 19:44           ` Anthony Liguori
  0 siblings, 1 reply; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-25 19:28 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

On Mon, Jan 25, 2010 at 01:28:49PM -0600, Anthony Liguori wrote:
> On 01/25/2010 01:20 PM, Michael S. Tsirkin wrote:
>> On Tue, Jan 12, 2010 at 04:35:17PM -0600, Anthony Liguori wrote:
>>    
>>> On 01/11/2010 11:17 AM, Michael S. Tsirkin wrote:
>>>      
>>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>>> ---
>>>>    kvm-all.c |   24 ++++++++++++++++++++++++
>>>>    kvm.h     |    3 +++
>>>>    2 files changed, 27 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/kvm-all.c b/kvm-all.c
>>>> index a312654..aa00119 100644
>>>> --- a/kvm-all.c
>>>> +++ b/kvm-all.c
>>>> @@ -1113,3 +1113,27 @@ void kvm_remove_all_breakpoints(CPUState *current_env)
>>>>    {
>>>>    }
>>>>    #endif /* !KVM_CAP_SET_GUEST_DEBUG */
>>>> +
>>>> +#ifdef KVM_IOEVENTFD
>>>> +int kvm_set_ioeventfd(uint16_t addr, uint16_t data, int fd, bool assigned)
>>>> +{
>>>> +    struct kvm_ioeventfd kick = {
>>>> +        .datamatch = data,
>>>> +        .addr = addr,
>>>> +        .len = 2,
>>>> +        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
>>>> +        .fd = fd,
>>>> +    };
>>>> +    if (!assigned)
>>>> +        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
>>>> +    int r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD,&kick);
>>>> +    if (r<   0)
>>>> +        return r;
>>>> +    return 0;
>>>> +}
>>>>
>>>>        
>>> I think we really ought to try to avoid having global state used here.
>>> That means either we need to pass a CPUState to this function or we need
>>> to have a ioeventfd be allocated as a structure that is then passed
>>> around that can store a copy of the kvm_state fetched through CPUState.
>>>
>>> I think the later is best.  We really don't want ioeventfd scattered
>>> throughout the code.
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>      
>>
>> I don't really understand this comment.
>> I have no idea where to get CPUState in
>> a device and how to get kvm state from there.
>>    
>
> This function doesn't seem to get called in the current patches so it's  
> hard to evaluate what the right thing to do is.
>> And all other kvm-all functions use kvm_state
>> already, let's fix them first if we want to
>> get rid of global state?
>>    
>
> Any time an operation is tied to a CPU in some way, we should not rely  
> on global state.  Based on the name and the fact that it references PIO,  
> it strongly suggests to me that it's somehow related to a CPU but since  
> it's not used in the current code it's hard to validate that.
>
> Regards,
>
> Anthony Liguori


It is called in the binding. Pls take a look.
The function does not reference a specific CPU: it binds
an eventfd descriptor to a specific address/data pair
for all CPUs.

>
>> Avi, what's your take on this patch?
>>
>>    

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 02/13] kvm: add API to set ioeventfd
  2010-01-25 19:20     ` Michael S. Tsirkin
@ 2010-01-25 19:28       ` Anthony Liguori
  2010-01-25 19:28         ` Michael S. Tsirkin
  0 siblings, 1 reply; 41+ messages in thread
From: Anthony Liguori @ 2010-01-25 19:28 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/25/2010 01:20 PM, Michael S. Tsirkin wrote:
> On Tue, Jan 12, 2010 at 04:35:17PM -0600, Anthony Liguori wrote:
>    
>> On 01/11/2010 11:17 AM, Michael S. Tsirkin wrote:
>>      
>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>> ---
>>>    kvm-all.c |   24 ++++++++++++++++++++++++
>>>    kvm.h     |    3 +++
>>>    2 files changed, 27 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index a312654..aa00119 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -1113,3 +1113,27 @@ void kvm_remove_all_breakpoints(CPUState *current_env)
>>>    {
>>>    }
>>>    #endif /* !KVM_CAP_SET_GUEST_DEBUG */
>>> +
>>> +#ifdef KVM_IOEVENTFD
>>> +int kvm_set_ioeventfd(uint16_t addr, uint16_t data, int fd, bool assigned)
>>> +{
>>> +    struct kvm_ioeventfd kick = {
>>> +        .datamatch = data,
>>> +        .addr = addr,
>>> +        .len = 2,
>>> +        .flags = KVM_IOEVENTFD_FLAG_DATAMATCH | KVM_IOEVENTFD_FLAG_PIO,
>>> +        .fd = fd,
>>> +    };
>>> +    if (!assigned)
>>> +        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
>>> +    int r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD,&kick);
>>> +    if (r<   0)
>>> +        return r;
>>> +    return 0;
>>> +}
>>>
>>>        
>> I think we really ought to try to avoid having global state used here.
>> That means either we need to pass a CPUState to this function or we need
>> to have a ioeventfd be allocated as a structure that is then passed
>> around that can store a copy of the kvm_state fetched through CPUState.
>>
>> I think the later is best.  We really don't want ioeventfd scattered
>> throughout the code.
>>
>> Regards,
>>
>> Anthony Liguori
>>      
>
> I don't really understand this comment.
> I have no idea where to get CPUState in
> a device and how to get kvm state from there.
>    

This function doesn't seem to get called in the current patches so it's 
hard to evaluate what the right thing to do is.

> And all other kvm-all functions use kvm_state
> already, let's fix them first if we want to
> get rid of global state?
>    

Any time an operation is tied to a CPU in some way, we should not rely 
on global state.  Based on the name and the fact that it references PIO, 
it strongly suggests to me that it's somehow related to a CPU but since 
it's not used in the current code it's hard to validate that.

Regards,

Anthony Liguori


> Avi, what's your take on this patch?
>
>    

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 02/13] kvm: add API to set ioeventfd
  2010-01-25 19:28         ` Michael S. Tsirkin
@ 2010-01-25 19:44           ` Anthony Liguori
  0 siblings, 0 replies; 41+ messages in thread
From: Anthony Liguori @ 2010-01-25 19:44 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/25/2010 01:28 PM, Michael S. Tsirkin wrote:
> It is called in the binding. Pls take a look.
> The function does not reference a specific CPU: it binds
> an eventfd descriptor to a specific address/data pair
> for all CPUs.
>    

This problem is really a symptom of a bigger problem.  Let me respond to 
another patch to provide better context.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-01-11 17:23 ` [Qemu-devel] [PATCH-RFC 13/13] virtio-net: connect to vhost net backend Michael S. Tsirkin
@ 2010-01-25 19:58   ` Anthony Liguori
  2010-01-25 20:27     ` Michael S. Tsirkin
  0 siblings, 1 reply; 41+ messages in thread
From: Anthony Liguori @ 2010-01-25 19:58 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/11/2010 11:23 AM, Michael S. Tsirkin wrote:
> start/stop backend on driver start/stop
>
> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
> ---
>   hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
>   1 files changed, 40 insertions(+), 0 deletions(-)
>
> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
> index c2a389f..99169e1 100644
> --- a/hw/virtio-net.c
> +++ b/hw/virtio-net.c
> @@ -17,6 +17,7 @@
>   #include "net/tap.h"
>   #include "qemu-timer.h"
>   #include "virtio-net.h"
> +#include "vhost_net.h"
>
>   #define VIRTIO_NET_VM_VERSION    11
>
> @@ -47,6 +48,7 @@ typedef struct VirtIONet
>       uint8_t nomulti;
>       uint8_t nouni;
>       uint8_t nobcast;
> +    uint8_t vhost_started;
>       struct {
>           int in_use;
>           int first_multi;
> @@ -114,6 +116,10 @@ static void virtio_net_reset(VirtIODevice *vdev)
>       n->nomulti = 0;
>       n->nouni = 0;
>       n->nobcast = 0;
> +    if (n->vhost_started) {
> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
> +        n->vhost_started = 0;
> +    }
>
>       /* Flush any MAC and VLAN filter table state */
>       n->mac_table.in_use = 0;
> @@ -820,6 +826,36 @@ static NetClientInfo net_virtio_info = {
>       .link_status_changed = virtio_net_set_link_status,
>   };
>
> +static void virtio_net_set_status(struct VirtIODevice *vdev)
> +{
> +    VirtIONet *n = to_virtio_net(vdev);
> +    if (!n->nic->nc.peer) {
> +        return;
> +    }
> +    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
> +        return;
> +    }
> +
> +    if (!tap_get_vhost_net(n->nic->nc.peer)) {
> +        return;
> +    }
> +    if (!!n->vhost_started == !!(vdev->status&  VIRTIO_CONFIG_S_DRIVER_OK)) {
> +        return;
> +    }
> +    if (vdev->status&  VIRTIO_CONFIG_S_DRIVER_OK) {
> +        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
> +        if (r<  0) {
> +            fprintf(stderr, "unable to start vhost net: "
> +                    "falling back on userspace virtio\n");
> +        } else {
> +            n->vhost_started = 1;
> +        }
> +    } else {
> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
> +        n->vhost_started = 0;
> +    }
> +}
> +
>    

This function does a number of bad things.  It makes virtio-net have 
specific knowledge of backends (like tap) and then has virtio-net pass 
device specific state (vdev) to a network backend.

Ultimately, the following things need to happen:

1) when a driver is ready to begin operating:
   a) virtio-net needs to tell vhost the location of the ring in 
physical memory
   b) virtio-net needs to tell vhost about any notification it receives 
(allowing kvm to shortcut userspace)
   c) virtio-net needs to tell vhost about which irq is being used 
(allowing kvm to shortcut userspace)

What this suggests is that we need an API for the network backends to do 
the following:

  1) probe whether ring passthrough is supported
  2) set the *virtual* address of the ring elements
  3) hand the backend some sort of notification object for sending and 
receiving notifications
  4) stop ring passthrough

vhost should not need any direct knowledge of the device.  This 
interface should be enough to communicating the required data.  I think 
the only bit that is a little fuzzy and perhaps non-obvious for the 
current patches is the notification object.  I would expect it look 
something like:

typedef struct IOEvent {
   int type;
   void (*notify)(IOEvent *);
   void (*on_notify)(IOEvent *, void (*cb)(IOEvent *, void *));
} IOEvent;

And then we would have

typedef struct KVMIOEvent {
   IOEvent event = {.type = KVM_IO_EVENT};
   int fd;
} KVMIOEvent;

if (kvm_enabled()) in virtio-net, we would allocate a KVMIOEvent for the 
PIO notification and hand that to vhost.  vhost would check event.type 
and if it's KVM_IOEVENT, downcast and use that to get at the file 
descriptor.

The KVMIOEvent should be created, not in the set status callback, but in 
the PCI map callback.  And in the PCI map callback, cpu_single_env 
should be passed to a kvm specific function to create this KVMIOEvent 
object that contains the created eventfd() that's handed to kvm via ioctl.

It doesn't have to be exactly like this, but the point of all of this is 
that these KVM specific mechanism (which are really implementation 
details) should not be pervasive throughout the QEMU interfaces.  There 
should also be strong separation between the vhost-net code and the 
virtio-net device.

Regards,

Anthony Liguori


>   VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>   {
>       VirtIONet *n;
> @@ -835,6 +871,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>       n->vdev.set_features = virtio_net_set_features;
>       n->vdev.bad_features = virtio_net_bad_features;
>       n->vdev.reset = virtio_net_reset;
> +    n->vdev.set_status = virtio_net_set_status;
>       n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx);
>       n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx);
>       n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
> @@ -864,6 +901,9 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>   void virtio_net_exit(VirtIODevice *vdev)
>   {
>       VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev);
> +    if (n->vhost_started) {
> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
> +    }
>
>       qemu_purge_queued_packets(&n->nic->nc);
>
>    

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-01-25 19:58   ` [Qemu-devel] " Anthony Liguori
@ 2010-01-25 20:27     ` Michael S. Tsirkin
  2010-01-25 21:00       ` Anthony Liguori
  0 siblings, 1 reply; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-25 20:27 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

On Mon, Jan 25, 2010 at 01:58:08PM -0600, Anthony Liguori wrote:
> On 01/11/2010 11:23 AM, Michael S. Tsirkin wrote:
>> start/stop backend on driver start/stop
>>
>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>> ---
>>   hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
>>   1 files changed, 40 insertions(+), 0 deletions(-)
>>
>> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
>> index c2a389f..99169e1 100644
>> --- a/hw/virtio-net.c
>> +++ b/hw/virtio-net.c
>> @@ -17,6 +17,7 @@
>>   #include "net/tap.h"
>>   #include "qemu-timer.h"
>>   #include "virtio-net.h"
>> +#include "vhost_net.h"
>>
>>   #define VIRTIO_NET_VM_VERSION    11
>>
>> @@ -47,6 +48,7 @@ typedef struct VirtIONet
>>       uint8_t nomulti;
>>       uint8_t nouni;
>>       uint8_t nobcast;
>> +    uint8_t vhost_started;
>>       struct {
>>           int in_use;
>>           int first_multi;
>> @@ -114,6 +116,10 @@ static void virtio_net_reset(VirtIODevice *vdev)
>>       n->nomulti = 0;
>>       n->nouni = 0;
>>       n->nobcast = 0;
>> +    if (n->vhost_started) {
>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>> +        n->vhost_started = 0;
>> +    }
>>
>>       /* Flush any MAC and VLAN filter table state */
>>       n->mac_table.in_use = 0;
>> @@ -820,6 +826,36 @@ static NetClientInfo net_virtio_info = {
>>       .link_status_changed = virtio_net_set_link_status,
>>   };
>>
>> +static void virtio_net_set_status(struct VirtIODevice *vdev)
>> +{
>> +    VirtIONet *n = to_virtio_net(vdev);
>> +    if (!n->nic->nc.peer) {
>> +        return;
>> +    }
>> +    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
>> +        return;
>> +    }
>> +
>> +    if (!tap_get_vhost_net(n->nic->nc.peer)) {
>> +        return;
>> +    }
>> +    if (!!n->vhost_started == !!(vdev->status&  VIRTIO_CONFIG_S_DRIVER_OK)) {
>> +        return;
>> +    }
>> +    if (vdev->status&  VIRTIO_CONFIG_S_DRIVER_OK) {
>> +        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
>> +        if (r<  0) {
>> +            fprintf(stderr, "unable to start vhost net: "
>> +                    "falling back on userspace virtio\n");
>> +        } else {
>> +            n->vhost_started = 1;
>> +        }
>> +    } else {
>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>> +        n->vhost_started = 0;
>> +    }
>> +}
>> +
>>    
>
> This function does a number of bad things.  It makes virtio-net have  
> specific knowledge of backends (like tap) and then has virtio-net pass  
> device specific state (vdev) to a network backend.
>
> Ultimately, the following things need to happen:
>
> 1) when a driver is ready to begin operating:
>   a) virtio-net needs to tell vhost the location of the ring in physical 
> memory
>   b) virtio-net needs to tell vhost about any notification it receives  
> (allowing kvm to shortcut userspace)
>   c) virtio-net needs to tell vhost about which irq is being used  
> (allowing kvm to shortcut userspace)
>
> What this suggests is that we need an API for the network backends to do  
> the following:
>
>  1) probe whether ring passthrough is supported
>  2) set the *virtual* address of the ring elements
>  3) hand the backend some sort of notification object for sending and  
> receiving notifications
>  4) stop ring passthrough

Look at how vnet_hdr is setup: frontend probes backend, and has 'if
(backend has vnet header) blabla' vhost is really very similar, so I
would like to do it in the same way.

Generally I do not believe in abstractions that only have one
implementation behind it: you only know how to abstract interface after you
have more than one implementation.  So whoever writes another frontend
that can use vhost will be in a better position to add infrastructure to
abstract both that new thing and virtio.

> vhost should not need any direct knowledge of the device.  This  
> interface should be enough to communicating the required data.  I think  
> the only bit that is a little fuzzy and perhaps non-obvious for the  
> current patches is the notification object.  I would expect it look  
> something like:
>
> typedef struct IOEvent {
>   int type;
>   void (*notify)(IOEvent *);
>   void (*on_notify)(IOEvent *, void (*cb)(IOEvent *, void *));
> } IOEvent;
> And then we would have
>
> typedef struct KVMIOEvent {
>   IOEvent event = {.type = KVM_IO_EVENT};
>   int fd;
> } KVMIOEvent;
>
> if (kvm_enabled()) in virtio-net, we would allocate a KVMIOEvent for the  
> PIO notification and hand that to vhost.  vhost would check event.type  
> and if it's KVM_IOEVENT, downcast and use that to get at the file  
> descriptor.

Since we don't have any other IOEvents, I just put the fd
in the generic structure directly. If other types surface
we'll see how to generalize it.

> The KVMIOEvent should be created, not in the set status callback, but in  
> the PCI map callback.  And in the PCI map callback, cpu_single_env  
> should be passed to a kvm specific function to create this KVMIOEvent  
> object that contains the created eventfd() that's handed to kvm via 
> ioctl.

So this specific thing does not work very well because with irqchip, we
want to bypass notification and send irq directly from kernel.
So I created a structure but it does not have callbacks,
only the fd.

> It doesn't have to be exactly like this, but the point of all of this is  
> that these KVM specific mechanism (which are really implementation  
> details) should not be pervasive throughout the QEMU interfaces.


>  There  
> should also be strong separation between the vhost-net code and the  
> virtio-net device.
>
> Regards,
>
> Anthony Liguori
>

I don't see the point of this last idea.  vhost is virtio accelerator,
not a generic network backend.  Whoever wants to use vhost for
non-virtio frontends will have to add infrastructure for this
separation, but I do not believe it's practical or desirable.



>
>>   VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>>   {
>>       VirtIONet *n;
>> @@ -835,6 +871,7 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>>       n->vdev.set_features = virtio_net_set_features;
>>       n->vdev.bad_features = virtio_net_bad_features;
>>       n->vdev.reset = virtio_net_reset;
>> +    n->vdev.set_status = virtio_net_set_status;
>>       n->rx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_rx);
>>       n->tx_vq = virtio_add_queue(&n->vdev, 256, virtio_net_handle_tx);
>>       n->ctrl_vq = virtio_add_queue(&n->vdev, 64, virtio_net_handle_ctrl);
>> @@ -864,6 +901,9 @@ VirtIODevice *virtio_net_init(DeviceState *dev, NICConf *conf)
>>   void virtio_net_exit(VirtIODevice *vdev)
>>   {
>>       VirtIONet *n = DO_UPCAST(VirtIONet, vdev, vdev);
>> +    if (n->vhost_started) {
>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>> +    }
>>
>>       qemu_purge_queued_packets(&n->nic->nc);
>>
>>    

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-01-25 20:27     ` Michael S. Tsirkin
@ 2010-01-25 21:00       ` Anthony Liguori
  2010-01-25 21:01         ` Michael S. Tsirkin
                           ` (2 more replies)
  0 siblings, 3 replies; 41+ messages in thread
From: Anthony Liguori @ 2010-01-25 21:00 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/25/2010 02:27 PM, Michael S. Tsirkin wrote:
> On Mon, Jan 25, 2010 at 01:58:08PM -0600, Anthony Liguori wrote:
>    
>> On 01/11/2010 11:23 AM, Michael S. Tsirkin wrote:
>>      
>>> start/stop backend on driver start/stop
>>>
>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>> ---
>>>    hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
>>>    1 files changed, 40 insertions(+), 0 deletions(-)
>>>
>>> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
>>> index c2a389f..99169e1 100644
>>> --- a/hw/virtio-net.c
>>> +++ b/hw/virtio-net.c
>>> @@ -17,6 +17,7 @@
>>>    #include "net/tap.h"
>>>    #include "qemu-timer.h"
>>>    #include "virtio-net.h"
>>> +#include "vhost_net.h"
>>>
>>>    #define VIRTIO_NET_VM_VERSION    11
>>>
>>> @@ -47,6 +48,7 @@ typedef struct VirtIONet
>>>        uint8_t nomulti;
>>>        uint8_t nouni;
>>>        uint8_t nobcast;
>>> +    uint8_t vhost_started;
>>>        struct {
>>>            int in_use;
>>>            int first_multi;
>>> @@ -114,6 +116,10 @@ static void virtio_net_reset(VirtIODevice *vdev)
>>>        n->nomulti = 0;
>>>        n->nouni = 0;
>>>        n->nobcast = 0;
>>> +    if (n->vhost_started) {
>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>> +        n->vhost_started = 0;
>>> +    }
>>>
>>>        /* Flush any MAC and VLAN filter table state */
>>>        n->mac_table.in_use = 0;
>>> @@ -820,6 +826,36 @@ static NetClientInfo net_virtio_info = {
>>>        .link_status_changed = virtio_net_set_link_status,
>>>    };
>>>
>>> +static void virtio_net_set_status(struct VirtIODevice *vdev)
>>> +{
>>> +    VirtIONet *n = to_virtio_net(vdev);
>>> +    if (!n->nic->nc.peer) {
>>> +        return;
>>> +    }
>>> +    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
>>> +        return;
>>> +    }
>>> +
>>> +    if (!tap_get_vhost_net(n->nic->nc.peer)) {
>>> +        return;
>>> +    }
>>> +    if (!!n->vhost_started == !!(vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK)) {
>>> +        return;
>>> +    }
>>> +    if (vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK) {
>>> +        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>> +        if (r<   0) {
>>> +            fprintf(stderr, "unable to start vhost net: "
>>> +                    "falling back on userspace virtio\n");
>>> +        } else {
>>> +            n->vhost_started = 1;
>>> +        }
>>> +    } else {
>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>> +        n->vhost_started = 0;
>>> +    }
>>> +}
>>> +
>>>
>>>        
>> This function does a number of bad things.  It makes virtio-net have
>> specific knowledge of backends (like tap) and then has virtio-net pass
>> device specific state (vdev) to a network backend.
>>
>> Ultimately, the following things need to happen:
>>
>> 1) when a driver is ready to begin operating:
>>    a) virtio-net needs to tell vhost the location of the ring in physical
>> memory
>>    b) virtio-net needs to tell vhost about any notification it receives
>> (allowing kvm to shortcut userspace)
>>    c) virtio-net needs to tell vhost about which irq is being used
>> (allowing kvm to shortcut userspace)
>>
>> What this suggests is that we need an API for the network backends to do
>> the following:
>>
>>   1) probe whether ring passthrough is supported
>>   2) set the *virtual* address of the ring elements
>>   3) hand the backend some sort of notification object for sending and
>> receiving notifications
>>   4) stop ring passthrough
>>      
> Look at how vnet_hdr is setup: frontend probes backend, and has 'if
> (backend has vnet header) blabla' vhost is really very similar, so I
> would like to do it in the same way.
>    

vnet_hdr is IMHO a really bad example to copy from.

vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this 
means is that if you want to migrate from -net tap -net nic,model=virtio 
to -net user -net nic,model=virtio, it will fail.

This is a hard problem to solve in qemu though because it would require 
that we implement software GSO which so far, no one has stepped up to do.

We don't have to repeat this with vhost-net though because unlike 
vnet_hdr, we don't have to expose anything to the guest.

> Generally I do not believe in abstractions that only have one
> implementation behind it: you only know how to abstract interface after you
> have more than one implementation.  So whoever writes another frontend
> that can use vhost will be in a better position to add infrastructure to
> abstract both that new thing and virtio.
>    

I agree with you, but at the same time, I also believe that layering 
violations should be avoided.  I'm not suggesting that you need to make 
anything other than the vhost-net + virtio-net case work.  Just make the 
interfaces abstract enough that you don't need to hand a vdev to 
vhost-net and such that you don't have to pass kvm specific data 
structure (ioeventfd) in what are supposed to be generic interfaces.

>> vhost should not need any direct knowledge of the device.  This
>> interface should be enough to communicating the required data.  I think
>> the only bit that is a little fuzzy and perhaps non-obvious for the
>> current patches is the notification object.  I would expect it look
>> something like:
>>
>> typedef struct IOEvent {
>>    int type;
>>    void (*notify)(IOEvent *);
>>    void (*on_notify)(IOEvent *, void (*cb)(IOEvent *, void *));
>> } IOEvent;
>> And then we would have
>>
>> typedef struct KVMIOEvent {
>>    IOEvent event = {.type = KVM_IO_EVENT};
>>    int fd;
>> } KVMIOEvent;
>>
>> if (kvm_enabled()) in virtio-net, we would allocate a KVMIOEvent for the
>> PIO notification and hand that to vhost.  vhost would check event.type
>> and if it's KVM_IOEVENT, downcast and use that to get at the file
>> descriptor.
>>      
> Since we don't have any other IOEvents, I just put the fd
> in the generic structure directly. If other types surface
> we'll see how to generalize it.
>    

I'd feel a whole lot better if we didn't pass the fd around and instead 
passed around a structure.  With just a tiny bit of layering, we can 
even avoid making that structure KVM specific :-)

>> The KVMIOEvent should be created, not in the set status callback, but in
>> the PCI map callback.  And in the PCI map callback, cpu_single_env
>> should be passed to a kvm specific function to create this KVMIOEvent
>> object that contains the created eventfd() that's handed to kvm via
>> ioctl.
>>      
> So this specific thing does not work very well because with irqchip, we
> want to bypass notification and send irq directly from kernel.
> So I created a structure but it does not have callbacks,
> only the fd.
>    

Your structure is an int, right?  I understand the limits due to lack of 
in-kernel irqchip but I still think passing around an fd is a mistake.

>>   There
>> should also be strong separation between the vhost-net code and the
>> virtio-net device.
>>
>> Regards,
>>
>> Anthony Liguori
>>
>>      
> I don't see the point of this last idea.  vhost is virtio accelerator,
> not a generic network backend.  Whoever wants to use vhost for
> non-virtio frontends will have to add infrastructure for this
> separation, but I do not believe it's practical or desirable.
>    

vhost is a userspace interface to inject packets into a network device 
just like a raw socket or a tun/tap device is.  It happens to have some 
interesting features like it supports remapping of physical addresses 
and it implements interfaces for notifications that can conveniently be 
used by KVM to bypass userspace in the fast paths.

We should think of it this way for the same reason that vhost-net 
doesn't live in kvm.ko.  If it's easy to isolate something and make it 
more generic, it's almost always the right thing to do.  In this case, 
isolating vhost-net from virtio-net in qemu just involves passing some 
information in a function call verses passing a non-public data 
structure that is then accessed directly.  Regardless of whether you 
agree with my view of vhost-net, the argument alone to avoid making a 
non-public structure public should be enough of an argument.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-01-25 21:00       ` Anthony Liguori
@ 2010-01-25 21:01         ` Michael S. Tsirkin
  2010-01-25 21:05         ` Michael S. Tsirkin
  2010-02-24  3:14         ` Paul Brook
  2 siblings, 0 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-25 21:01 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

On Mon, Jan 25, 2010 at 03:00:16PM -0600, Anthony Liguori wrote:
> On 01/25/2010 02:27 PM, Michael S. Tsirkin wrote:
>> On Mon, Jan 25, 2010 at 01:58:08PM -0600, Anthony Liguori wrote:
>>    
>>> On 01/11/2010 11:23 AM, Michael S. Tsirkin wrote:
>>>      
>>>> start/stop backend on driver start/stop
>>>>
>>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>>> ---
>>>>    hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
>>>>    1 files changed, 40 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
>>>> index c2a389f..99169e1 100644
>>>> --- a/hw/virtio-net.c
>>>> +++ b/hw/virtio-net.c
>>>> @@ -17,6 +17,7 @@
>>>>    #include "net/tap.h"
>>>>    #include "qemu-timer.h"
>>>>    #include "virtio-net.h"
>>>> +#include "vhost_net.h"
>>>>
>>>>    #define VIRTIO_NET_VM_VERSION    11
>>>>
>>>> @@ -47,6 +48,7 @@ typedef struct VirtIONet
>>>>        uint8_t nomulti;
>>>>        uint8_t nouni;
>>>>        uint8_t nobcast;
>>>> +    uint8_t vhost_started;
>>>>        struct {
>>>>            int in_use;
>>>>            int first_multi;
>>>> @@ -114,6 +116,10 @@ static void virtio_net_reset(VirtIODevice *vdev)
>>>>        n->nomulti = 0;
>>>>        n->nouni = 0;
>>>>        n->nobcast = 0;
>>>> +    if (n->vhost_started) {
>>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        n->vhost_started = 0;
>>>> +    }
>>>>
>>>>        /* Flush any MAC and VLAN filter table state */
>>>>        n->mac_table.in_use = 0;
>>>> @@ -820,6 +826,36 @@ static NetClientInfo net_virtio_info = {
>>>>        .link_status_changed = virtio_net_set_link_status,
>>>>    };
>>>>
>>>> +static void virtio_net_set_status(struct VirtIODevice *vdev)
>>>> +{
>>>> +    VirtIONet *n = to_virtio_net(vdev);
>>>> +    if (!n->nic->nc.peer) {
>>>> +        return;
>>>> +    }
>>>> +    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    if (!tap_get_vhost_net(n->nic->nc.peer)) {
>>>> +        return;
>>>> +    }
>>>> +    if (!!n->vhost_started == !!(vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK)) {
>>>> +        return;
>>>> +    }
>>>> +    if (vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK) {
>>>> +        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        if (r<   0) {
>>>> +            fprintf(stderr, "unable to start vhost net: "
>>>> +                    "falling back on userspace virtio\n");
>>>> +        } else {
>>>> +            n->vhost_started = 1;
>>>> +        }
>>>> +    } else {
>>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        n->vhost_started = 0;
>>>> +    }
>>>> +}
>>>> +
>>>>
>>>>        
>>> This function does a number of bad things.  It makes virtio-net have
>>> specific knowledge of backends (like tap) and then has virtio-net pass
>>> device specific state (vdev) to a network backend.
>>>
>>> Ultimately, the following things need to happen:
>>>
>>> 1) when a driver is ready to begin operating:
>>>    a) virtio-net needs to tell vhost the location of the ring in physical
>>> memory
>>>    b) virtio-net needs to tell vhost about any notification it receives
>>> (allowing kvm to shortcut userspace)
>>>    c) virtio-net needs to tell vhost about which irq is being used
>>> (allowing kvm to shortcut userspace)
>>>
>>> What this suggests is that we need an API for the network backends to do
>>> the following:
>>>
>>>   1) probe whether ring passthrough is supported
>>>   2) set the *virtual* address of the ring elements
>>>   3) hand the backend some sort of notification object for sending and
>>> receiving notifications
>>>   4) stop ring passthrough
>>>      
>> Look at how vnet_hdr is setup: frontend probes backend, and has 'if
>> (backend has vnet header) blabla' vhost is really very similar, so I
>> would like to do it in the same way.
>>    
>
> vnet_hdr is IMHO a really bad example to copy from.
>
> vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this  
> means is that if you want to migrate from -net tap -net nic,model=virtio  
> to -net user -net nic,model=virtio, it will fail.
>
> This is a hard problem to solve in qemu though because it would require  
> that we implement software GSO which so far, no one has stepped up to do.
>
> We don't have to repeat this with vhost-net though because unlike  
> vnet_hdr, we don't have to expose anything to the guest.

We do, capabilities depend on what kernel supports.

>> Generally I do not believe in abstractions that only have one
>> implementation behind it: you only know how to abstract interface after you
>> have more than one implementation.  So whoever writes another frontend
>> that can use vhost will be in a better position to add infrastructure to
>> abstract both that new thing and virtio.
>>    
>
> I agree with you, but at the same time, I also believe that layering  
> violations should be avoided.  I'm not suggesting that you need to make  
> anything other than the vhost-net + virtio-net case work.  Just make the  
> interfaces abstract enough that you don't need to hand a vdev to  
> vhost-net and such that you don't have to pass kvm specific data  
> structure (ioeventfd) in what are supposed to be generic interfaces.
>
>>> vhost should not need any direct knowledge of the device.  This
>>> interface should be enough to communicating the required data.  I think
>>> the only bit that is a little fuzzy and perhaps non-obvious for the
>>> current patches is the notification object.  I would expect it look
>>> something like:
>>>
>>> typedef struct IOEvent {
>>>    int type;
>>>    void (*notify)(IOEvent *);
>>>    void (*on_notify)(IOEvent *, void (*cb)(IOEvent *, void *));
>>> } IOEvent;
>>> And then we would have
>>>
>>> typedef struct KVMIOEvent {
>>>    IOEvent event = {.type = KVM_IO_EVENT};
>>>    int fd;
>>> } KVMIOEvent;
>>>
>>> if (kvm_enabled()) in virtio-net, we would allocate a KVMIOEvent for the
>>> PIO notification and hand that to vhost.  vhost would check event.type
>>> and if it's KVM_IOEVENT, downcast and use that to get at the file
>>> descriptor.
>>>      
>> Since we don't have any other IOEvents, I just put the fd
>> in the generic structure directly. If other types surface
>> we'll see how to generalize it.
>>    
>
> I'd feel a whole lot better if we didn't pass the fd around and instead  
> passed around a structure.  With just a tiny bit of layering, we can  
> even avoid making that structure KVM specific :-)
>
>>> The KVMIOEvent should be created, not in the set status callback, but in
>>> the PCI map callback.  And in the PCI map callback, cpu_single_env
>>> should be passed to a kvm specific function to create this KVMIOEvent
>>> object that contains the created eventfd() that's handed to kvm via
>>> ioctl.
>>>      
>> So this specific thing does not work very well because with irqchip, we
>> want to bypass notification and send irq directly from kernel.
>> So I created a structure but it does not have callbacks,
>> only the fd.
>>    
>
> Your structure is an int, right?  I understand the limits due to lack of  
> in-kernel irqchip but I still think passing around an fd is a mistake.
>
>>>   There
>>> should also be strong separation between the vhost-net code and the
>>> virtio-net device.
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>>      
>> I don't see the point of this last idea.  vhost is virtio accelerator,
>> not a generic network backend.  Whoever wants to use vhost for
>> non-virtio frontends will have to add infrastructure for this
>> separation, but I do not believe it's practical or desirable.
>>    
>
> vhost is a userspace interface to inject packets into a network device  
> just like a raw socket or a tun/tap device is.  It happens to have some  
> interesting features like it supports remapping of physical addresses  
> and it implements interfaces for notifications that can conveniently be  
> used by KVM to bypass userspace in the fast paths.
>
> We should think of it this way for the same reason that vhost-net  
> doesn't live in kvm.ko.  If it's easy to isolate something and make it  
> more generic, it's almost always the right thing to do.  In this case,  
> isolating vhost-net from virtio-net in qemu just involves passing some  
> information in a function call verses passing a non-public data  
> structure that is then accessed directly.  Regardless of whether you  
> agree with my view of vhost-net, the argument alone to avoid making a  
> non-public structure public should be enough of an argument.
>
> Regards,
>
> Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-01-25 21:00       ` Anthony Liguori
  2010-01-25 21:01         ` Michael S. Tsirkin
@ 2010-01-25 21:05         ` Michael S. Tsirkin
  2010-01-25 21:11           ` Anthony Liguori
  2010-02-24  3:14         ` Paul Brook
  2 siblings, 1 reply; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-01-25 21:05 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

On Mon, Jan 25, 2010 at 03:00:16PM -0600, Anthony Liguori wrote:
> On 01/25/2010 02:27 PM, Michael S. Tsirkin wrote:
>> On Mon, Jan 25, 2010 at 01:58:08PM -0600, Anthony Liguori wrote:
>>    
>>> On 01/11/2010 11:23 AM, Michael S. Tsirkin wrote:
>>>      
>>>> start/stop backend on driver start/stop
>>>>
>>>> Signed-off-by: Michael S. Tsirkin<mst@redhat.com>
>>>> ---
>>>>    hw/virtio-net.c |   40 ++++++++++++++++++++++++++++++++++++++++
>>>>    1 files changed, 40 insertions(+), 0 deletions(-)
>>>>
>>>> diff --git a/hw/virtio-net.c b/hw/virtio-net.c
>>>> index c2a389f..99169e1 100644
>>>> --- a/hw/virtio-net.c
>>>> +++ b/hw/virtio-net.c
>>>> @@ -17,6 +17,7 @@
>>>>    #include "net/tap.h"
>>>>    #include "qemu-timer.h"
>>>>    #include "virtio-net.h"
>>>> +#include "vhost_net.h"
>>>>
>>>>    #define VIRTIO_NET_VM_VERSION    11
>>>>
>>>> @@ -47,6 +48,7 @@ typedef struct VirtIONet
>>>>        uint8_t nomulti;
>>>>        uint8_t nouni;
>>>>        uint8_t nobcast;
>>>> +    uint8_t vhost_started;
>>>>        struct {
>>>>            int in_use;
>>>>            int first_multi;
>>>> @@ -114,6 +116,10 @@ static void virtio_net_reset(VirtIODevice *vdev)
>>>>        n->nomulti = 0;
>>>>        n->nouni = 0;
>>>>        n->nobcast = 0;
>>>> +    if (n->vhost_started) {
>>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        n->vhost_started = 0;
>>>> +    }
>>>>
>>>>        /* Flush any MAC and VLAN filter table state */
>>>>        n->mac_table.in_use = 0;
>>>> @@ -820,6 +826,36 @@ static NetClientInfo net_virtio_info = {
>>>>        .link_status_changed = virtio_net_set_link_status,
>>>>    };
>>>>
>>>> +static void virtio_net_set_status(struct VirtIODevice *vdev)
>>>> +{
>>>> +    VirtIONet *n = to_virtio_net(vdev);
>>>> +    if (!n->nic->nc.peer) {
>>>> +        return;
>>>> +    }
>>>> +    if (n->nic->nc.peer->info->type != NET_CLIENT_TYPE_TAP) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    if (!tap_get_vhost_net(n->nic->nc.peer)) {
>>>> +        return;
>>>> +    }
>>>> +    if (!!n->vhost_started == !!(vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK)) {
>>>> +        return;
>>>> +    }
>>>> +    if (vdev->status&   VIRTIO_CONFIG_S_DRIVER_OK) {
>>>> +        int r = vhost_net_start(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        if (r<   0) {
>>>> +            fprintf(stderr, "unable to start vhost net: "
>>>> +                    "falling back on userspace virtio\n");
>>>> +        } else {
>>>> +            n->vhost_started = 1;
>>>> +        }
>>>> +    } else {
>>>> +        vhost_net_stop(tap_get_vhost_net(n->nic->nc.peer), vdev);
>>>> +        n->vhost_started = 0;
>>>> +    }
>>>> +}
>>>> +
>>>>
>>>>        
>>> This function does a number of bad things.  It makes virtio-net have
>>> specific knowledge of backends (like tap) and then has virtio-net pass
>>> device specific state (vdev) to a network backend.
>>>
>>> Ultimately, the following things need to happen:
>>>
>>> 1) when a driver is ready to begin operating:
>>>    a) virtio-net needs to tell vhost the location of the ring in physical
>>> memory
>>>    b) virtio-net needs to tell vhost about any notification it receives
>>> (allowing kvm to shortcut userspace)
>>>    c) virtio-net needs to tell vhost about which irq is being used
>>> (allowing kvm to shortcut userspace)
>>>
>>> What this suggests is that we need an API for the network backends to do
>>> the following:
>>>
>>>   1) probe whether ring passthrough is supported
>>>   2) set the *virtual* address of the ring elements
>>>   3) hand the backend some sort of notification object for sending and
>>> receiving notifications
>>>   4) stop ring passthrough
>>>      
>> Look at how vnet_hdr is setup: frontend probes backend, and has 'if
>> (backend has vnet header) blabla' vhost is really very similar, so I
>> would like to do it in the same way.
>>    
>
> vnet_hdr is IMHO a really bad example to copy from.
>
> vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this  
> means is that if you want to migrate from -net tap -net nic,model=virtio  
> to -net user -net nic,model=virtio, it will fail.
>
> This is a hard problem to solve in qemu though because it would require  
> that we implement software GSO which so far, no one has stepped up to do.
>
> We don't have to repeat this with vhost-net though because unlike  
> vnet_hdr, we don't have to expose anything to the guest.
>
>> Generally I do not believe in abstractions that only have one
>> implementation behind it: you only know how to abstract interface after you
>> have more than one implementation.  So whoever writes another frontend
>> that can use vhost will be in a better position to add infrastructure to
>> abstract both that new thing and virtio.
>>    
>
> I agree with you, but at the same time, I also believe that layering  
> violations should be avoided.  I'm not suggesting that you need to make  
> anything other than the vhost-net + virtio-net case work.  Just make the  
> interfaces abstract enough that you don't need to hand a vdev to  
> vhost-net and such that you don't have to pass kvm specific data  
> structure (ioeventfd) in what are supposed to be generic interfaces.
>
>>> vhost should not need any direct knowledge of the device.  This
>>> interface should be enough to communicating the required data.  I think
>>> the only bit that is a little fuzzy and perhaps non-obvious for the
>>> current patches is the notification object.  I would expect it look
>>> something like:
>>>
>>> typedef struct IOEvent {
>>>    int type;
>>>    void (*notify)(IOEvent *);
>>>    void (*on_notify)(IOEvent *, void (*cb)(IOEvent *, void *));
>>> } IOEvent;
>>> And then we would have
>>>
>>> typedef struct KVMIOEvent {
>>>    IOEvent event = {.type = KVM_IO_EVENT};
>>>    int fd;
>>> } KVMIOEvent;
>>>
>>> if (kvm_enabled()) in virtio-net, we would allocate a KVMIOEvent for the
>>> PIO notification and hand that to vhost.  vhost would check event.type
>>> and if it's KVM_IOEVENT, downcast and use that to get at the file
>>> descriptor.
>>>      
>> Since we don't have any other IOEvents, I just put the fd
>> in the generic structure directly. If other types surface
>> we'll see how to generalize it.
>>    
>
> I'd feel a whole lot better if we didn't pass the fd around and instead  
> passed around a structure.  With just a tiny bit of layering, we can  
> even avoid making that structure KVM specific :-)
>
>>> The KVMIOEvent should be created, not in the set status callback, but in
>>> the PCI map callback.  And in the PCI map callback, cpu_single_env
>>> should be passed to a kvm specific function to create this KVMIOEvent
>>> object that contains the created eventfd() that's handed to kvm via
>>> ioctl.
>>>      
>> So this specific thing does not work very well because with irqchip, we
>> want to bypass notification and send irq directly from kernel.
>> So I created a structure but it does not have callbacks,
>> only the fd.
>>    
>
> Your structure is an int, right?

it *has* an int:
	struct EventNotifier {
		int fd;
	};

> I understand the limits due to lack of  
> in-kernel irqchip but I still think passing around an fd is a mistake.

So API's will all use EventNotifier *.
We'll be able to add downcasting etc if/when we need it.

>>> There
>>> should also be strong separation between the vhost-net code and the
>>> virtio-net device.
>>>
>>> Regards,
>>>
>>> Anthony Liguori
>>>
>>>      
>> I don't see the point of this last idea.  vhost is virtio accelerator,
>> not a generic network backend.  Whoever wants to use vhost for
>> non-virtio frontends will have to add infrastructure for this
>> separation, but I do not believe it's practical or desirable.
>>    
>
> vhost is a userspace interface to inject packets into a network device  
> just like a raw socket or a tun/tap device is.  It happens to have some  
> interesting features like it supports remapping of physical addresses  
> and it implements interfaces for notifications that can conveniently be  
> used by KVM to bypass userspace in the fast paths.
>
> We should think of it this way for the same reason that vhost-net  
> doesn't live in kvm.ko.  If it's easy to isolate something and make it  
> more generic, it's almost always the right thing to do.  In this case,  
> isolating vhost-net from virtio-net in qemu just involves passing some  
> information in a function call verses passing a non-public data  
> structure that is then accessed directly.  Regardless of whether you  
> agree with my view of vhost-net, the argument alone to avoid making a  
> non-public structure public should be enough of an argument.
>
> Regards,
>
> Anthony Liguori

I'll add accessors, it's not a big deal.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-01-25 21:05         ` Michael S. Tsirkin
@ 2010-01-25 21:11           ` Anthony Liguori
  0 siblings, 0 replies; 41+ messages in thread
From: Anthony Liguori @ 2010-01-25 21:11 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

On 01/25/2010 03:05 PM, Michael S. Tsirkin wrote:
> it *has* an int:
> 	struct EventNotifier {
> 		int fd;
> 	};
>    

Yes, this would be a lot better.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-01-25 21:00       ` Anthony Liguori
  2010-01-25 21:01         ` Michael S. Tsirkin
  2010-01-25 21:05         ` Michael S. Tsirkin
@ 2010-02-24  3:14         ` Paul Brook
  2010-02-24  5:29           ` Michael S. Tsirkin
  2010-02-24 14:51           ` Anthony Liguori
  2 siblings, 2 replies; 41+ messages in thread
From: Paul Brook @ 2010-02-24  3:14 UTC (permalink / raw)
  To: qemu-devel; +Cc: Michael S. Tsirkin

> vnet_hdr is IMHO a really bad example to copy from.
> 
> vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this
> means is that if you want to migrate from -net tap -net nic,model=virtio
> to -net user -net nic,model=virtio, it will fail.
> 
> This is a hard problem to solve in qemu though because it would require
> that we implement software GSO which so far, no one has stepped up to do.

Or make virtio-net pass this on to the guest, and have that deal with the 
problem. If you implement software GSO, then devices can assume it's always 
present and don't need to probe.

Paul

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-02-24  3:14         ` Paul Brook
@ 2010-02-24  5:29           ` Michael S. Tsirkin
  2010-02-24 11:30             ` Paul Brook
  2010-02-24 14:51           ` Anthony Liguori
  1 sibling, 1 reply; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-02-24  5:29 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Wed, Feb 24, 2010 at 03:14:25AM +0000, Paul Brook wrote:
> > vnet_hdr is IMHO a really bad example to copy from.
> > 
> > vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this
> > means is that if you want to migrate from -net tap -net nic,model=virtio
> > to -net user -net nic,model=virtio, it will fail.
> > 
> > This is a hard problem to solve in qemu though because it would require
> > that we implement software GSO which so far, no one has stepped up to do.
> 
> Or make virtio-net pass this on to the guest, and have that deal with the 
> problem.

This is exacly what we do, via feature bits.

> If you implement software GSO, then devices can assume it's always 
> present and don't need to probe.
> 
> Paul

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-02-24  5:29           ` Michael S. Tsirkin
@ 2010-02-24 11:30             ` Paul Brook
  2010-02-24 11:46               ` Michael S. Tsirkin
  0 siblings, 1 reply; 41+ messages in thread
From: Paul Brook @ 2010-02-24 11:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: Michael S. Tsirkin

> On Wed, Feb 24, 2010 at 03:14:25AM +0000, Paul Brook wrote:
> > > vnet_hdr is IMHO a really bad example to copy from.
> > >
> > > vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this
> > > means is that if you want to migrate from -net tap -net
> > > nic,model=virtio to -net user -net nic,model=virtio, it will fail.
> > >
> > > This is a hard problem to solve in qemu though because it would require
> > > that we implement software GSO which so far, no one has stepped up to
> > > do.
> >
> > Or make virtio-net pass this on to the guest, and have that deal with the
> > problem.
> 
> This is exacly what we do, via feature bits.

AFAIK we only have static feature bits. There aren't useful for anything that 
the user may change on the fly (or via migration).

If you don't have a fallback implementation then the guest must be able to 
cope with this feature disappearing without warning. If we do have a software 
fallback then the feature bit is just for backwards compatibility, and should 
be enabled unconditionally (on current machine types).

Paul

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-02-24 11:30             ` Paul Brook
@ 2010-02-24 11:46               ` Michael S. Tsirkin
  2010-02-24 12:26                 ` Paul Brook
  2010-02-24 15:16                 ` Anthony Liguori
  0 siblings, 2 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-02-24 11:46 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Wed, Feb 24, 2010 at 11:30:18AM +0000, Paul Brook wrote:
> > On Wed, Feb 24, 2010 at 03:14:25AM +0000, Paul Brook wrote:
> > > > vnet_hdr is IMHO a really bad example to copy from.
> > > >
> > > > vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this
> > > > means is that if you want to migrate from -net tap -net
> > > > nic,model=virtio to -net user -net nic,model=virtio, it will fail.
> > > >
> > > > This is a hard problem to solve in qemu though because it would require
> > > > that we implement software GSO which so far, no one has stepped up to
> > > > do.
> > >
> > > Or make virtio-net pass this on to the guest, and have that deal with the
> > > problem.
> > 
> > This is exacly what we do, via feature bits.
> 
> AFAIK we only have static feature bits. There aren't useful for anything that 
> the user may change on the fly (or via migration).

Ah, you mean telling the guest to switch features on and off: natureally
this would require guest driver changes.  This might be also non-trivial
to implement. Consider a request that has been posted without checksum,
suddenly we disable checksum support.  Guest will need a way to handle
that.  Guest OSes might also not be prepared to handle device features
going away.

> If you don't have a fallback implementation then the guest must be able to 
> cope with this feature disappearing without warning.

Instead, we simply fail migration at the moment. We also use machine
type to let users force some level of homogenuity in the cluster.

> If we do have a software 
> fallback then the feature bit is just for backwards compatibility, and should 
> be enabled unconditionally (on current machine types).
> 
> Paul

Software fallback might turn out to be slower than disabling the feature
in the guest. For example, and extra pass over packet might cause extra CPU
cache thrashing. In that case, it's not obvious whether enabling it
unconditionally will turn out to be a good idea. But we'll have to have
that code to be able to tell.

-- 
MST

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-02-24 11:46               ` Michael S. Tsirkin
@ 2010-02-24 12:26                 ` Paul Brook
  2010-02-24 12:40                   ` Michael S. Tsirkin
  2010-02-24 15:16                 ` Anthony Liguori
  1 sibling, 1 reply; 41+ messages in thread
From: Paul Brook @ 2010-02-24 12:26 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: qemu-devel

> > If we do have a software
> > fallback then the feature bit is just for backwards compatibility, and
> > should be enabled unconditionally (on current machine types).
>
> Software fallback might turn out to be slower than disabling the feature
> in the guest. For example, and extra pass over packet might cause extra CPU
> cache thrashing. In that case, it's not obvious whether enabling it
> unconditionally will turn out to be a good idea. But we'll have to have
> that code to be able to tell.

IMO once you accept that these things can change, consistency is more 
important than trying to guess what the "best" option may be.
Starting qemu on machine A and migrating to machine B should give the same 
guest environment as starting on machine B.

Paul

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-02-24 12:26                 ` Paul Brook
@ 2010-02-24 12:40                   ` Michael S. Tsirkin
  0 siblings, 0 replies; 41+ messages in thread
From: Michael S. Tsirkin @ 2010-02-24 12:40 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel

On Wed, Feb 24, 2010 at 12:26:53PM +0000, Paul Brook wrote:
> > > If we do have a software
> > > fallback then the feature bit is just for backwards compatibility, and
> > > should be enabled unconditionally (on current machine types).
> >
> > Software fallback might turn out to be slower than disabling the feature
> > in the guest. For example, and extra pass over packet might cause extra CPU
> > cache thrashing. In that case, it's not obvious whether enabling it
> > unconditionally will turn out to be a good idea. But we'll have to have
> > that code to be able to tell.
> 
> IMO once you accept that these things can change, consistency is more 
> important than trying to guess what the "best" option may be.

Yes, SW fallback might be nice to have. What's important is likely to
depend on specific user.

> Starting qemu on machine A and migrating to machine B should give the same 
> guest environment as starting on machine B.
> 
> Paul

So currently, the way we try to ensure this is by checking feature bits
against the list supported by backend, and failing migration if there's
a discrepancy.


-- 
MST

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-02-24  3:14         ` Paul Brook
  2010-02-24  5:29           ` Michael S. Tsirkin
@ 2010-02-24 14:51           ` Anthony Liguori
  1 sibling, 0 replies; 41+ messages in thread
From: Anthony Liguori @ 2010-02-24 14:51 UTC (permalink / raw)
  To: Paul Brook; +Cc: qemu-devel, Michael S. Tsirkin

On 02/23/2010 09:14 PM, Paul Brook wrote:
>> vnet_hdr is IMHO a really bad example to copy from.
>>
>> vnet_hdr leaks into the migration state via n->has_vnet_hdr.  What this
>> means is that if you want to migrate from -net tap -net nic,model=virtio
>> to -net user -net nic,model=virtio, it will fail.
>>
>> This is a hard problem to solve in qemu though because it would require
>> that we implement software GSO which so far, no one has stepped up to do.
>>      
> Or make virtio-net pass this on to the guest, and have that deal with the
> problem. If you implement software GSO, then devices can assume it's always
> present and don't need to probe.
>    

That would make migration guest cooperative.  This implies that the 
guests become an important part of the test matrix with respect to 
migration.  The result is a combinatorial expansion of the test matrix.

There's a lot of value in having transparent migration.  Maybe it makes 
sense to have some sort of guest notification for the purposes of 
optimization, but we should be very careful about that because the 
practical cost is huge.

Regards,

Anthony Liguori

> Paul
>    

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [Qemu-devel] Re: [PATCH-RFC 13/13] virtio-net: connect to vhost net backend
  2010-02-24 11:46               ` Michael S. Tsirkin
  2010-02-24 12:26                 ` Paul Brook
@ 2010-02-24 15:16                 ` Anthony Liguori
  1 sibling, 0 replies; 41+ messages in thread
From: Anthony Liguori @ 2010-02-24 15:16 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Paul Brook, qemu-devel

On 02/24/2010 05:46 AM, Michael S. Tsirkin wrote:
>
> Ah, you mean telling the guest to switch features on and off: natureally
> this would require guest driver changes.  This might be also non-trivial
> to implement. Consider a request that has been posted without checksum,
> suddenly we disable checksum support.  Guest will need a way to handle
> that.  Guest OSes might also not be prepared to handle device features
> going away.
>    

It's the same as cpuid flags.

Management tools should plan appropriately and not advertise virtio 
features that cannot be supported throughout the migration pool.  
Fortunately, someone had enough foresight to allow management software 
to disable virtio features on a per-feature basis :-)

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 41+ messages in thread

end of thread, other threads:[~2010-02-24 15:16 UTC | newest]

Thread overview: 41+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
     [not found] <cover.1263230079.git.mst@redhat.com>
2010-01-11 17:16 ` [Qemu-devel] [PATCH-RFC 01/13] virtio: export virtqueue structure Michael S. Tsirkin
2010-01-12 22:32   ` [Qemu-devel] " Anthony Liguori
2010-01-25 19:10     ` Michael S. Tsirkin
2010-01-25 19:23       ` Anthony Liguori
2010-01-11 17:17 ` [Qemu-devel] [PATCH-RFC 02/13] kvm: add API to set ioeventfd Michael S. Tsirkin
2010-01-12 22:35   ` [Qemu-devel] " Anthony Liguori
2010-01-25 19:20     ` Michael S. Tsirkin
2010-01-25 19:28       ` Anthony Liguori
2010-01-25 19:28         ` Michael S. Tsirkin
2010-01-25 19:44           ` Anthony Liguori
2010-01-11 17:17 ` [Qemu-devel] [PATCH-RFC 03/13] virtio: add iofd/irqfd support Michael S. Tsirkin
2010-01-12 22:36   ` [Qemu-devel] " Anthony Liguori
2010-01-13 10:50     ` Michael S. Tsirkin
2010-01-11 17:17 ` [Qemu-devel] [PATCH-RFC 04/13] virtio-pci: fill in irqfd/queufd support Michael S. Tsirkin
2010-01-11 17:19 ` [Qemu-devel] [PATCH-RFC 05/13] syborg_virtio: add irqfd/eventfd support Michael S. Tsirkin
2010-01-11 17:20 ` [Qemu-devel] [PATCH-RFC 06/13] s390-virtio: fill in irqfd support Michael S. Tsirkin
2010-01-11 17:20 ` [Qemu-devel] [PATCH-RFC 07/13] virtio: move typedef to qemu-common Michael S. Tsirkin
2010-01-11 17:20 ` [Qemu-devel] [PATCH-RFC 08/13] net/tap: add interface to get device fd Michael S. Tsirkin
2010-01-11 17:22 ` [Qemu-devel] [PATCH-RFC 09/13] tap: add vhost/vhostfd options Michael S. Tsirkin
2010-01-12 22:39   ` [Qemu-devel] " Anthony Liguori
2010-01-13 10:59     ` Michael S. Tsirkin
2010-01-12 22:42   ` Anthony Liguori
2010-01-11 17:22 ` [Qemu-devel] [PATCH-RFC 10/13] tap: add API to retrieve vhost net header Michael S. Tsirkin
2010-01-11 17:23 ` [Qemu-devel] [PATCH-RFC 12/13] virtio: add status change callback Michael S. Tsirkin
2010-01-11 17:23 ` [Qemu-devel] [PATCH-RFC 13/13] virtio-net: connect to vhost net backend Michael S. Tsirkin
2010-01-25 19:58   ` [Qemu-devel] " Anthony Liguori
2010-01-25 20:27     ` Michael S. Tsirkin
2010-01-25 21:00       ` Anthony Liguori
2010-01-25 21:01         ` Michael S. Tsirkin
2010-01-25 21:05         ` Michael S. Tsirkin
2010-01-25 21:11           ` Anthony Liguori
2010-02-24  3:14         ` Paul Brook
2010-02-24  5:29           ` Michael S. Tsirkin
2010-02-24 11:30             ` Paul Brook
2010-02-24 11:46               ` Michael S. Tsirkin
2010-02-24 12:26                 ` Paul Brook
2010-02-24 12:40                   ` Michael S. Tsirkin
2010-02-24 15:16                 ` Anthony Liguori
2010-02-24 14:51           ` Anthony Liguori
2010-01-11 17:23 ` [Qemu-devel] [PATCH-RFC 11/13] vhost net support Michael S. Tsirkin
2010-01-12 22:45   ` [Qemu-devel] " Anthony Liguori

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).