qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 2/3] virtio network device
@ 2007-12-04 21:54 Anthony Liguori
  2007-12-04 22:12 ` Anthony Liguori
  2007-12-04 23:49 ` [Qemu-devel] " Dor Laor
  0 siblings, 2 replies; 17+ messages in thread
From: Anthony Liguori @ 2007-12-04 21:54 UTC (permalink / raw)
  To: qemu-devel, Avi Kivity, Dor Laor, Rusty Russell

[-- Attachment #1: Type: text/plain, Size: 2 bytes --]




[-- Attachment #2: virtio-net.diff --]
[-- Type: text/x-patch, Size: 7193 bytes --]

Subject: [PATCH 2/3] virtio network device
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Avi Kivity <avi@qumranet.com>
Cc: Dor Laor <dor.laor@qumranet.com>

This patch implements the backend support for the virtio network device.  The
device is optimized for virtualized environments by limiting the number of
guest=>host transitions per-packet.  In the best case, the number of
transitions per-packet is < 1.

With some further optimizations, I have been able to obtain 1.5gbit/sec
host=>guest with this driver (compared to the 90mbit/sec from the rtl8139
card).  This requires additional patches not present in this series.

Index: qemu/Makefile.target
===================================================================
--- qemu.orig/Makefile.target	2007-12-04 14:15:20.000000000 -0600
+++ qemu/Makefile.target	2007-12-04 14:43:57.000000000 -0600
@@ -436,7 +436,7 @@
 VL_OBJS += rtl8139.o
 
 # virtio devices
-VL_OBJS += virtio.o
+VL_OBJS += virtio.o virtio-net.o
 
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
Index: qemu/hw/pci.c
===================================================================
--- qemu.orig/hw/pci.c	2007-12-04 14:15:20.000000000 -0600
+++ qemu/hw/pci.c	2007-12-04 14:44:06.000000000 -0600
@@ -25,6 +25,7 @@
 #include "pci.h"
 #include "console.h"
 #include "net.h"
+#include "pc.h"
 
 //#define DEBUG_PCI
 
@@ -591,9 +592,11 @@
         pci_rtl8139_init(bus, nd, devfn);
     } else if (strcmp(nd->model, "pcnet") == 0) {
         pci_pcnet_init(bus, nd, devfn);
+    } else if (strcmp(nd->model, "virtio") == 0) {
+	virtio_net_init(bus, nd, devfn);
     } else if (strcmp(nd->model, "?") == 0) {
         fprintf(stderr, "qemu: Supported PCI NICs: i82551 i82557b i82559er"
-                        " ne2k_pci pcnet rtl8139\n");
+                        " ne2k_pci pcnet rtl8139 virtio\n");
         exit (1);
     } else {
         fprintf(stderr, "qemu: Unsupported NIC: %s\n", nd->model);
Index: qemu/hw/virtio-net.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ qemu/hw/virtio-net.c	2007-12-04 14:17:37.000000000 -0600
@@ -0,0 +1,178 @@
+/*
+ * Virtio Network Device
+ *
+ * Copyright IBM, Corp. 2007
+ *
+ * Authors:
+ *  Anthony Liguori   <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "virtio.h"
+#include "net.h"
+#include "pc.h"
+
+/* from Linux's virtio_net.h */
+
+/* The ID for virtio_net */
+#define VIRTIO_ID_NET	1
+
+/* The feature bitmap for virtio net */
+#define VIRTIO_NET_F_NO_CSUM	0
+#define VIRTIO_NET_F_TSO4	1
+#define VIRTIO_NET_F_UFO	2
+#define VIRTIO_NET_F_TSO4_ECN	3
+#define VIRTIO_NET_F_TSO6	4
+#define VIRTIO_NET_F_MAC	5
+
+/* The config defining mac address (6 bytes) */
+struct virtio_net_config
+{
+    uint8_t mac[6];
+} __attribute__((packed));
+
+/* This is the first element of the scatter-gather list.  If you don't
+ * specify GSO or CSUM features, you can simply ignore the header. */
+struct virtio_net_hdr
+{
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM	1	// Use csum_start, csum_offset
+    uint8_t flags;
+#define VIRTIO_NET_HDR_GSO_NONE		0	// Not a GSO frame
+#define VIRTIO_NET_HDR_GSO_TCPV4	1	// GSO frame, IPv4 TCP (TSO)
+/* FIXME: Do we need this?  If they said they can handle ECN, do they care? */
+#define VIRTIO_NET_HDR_GSO_TCPV4_ECN	2	// GSO frame, IPv4 TCP w/ ECN
+#define VIRTIO_NET_HDR_GSO_UDP		3	// GSO frame, IPv4 UDP (UFO)
+#define VIRTIO_NET_HDR_GSO_TCPV6	4	// GSO frame, IPv6 TCP
+    uint8_t gso_type;
+    uint16_t gso_size;
+    uint16_t csum_start;
+    uint16_t csum_offset;
+};
+
+typedef struct VirtIONet
+{
+    VirtIODevice vdev;
+    uint8_t mac[6];
+    VirtQueue *rx_vq;
+    VirtQueue *tx_vq;
+    VLANClientState *vc;
+    int can_receive;
+} VirtIONet;
+
+static VirtIONet *to_virtio_net(VirtIODevice *vdev)
+{
+    return (VirtIONet *)vdev;
+}
+
+static void virtio_net_update_config(VirtIODevice *vdev, uint8_t *config)
+{
+    VirtIONet *n = to_virtio_net(vdev);
+    struct virtio_net_config netcfg;
+
+    memcpy(netcfg.mac, n->mac, 6);
+    memcpy(config, &netcfg, sizeof(netcfg));
+}
+
+static uint32_t virtio_net_get_features(VirtIODevice *vdev)
+{
+    return (1 << VIRTIO_NET_F_MAC);
+}
+
+/* RX */
+
+static void virtio_net_handle_rx(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtIONet *n = to_virtio_net(vdev);
+    n->can_receive = 1;
+}
+
+static int virtio_net_can_receive(void *opaque)
+{
+    VirtIONet *n = opaque;
+
+    return (n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK) && n->can_receive;
+}
+
+static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
+{
+    VirtIONet *n = opaque;
+    VirtQueueElement elem;
+    struct virtio_net_hdr *hdr;
+    int offset, i;
+
+    /* FIXME: the drivers really need to set their status better */
+    if (n->rx_vq->vring.avail == NULL) {
+	n->can_receive = 0;
+	return;
+    }
+
+    if (virtqueue_pop(n->rx_vq, &elem) == 0) {
+	/* wait until the guest adds some rx bufs */
+	n->can_receive = 0;
+	return;
+    }
+
+    hdr = (void *)elem.in_sg[0].iov_base;
+    hdr->flags = 0;
+    hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+    /* copy in packet.  ugh */
+    offset = 0;
+    i = 1;
+    while (offset < size && i < elem.in_num) {
+	int len = MIN(elem.in_sg[i].iov_len, size - offset);
+	memcpy(elem.in_sg[i].iov_base, buf + offset, len);
+	offset += len;
+	i++;
+    }
+
+    /* signal other side */
+    virtqueue_push(n->rx_vq, &elem, sizeof(*hdr) + offset);
+    virtio_notify(&n->vdev, n->rx_vq);
+}
+
+/* TX */
+static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtIONet *n = to_virtio_net(vdev);
+    VirtQueueElement elem;
+
+    while (virtqueue_pop(vq, &elem)) {
+	int i;
+	size_t len = 0;
+
+	/* ignore the header for now */
+	for (i = 1; i < elem.out_num; i++) {
+	    qemu_send_packet(n->vc, elem.out_sg[i].iov_base,
+			     elem.out_sg[i].iov_len);
+	    len += elem.out_sg[i].iov_len;
+	}
+
+	virtqueue_push(vq, &elem, sizeof(struct virtio_net_hdr) + len);
+	virtio_notify(&n->vdev, vq);
+    }
+}
+
+void *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn)
+{
+    VirtIONet *n;
+
+    n = (VirtIONet *)virtio_init_pci(bus, "virtio-net", 6900, 0x1000,
+				     0, VIRTIO_ID_NET,
+				     0x02, 0x00, 0x00,
+				     6, sizeof(VirtIONet));
+
+    n->vdev.update_config = virtio_net_update_config;
+    n->vdev.get_features = virtio_net_get_features;
+    n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx);
+    n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx);
+    n->can_receive = 0;
+    memcpy(n->mac, nd->macaddr, 6);
+    n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive,
+				 virtio_net_can_receive, n);
+
+    return &n->vdev;
+}
Index: qemu/hw/pc.h
===================================================================
--- qemu.orig/hw/pc.h	2007-12-04 14:15:20.000000000 -0600
+++ qemu/hw/pc.h	2007-12-04 14:43:57.000000000 -0600
@@ -142,4 +142,9 @@
 
 void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd);
 
+/* virtio-net.c */
+
+void *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn);
+
+
 #endif

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] [PATCH 2/3] virtio network device
  2007-12-04 21:54 [Qemu-devel] [PATCH 2/3] virtio network device Anthony Liguori
@ 2007-12-04 22:12 ` Anthony Liguori
  2007-12-04 23:49 ` [Qemu-devel] " Dor Laor
  1 sibling, 0 replies; 17+ messages in thread
From: Anthony Liguori @ 2007-12-04 22:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: Rusty Russell

Anthony Liguori wrote:
>
>
> Subject: [PATCH 2/3] virtio network device
> Cc: Rusty Russell <rusty@rustcorp.com.au>
> Cc: Avi Kivity <avi@qumranet.com>
> Cc: Dor Laor <dor.laor@qumranet.com>
>
> This patch implements the backend support for the virtio network device.  The
> device is optimized for virtualized environments by limiting the number of
> guest=>host transitions per-packet.  In the best case, the number of
> transitions per-packet is < 1.
>
> With some further optimizations, I have been able to obtain 1.5gbit/sec
> host=>guest with this driver (compared to the 90mbit/sec from the rtl8139
> card).  This requires additional patches not present in this series.

Let me qualify that these numbers are with KVM.  I haven't done 
performance testing using CPU emulation.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-04 21:54 [Qemu-devel] [PATCH 2/3] virtio network device Anthony Liguori
  2007-12-04 22:12 ` Anthony Liguori
@ 2007-12-04 23:49 ` Dor Laor
  2007-12-05 17:18   ` Anthony Liguori
  1 sibling, 1 reply; 17+ messages in thread
From: Dor Laor @ 2007-12-04 23:49 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Rusty Russell, qemu-devel

[-- Attachment #1: Type: text/plain, Size: 3065 bytes --]

Anthony Liguori wrote:

Index: qemu/hw/virtio-net.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ qemu/hw/virtio-net.c	2007-12-04 14:17:37.000000000 -0600

+
+static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
+{
+    VirtIONet *n = opaque;
+    VirtQueueElement elem;
+    struct virtio_net_hdr *hdr;
+    int offset, i;
+
+    /* FIXME: the drivers really need to set their status better */
+    if (n->rx_vq->vring.avail == NULL) {
+	n->can_receive = 0;
+	return;
+    }
+
+    if (virtqueue_pop(n->rx_vq, &elem) == 0) {
+	/* wait until the guest adds some rx bufs */
+	n->can_receive = 0;
+	return;
+    }
+
+    hdr = (void *)elem.in_sg[0].iov_base;
+    hdr->flags = 0;
+    hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+    /* copy in packet.  ugh */
+    offset = 0;
+    i = 1;
+    while (offset < size && i < elem.in_num) {
+	int len = MIN(elem.in_sg[i].iov_len, size - offset);
+	memcpy(elem.in_sg[i].iov_base, buf + offset, len);



Actually according to qemu's standard, one should use cpu_physical_memory_write/
cpu_physical_memory_read functions.
This is true also for reading the ring values.




+	offset += len;
+	i++;
+    }
+
+    /* signal other side */
+    virtqueue_push(n->rx_vq, &elem, sizeof(*hdr) + offset);
+    virtio_notify(&n->vdev, n->rx_vq);
+}
+
+/* TX */
+static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtIONet *n = to_virtio_net(vdev);
+    VirtQueueElement elem;
+
+    while (virtqueue_pop(vq, &elem)) {
+	int i;
+	size_t len = 0;
+
+	/* ignore the header for now */
+	for (i = 1; i < elem.out_num; i++) {
+	    qemu_send_packet(n->vc, elem.out_sg[i].iov_base,
+			     elem.out_sg[i].iov_len);
+	    len += elem.out_sg[i].iov_len;
+	}
+
+	virtqueue_push(vq, &elem, sizeof(struct virtio_net_hdr) + len);
+	virtio_notify(&n->vdev, vq);
+    }


The virtio_notify should be left out of the while loop to minimize 
irq injection.


+}
+
+void *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn)
+{
+    VirtIONet *n;
+
+    n = (VirtIONet *)virtio_init_pci(bus, "virtio-net", 6900, 0x1000,
+				     0, VIRTIO_ID_NET,
+				     0x02, 0x00, 0x00,
+				     6, sizeof(VirtIONet));
+
+    n->vdev.update_config = virtio_net_update_config;
+    n->vdev.get_features = virtio_net_get_features;
+    n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx);
+    n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx);
+    n->can_receive = 0;
+    memcpy(n->mac, nd->macaddr, 6);
+    n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive,
+				 virtio_net_can_receive, n);
+
+    return &n->vdev;
+}
Index: qemu/hw/pc.h
===================================================================
--- qemu.orig/hw/pc.h	2007-12-04 14:15:20.000000000 -0600
+++ qemu/hw/pc.h	2007-12-04 14:43:57.000000000 -0600
@@ -142,4 +142,9 @@
 
 void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd);
 
+/* virtio-net.c */
+
+void *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn);
+
+
 #endif


[-- Attachment #2: Type: text/html, Size: 3567 bytes --]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-04 23:49 ` [Qemu-devel] " Dor Laor
@ 2007-12-05 17:18   ` Anthony Liguori
  2007-12-05 17:44     ` Paul Brook
  0 siblings, 1 reply; 17+ messages in thread
From: Anthony Liguori @ 2007-12-05 17:18 UTC (permalink / raw)
  To: dor.laor; +Cc: Rusty Russell, qemu-devel

Dor Laor wrote:
> Anthony Liguori wrote:
> Index: qemu/hw/virtio-net.c
> ===================================================================
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ qemu/hw/virtio-net.c	2007-12-04 14:17:37.000000000 -0600
>
> +
> +static void virtio_net_receive(void *opaque, const uint8_t *buf, int size)
> +{
> +    VirtIONet *n = opaque;
> +    VirtQueueElement elem;
> +    struct virtio_net_hdr *hdr;
> +    int offset, i;
> +
> +    /* FIXME: the drivers really need to set their status better */
> +    if (n->rx_vq->vring.avail == NULL) {
> +	n->can_receive = 0;
> +	return;
> +    }
> +
> +    if (virtqueue_pop(n->rx_vq, &elem) == 0) {
> +	/* wait until the guest adds some rx bufs */
> +	n->can_receive = 0;
> +	return;
> +    }
> +
> +    hdr = (void *)elem.in_sg[0].iov_base;
> +    hdr->flags = 0;
> +    hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
> +
> +    /* copy in packet.  ugh */
> +    offset = 0;
> +    i = 1;
> +    while (offset < size && i < elem.in_num) {
> +	int len = MIN(elem.in_sg[i].iov_len, size - offset);
> +	memcpy(elem.in_sg[i].iov_base, buf + offset, len);
>
>
>
> Actually according to qemu's standard, one should use cpu_physical_memory_write/
> cpu_physical_memory_read functions.
> This is true also for reading the ring values.
>   

Yes, and unfortunately, cpu_physical_memory_{read,write} are copy 
interfaces.  We really don't want that for high speed I/O.

The only down-sides of not using cpu_physical_memory_{read,write} is 
that writes aren't dispatched to MMIO functions and dirty updating isn't 
handled automatically.  I do need to go through an audit the dirty 
updating.  It may be possible to do a 1-time check of MMIO memory and 
then have a fast and slow path.  I'm not sure how much it matter honestly.

Regards,

Anthony Liguori

>
>
> +	offset += len;
> +	i++;
> +    }
> +
> +    /* signal other side */
> +    virtqueue_push(n->rx_vq, &elem, sizeof(*hdr) + offset);
> +    virtio_notify(&n->vdev, n->rx_vq);
> +}
> +
> +/* TX */
> +static void virtio_net_handle_tx(VirtIODevice *vdev, VirtQueue *vq)
> +{
> +    VirtIONet *n = to_virtio_net(vdev);
> +    VirtQueueElement elem;
> +
> +    while (virtqueue_pop(vq, &elem)) {
> +	int i;
> +	size_t len = 0;
> +
> +	/* ignore the header for now */
> +	for (i = 1; i < elem.out_num; i++) {
> +	    qemu_send_packet(n->vc, elem.out_sg[i].iov_base,
> +			     elem.out_sg[i].iov_len);
> +	    len += elem.out_sg[i].iov_len;
> +	}
> +
> +	virtqueue_push(vq, &elem, sizeof(struct virtio_net_hdr) + len);
> +	virtio_notify(&n->vdev, vq);
> +    }
>
>
> The virtio_notify should be left out of the while loop to minimize 
> irq injection.
>
>
> +}
> +
> +void *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn)
> +{
> +    VirtIONet *n;
> +
> +    n = (VirtIONet *)virtio_init_pci(bus, "virtio-net", 6900, 0x1000,
> +				     0, VIRTIO_ID_NET,
> +				     0x02, 0x00, 0x00,
> +				     6, sizeof(VirtIONet));
> +
> +    n->vdev.update_config = virtio_net_update_config;
> +    n->vdev.get_features = virtio_net_get_features;
> +    n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx);
> +    n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx);
> +    n->can_receive = 0;
> +    memcpy(n->mac, nd->macaddr, 6);
> +    n->vc = qemu_new_vlan_client(nd->vlan, virtio_net_receive,
> +				 virtio_net_can_receive, n);
> +
> +    return &n->vdev;
> +}
> Index: qemu/hw/pc.h
> ===================================================================
> --- qemu.orig/hw/pc.h	2007-12-04 14:15:20.000000000 -0600
> +++ qemu/hw/pc.h	2007-12-04 14:43:57.000000000 -0600
> @@ -142,4 +142,9 @@
>
>  void isa_ne2000_init(int base, qemu_irq irq, NICInfo *nd);
>
> +/* virtio-net.c */
> +
> +void *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn);
> +
> +
>  #endif
>   

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-05 17:18   ` Anthony Liguori
@ 2007-12-05 17:44     ` Paul Brook
  2007-12-05 20:20       ` Anthony Liguori
  0 siblings, 1 reply; 17+ messages in thread
From: Paul Brook @ 2007-12-05 17:44 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, Rusty Russell

> > Actually according to qemu's standard, one should use
> > cpu_physical_memory_write/ cpu_physical_memory_read functions.
> > This is true also for reading the ring values.
>
> Yes, and unfortunately, cpu_physical_memory_{read,write} are copy
> interfaces.  We really don't want that for high speed I/O.

I really don't like doing direct access to guest ram without implementing a 
proper API for zero-copy/scatter-gather access. There was a list thread about 
this not so long ago.

Paul

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-05 17:44     ` Paul Brook
@ 2007-12-05 20:20       ` Anthony Liguori
  2007-12-06  9:27         ` Jamie Lokier
  2007-12-08 13:22         ` Paul Brook
  0 siblings, 2 replies; 17+ messages in thread
From: Anthony Liguori @ 2007-12-05 20:20 UTC (permalink / raw)
  To: Paul Brook; +Cc: Rusty Russell, qemu-devel

Paul Brook wrote:
>>> Actually according to qemu's standard, one should use
>>> cpu_physical_memory_write/ cpu_physical_memory_read functions.
>>> This is true also for reading the ring values.
>>>       
>> Yes, and unfortunately, cpu_physical_memory_{read,write} are copy
>> interfaces.  We really don't want that for high speed I/O.
>>     
>
> I really don't like doing direct access to guest ram without implementing a 
> proper API for zero-copy/scatter-gather access. There was a list thread about 
> this not so long ago.
>   

I agree that we need a proper API for sg ram access.  I'm going to look 
into that real soon since it's necessary to optimize the network/disk 
transports.

virtio makes things a bit trickier though.  There's a shared ring queue 
between the host and guest.  The ring queue is lock-less and depends on 
the ability to atomically increment ring queue indices to be SMP safe.  
Using a copy-API wouldn't be a problem for QEMU since the host and guest 
are always running in lock-step.  A copy API is actually needed to deal 
with differing host/guest alignment and endianness.

Once you introduce KVM though, this is no longer true since KVM supports 
true SMP.  The solution may be to implement some sort of 
atomic_increment function and then have that use a if (kvm) guard to do 
a direct access verses a copy.

Regards,

Anthony Liguori

> Paul
>   

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-05 20:20       ` Anthony Liguori
@ 2007-12-06  9:27         ` Jamie Lokier
  2007-12-08 13:22         ` Paul Brook
  1 sibling, 0 replies; 17+ messages in thread
From: Jamie Lokier @ 2007-12-06  9:27 UTC (permalink / raw)
  To: qemu-devel; +Cc: Rusty Russell, Paul Brook

Anthony Liguori wrote:
> virtio makes things a bit trickier though.  There's a shared ring queue 
> between the host and guest.

Does this work when the host and guest are different architectures, or
different word sizes (x86/x86_64)?

-- Jamie

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-05 20:20       ` Anthony Liguori
  2007-12-06  9:27         ` Jamie Lokier
@ 2007-12-08 13:22         ` Paul Brook
  2007-12-08 14:09           ` Jamie Lokier
  2007-12-08 21:59           ` Anthony Liguori
  1 sibling, 2 replies; 17+ messages in thread
From: Paul Brook @ 2007-12-08 13:22 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Rusty Russell, qemu-devel

> virtio makes things a bit trickier though.  There's a shared ring queue
> between the host and guest.  The ring queue is lock-less and depends on
> the ability to atomically increment ring queue indices to be SMP safe.
> Using a copy-API wouldn't be a problem for QEMU since the host and guest
> are always running in lock-step.  A copy API is actually needed to deal
> with differing host/guest alignment and endianness.

That seems a rather poor design choice, as many architectures don't have an 
atomic increment instruction. Oh well.

Paul

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-08 13:22         ` Paul Brook
@ 2007-12-08 14:09           ` Jamie Lokier
  2007-12-08 16:45             ` Paul Brook
  2007-12-08 21:59           ` Anthony Liguori
  1 sibling, 1 reply; 17+ messages in thread
From: Jamie Lokier @ 2007-12-08 14:09 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, Rusty Russell

Paul Brook wrote:
> > virtio makes things a bit trickier though.  There's a shared ring queue
> > between the host and guest.  The ring queue is lock-less and depends on
> > the ability to atomically increment ring queue indices to be SMP safe.
> > Using a copy-API wouldn't be a problem for QEMU since the host and guest
> > are always running in lock-step.  A copy API is actually needed to deal
> > with differing host/guest alignment and endianness.
> 
> That seems a rather poor design choice, as many architectures don't have an 
> atomic increment instruction. Oh well.

Most have compare-and-swap or load-locked/store-conditional
instructions, though, which can be used to implement atomic increment.

-- Jamie

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-08 14:09           ` Jamie Lokier
@ 2007-12-08 16:45             ` Paul Brook
  2007-12-08 19:52               ` Blue Swirl
  2007-12-08 21:55               ` Jamie Lokier
  0 siblings, 2 replies; 17+ messages in thread
From: Paul Brook @ 2007-12-08 16:45 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, Rusty Russell

On Saturday 08 December 2007, Jamie Lokier wrote:
> Paul Brook wrote:
> > > virtio makes things a bit trickier though.  There's a shared ring queue
> > > between the host and guest.  The ring queue is lock-less and depends on
> > > the ability to atomically increment ring queue indices to be SMP safe.
> > > Using a copy-API wouldn't be a problem for QEMU since the host and
> > > guest are always running in lock-step.  A copy API is actually needed
> > > to deal with differing host/guest alignment and endianness.
> >
> > That seems a rather poor design choice, as many architectures don't have
> > an atomic increment instruction. Oh well.
>
> Most have compare-and-swap or load-locked/store-conditional
> instructions, though, which can be used to implement atomic increment.

Yes, but your "hardware" implementation has to make sure it interacts with 
those properly. It's certainly possible to implement lockless lists without 
requiring atomic increment. Most high-end hardware manages it and that 
doesn't even have coherent DMA.

Paul

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-08 16:45             ` Paul Brook
@ 2007-12-08 19:52               ` Blue Swirl
  2007-12-08 21:55               ` Jamie Lokier
  1 sibling, 0 replies; 17+ messages in thread
From: Blue Swirl @ 2007-12-08 19:52 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, Rusty Russell

On 12/8/07, Paul Brook <paul@codesourcery.com> wrote:
> On Saturday 08 December 2007, Jamie Lokier wrote:
> > Paul Brook wrote:
> > > > virtio makes things a bit trickier though.  There's a shared ring queue
> > > > between the host and guest.  The ring queue is lock-less and depends on
> > > > the ability to atomically increment ring queue indices to be SMP safe.
> > > > Using a copy-API wouldn't be a problem for QEMU since the host and
> > > > guest are always running in lock-step.  A copy API is actually needed
> > > > to deal with differing host/guest alignment and endianness.
> > >
> > > That seems a rather poor design choice, as many architectures don't have
> > > an atomic increment instruction. Oh well.
> >
> > Most have compare-and-swap or load-locked/store-conditional
> > instructions, though, which can be used to implement atomic increment.
>
> Yes, but your "hardware" implementation has to make sure it interacts with
> those properly. It's certainly possible to implement lockless lists without
> requiring atomic increment. Most high-end hardware manages it and that
> doesn't even have coherent DMA.

If we start adding locks for IO, could we use the same locking model
more widely or make it generic so that it would support a SMP host as
well?

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-08 16:45             ` Paul Brook
  2007-12-08 19:52               ` Blue Swirl
@ 2007-12-08 21:55               ` Jamie Lokier
  2007-12-08 22:02                 ` Anthony Liguori
  1 sibling, 1 reply; 17+ messages in thread
From: Jamie Lokier @ 2007-12-08 21:55 UTC (permalink / raw)
  To: Paul Brook; +Cc: Anthony Liguori, Rusty Russell, qemu-devel

Paul Brook wrote:
> > > > virtio makes things a bit trickier though.  There's a shared ring queue
> > > > between the host and guest.  The ring queue is lock-less and depends on
> > > > the ability to atomically increment ring queue indices to be SMP safe.
> > > > Using a copy-API wouldn't be a problem for QEMU since the host and
> > > > guest are always running in lock-step.  A copy API is actually needed
> > > > to deal with differing host/guest alignment and endianness.
> > >
> > > That seems a rather poor design choice, as many architectures don't have
> > > an atomic increment instruction. Oh well.
> >
> > Most have compare-and-swap or load-locked/store-conditional
> > instructions, though, which can be used to implement atomic increment.
> 
> Yes, but your "hardware" implementation has to make sure it interacts with 
> those properly. It's certainly possible to implement lockless lists without 
> requiring atomic increment. Most high-end hardware manages it and that 
> doesn't even have coherent DMA.

I'm inclined to agree that a lockless structure (including not using
an atomic op) for the most commonly used paths, such as appending to a
ring, would be better.  It increases the potential portability for
emulation/virtualisation across all architectures now and in the
future, and it would almost certainly perform better on architectures
other than x86.

However, occasionally you need to enter the host for synchronisation.
E.g. when a ring is empty/full.

In that case, sometimes using atomic ops in the way that futexes are
used in Linux/Glibc can optimise the details of those transitions, but
it would be best if they were optional optimisations, for
cross-platform, cross-architure portability.

There's a particularly awkward problem when taking an x86 atomic op in
the guest, and generating code on the non-x86 host which doesn't have
any equivalent op.  What's the right thing to do?

Since virtio is driven by virtualisation projects rather than
emulation, is it possible this hasn't been thought of at all, making
virtio unusable for cross-architecture emulation?  That would be
really unfortunate.

-- Jamie

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-08 13:22         ` Paul Brook
  2007-12-08 14:09           ` Jamie Lokier
@ 2007-12-08 21:59           ` Anthony Liguori
  1 sibling, 0 replies; 17+ messages in thread
From: Anthony Liguori @ 2007-12-08 21:59 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, Rusty Russell

Paul Brook wrote:
>> virtio makes things a bit trickier though.  There's a shared ring queue
>> between the host and guest.  The ring queue is lock-less and depends on
>> the ability to atomically increment ring queue indices to be SMP safe.
>> Using a copy-API wouldn't be a problem for QEMU since the host and guest
>> are always running in lock-step.  A copy API is actually needed to deal
>> with differing host/guest alignment and endianness.
>>     
>
> That seems a rather poor design choice, as many architectures don't have an 
> atomic increment instruction. Oh well.
>   

Sorry, I should have been more clear.  An atomic increment isn't needed, 
just an atomic store.

Regards,

Anthony Liguori

> Paul
>
>
>
>   

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-08 21:55               ` Jamie Lokier
@ 2007-12-08 22:02                 ` Anthony Liguori
  2007-12-12  1:24                   ` Rusty Russell
  0 siblings, 1 reply; 17+ messages in thread
From: Anthony Liguori @ 2007-12-08 22:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony Liguori, Rusty Russell, Paul Brook

Jamie Lokier wrote:
> Paul Brook wrote:
>   
>>>>> virtio makes things a bit trickier though.  There's a shared ring queue
>>>>> between the host and guest.  The ring queue is lock-less and depends on
>>>>> the ability to atomically increment ring queue indices to be SMP safe.
>>>>> Using a copy-API wouldn't be a problem for QEMU since the host and
>>>>> guest are always running in lock-step.  A copy API is actually needed
>>>>> to deal with differing host/guest alignment and endianness.
>>>>>           
>>>> That seems a rather poor design choice, as many architectures don't have
>>>> an atomic increment instruction. Oh well.
>>>>         
>>> Most have compare-and-swap or load-locked/store-conditional
>>> instructions, though, which can be used to implement atomic increment.
>>>       
>> Yes, but your "hardware" implementation has to make sure it interacts with 
>> those properly. It's certainly possible to implement lockless lists without 
>> requiring atomic increment. Most high-end hardware manages it and that 
>> doesn't even have coherent DMA.
>>     
>
> I'm inclined to agree that a lockless structure (including not using
> an atomic op) for the most commonly used paths, such as appending to a
> ring, would be better.  It increases the potential portability for
> emulation/virtualisation across all architectures now and in the
> future, and it would almost certainly perform better on architectures
> other than x86.
>
> However, occasionally you need to enter the host for synchronisation.
> E.g. when a ring is empty/full.
>
> In that case, sometimes using atomic ops in the way that futexes are
> used in Linux/Glibc can optimise the details of those transitions, but
> it would be best if they were optional optimisations, for
> cross-platform, cross-architure portability.
>
> There's a particularly awkward problem when taking an x86 atomic op in
> the guest, and generating code on the non-x86 host which doesn't have
> any equivalent op.  What's the right thing to do?
>
> Since virtio is driven by virtualisation projects rather than
> emulation, is it possible this hasn't been thought of at all, making
> virtio unusable for cross-architecture emulation?  That would be
> really unfortunate.
>   

virtio has been designed for virtualization, yes.  There aren't really 
restrictions that prevent its use when doing cross-architecture 
emulation (yet) with QEMU.

If QEMU ever got true SMP support, then virtio would not work as it 
requires 16-bit atomic writes which AFAIK is not possible on a number of 
non-x86 architectures.

Regards,

Anthony Liguori

> -- Jamie
>
>
>
>   

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-08 22:02                 ` Anthony Liguori
@ 2007-12-12  1:24                   ` Rusty Russell
  2007-12-12  1:40                     ` Anthony Liguori
  0 siblings, 1 reply; 17+ messages in thread
From: Rusty Russell @ 2007-12-12  1:24 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Anthony Liguori, qemu-devel, Paul Brook

On Sunday 09 December 2007 09:02:48 Anthony Liguori wrote:
> If QEMU ever got true SMP support, then virtio would not work as it
> requires 16-bit atomic writes which AFAIK is not possible on a number of
> non-x86 architectures.

Hmm?  Where is this requirement coming from?

I think everyone should stop using the word "atomic" in virtio discussions; 
it's confusing.

Rusty.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-12  1:24                   ` Rusty Russell
@ 2007-12-12  1:40                     ` Anthony Liguori
  2007-12-18  2:31                       ` Rusty Russell
  0 siblings, 1 reply; 17+ messages in thread
From: Anthony Liguori @ 2007-12-12  1:40 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Anthony Liguori, qemu-devel, Paul Brook

Rusty Russell wrote:
> On Sunday 09 December 2007 09:02:48 Anthony Liguori wrote:
>   
>> If QEMU ever got true SMP support, then virtio would not work as it
>> requires 16-bit atomic writes which AFAIK is not possible on a number of
>> non-x86 architectures.
>>     
>
> Hmm?  Where is this requirement coming from?
>
> I think everyone should stop using the word "atomic" in virtio discussions; 
> it's confusing.
>   

The virtio ring queue indices are 16-bit and are readable to one end 
while writable on the other end.  To ensure that this can be done in a 
lock-less way, it's necessary to atomically update the index.  Atomic is 
the right word here because if the 16-bit write gets converted into two 
8-bit writes, then very bad things could happen with SMP.

Regards,

Anthony Liguori

> Rusty.
>
>   

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [Qemu-devel] Re: [PATCH 2/3] virtio network device
  2007-12-12  1:40                     ` Anthony Liguori
@ 2007-12-18  2:31                       ` Rusty Russell
  0 siblings, 0 replies; 17+ messages in thread
From: Rusty Russell @ 2007-12-18  2:31 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Anthony Liguori, qemu-devel, Paul Brook

On Wednesday 12 December 2007 12:40:43 Anthony Liguori wrote:
> Rusty Russell wrote:
> > On Sunday 09 December 2007 09:02:48 Anthony Liguori wrote:
> >> If QEMU ever got true SMP support, then virtio would not work as it
> >> requires 16-bit atomic writes which AFAIK is not possible on a number of
> >> non-x86 architectures.
> >
> > Hmm?  Where is this requirement coming from?
> >
> > I think everyone should stop using the word "atomic" in virtio
> > discussions; it's confusing.
>
> The virtio ring queue indices are 16-bit and are readable to one end
> while writable on the other end.  To ensure that this can be done in a
> lock-less way, it's necessary to atomically update the index.  Atomic is
> the right word here because if the 16-bit write gets converted into two
> 8-bit writes, then very bad things could happen with SMP.

Of course, but that's insane.  Your assertion that it's not possible on a 
number of non-x86 architectures is what I'm questioning here.  You're 
confusing the inability of architectures to atomically *modify* a 16 bit 
value and our requirement, where even if you found an architecture which 
couldn't do 16 bit writes, you can do it as a 32 bit write.

Hope that clarifies,
Rusty.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2007-12-18  2:32 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-12-04 21:54 [Qemu-devel] [PATCH 2/3] virtio network device Anthony Liguori
2007-12-04 22:12 ` Anthony Liguori
2007-12-04 23:49 ` [Qemu-devel] " Dor Laor
2007-12-05 17:18   ` Anthony Liguori
2007-12-05 17:44     ` Paul Brook
2007-12-05 20:20       ` Anthony Liguori
2007-12-06  9:27         ` Jamie Lokier
2007-12-08 13:22         ` Paul Brook
2007-12-08 14:09           ` Jamie Lokier
2007-12-08 16:45             ` Paul Brook
2007-12-08 19:52               ` Blue Swirl
2007-12-08 21:55               ` Jamie Lokier
2007-12-08 22:02                 ` Anthony Liguori
2007-12-12  1:24                   ` Rusty Russell
2007-12-12  1:40                     ` Anthony Liguori
2007-12-18  2:31                       ` Rusty Russell
2007-12-08 21:59           ` Anthony Liguori

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).