[RFC] virtio-blk PCI backend

public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed

* [RFC] virtio-blk PCI backend
@ 2007-11-08  2:51 Anthony Liguori
       [not found] ` <11944902733951-git-send-email-aliguori-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Anthony Liguori @ 2007-11-08  2:51 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f; +Cc: Anthony Liguori

This still needs quite a lot of work but I wanted to post it for reference.

Regards,

Anthony Liguori

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 65f449e..3032337 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -448,6 +448,8 @@ VL_OBJS += rtl8139.o
 # PCI Hypercall
 VL_OBJS+= hypercall.o
 
+VL_OBJS += virtio.o
+
 ifeq ($(TARGET_BASE_ARCH), i386)
 # Hardware support
 VL_OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o $(AUDIODRV)
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 8aae814..9b17938 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -943,6 +943,11 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size, int boot_device,
 #ifdef USE_HYPERCALL
     pci_hypercall_init(pci_bus);
 #endif
+
+    if (1) {
+	virtio_init_pci(pci_bus, "virtio", 0x5002, 0x2258);
+    }
+
     if (pci_enabled) {
         pci_piix3_ide_init(pci_bus, bs_table, piix3_devfn + 1, i8259);
     } else {
diff --git a/qemu/hw/virtio.c b/qemu/hw/virtio.c
new file mode 100644
index 0000000..94efe5a
--- /dev/null
+++ b/qemu/hw/virtio.c
@@ -0,0 +1,486 @@
+#include "vl.h"
+#include <linux/virtio_pci.h>
+#include <err.h>
+
+#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0)
+
+#define wmb() asm volatile("sfence" ::: "memory")
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT	1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE	2
+
+/* This means don't notify other side when buffer added. */
+#define VRING_USED_F_NO_NOTIFY	1
+/* This means don't interrupt guest when buffer consumed. */
+#define VRING_AVAIL_F_NO_INTERRUPT	1
+
+typedef struct VRingDesc
+{
+    uint64_t addr;
+    uint32_t len;
+    uint16_t flags;
+    uint16_t next;
+} VRingDesc;
+
+typedef struct VRingAvail
+{
+    uint16_t flags;
+    uint16_t idx;
+    uint16_t ring[0];
+} VRingAvail;
+
+typedef struct VRingUsedElem
+{
+    uint32_t id;
+    uint32_t len;
+} VRingUsedElem;
+
+typedef struct VRingUsed
+{
+    uint16_t flags;
+    uint16_t idx;
+    VRingUsedElem ring[0];
+} VRingUsed;
+
+typedef struct VRing
+{
+    unsigned int num;
+    VRingDesc *desc;
+    VRingAvail *avail;
+    VRingUsed *used;
+} VRing;
+
+struct VirtQueue
+{
+    VRing vring;
+    uint32_t pfn;
+    uint16_t last_avail_idx;
+    void (*handle_output)(void *opaque, VirtIODevice *vdev, VirtQueue *vq);
+    void *opaque;
+};
+
+#define VIRTIO_PCI_QUEUE_MAX	16
+
+struct VirtIODevice
+{
+    PCIDevice pci_dev;
+    const char *name;
+    int irq;
+    uint32_t addr;
+    uint16_t vendor;
+    uint16_t device;
+    uint8_t status;
+    uint8_t isr;
+    uint16_t queue_sel;
+    uint32_t features;
+    uint32_t (*get_config)(void *opaque, unsigned offset);
+    void (*set_config)(void *opaque, unsigned offset, uint32_t val);
+    uint32_t (*get_features)(void *opaque);
+    void (*set_features)(void *opaque, uint32_t val);
+    void *opaque;
+    VirtQueue vq[VIRTIO_PCI_QUEUE_MAX];
+};
+
+static void vring_init(VirtQueue *vq, void *p)
+{
+    vq->vring.desc = p;
+    vq->vring.avail = p + vq->vring.num * sizeof(VRingDesc);
+    vq->vring.used = p + (vq->vring.num + 1) * (sizeof(VRingDesc) + sizeof(uint16_t));
+}
+
+static VirtIODevice *to_virtio_device(PCIDevice *pci_dev)
+{
+    return (VirtIODevice *)pci_dev;
+}
+
+static unsigned next_desc(VirtQueue *vq, unsigned int i)
+{
+    unsigned int next;
+
+    /* If this descriptor says it doesn't chain, we're done. */
+    if (!(vq->vring.desc[i].flags & VRING_DESC_F_NEXT))
+	return vq->vring.num;
+
+    /* Check they're not leading us off end of descriptors. */
+    next = vq->vring.desc[i].next;
+    /* Make sure compiler knows to grab that: we don't want it changing! */
+    wmb();
+
+    if (next >= vq->vring.num)
+	errx(1, "Desc next is %u", next);
+
+    return next;
+}
+
+static void *check_pointer(unsigned long addr, unsigned int size)
+{
+    if ((addr + size) > ram_size)
+	errx(1, "bad guest");
+    return phys_ram_base + addr;
+}
+
+static unsigned get_vq_desc(VirtQueue *vq,
+			    struct iovec iov[],
+			    unsigned int *out_num, unsigned int *in_num)
+{
+	unsigned int i, head;
+
+	/* Check it isn't doing very strange things with descriptor numbers. */
+	if ((uint16_t)(vq->vring.avail->idx - vq->last_avail_idx) > vq->vring.num)
+		errx(1, "Guest moved used index from %u to %u",
+		     vq->last_avail_idx, vq->vring.avail->idx);
+
+	/* If there's nothing new since last we looked, return invalid. */
+	if (vq->vring.avail->idx == vq->last_avail_idx)
+		return vq->vring.num;
+
+	/* Grab the next descriptor number they're advertising, and increment
+	 * the index we've seen. */
+	head = vq->vring.avail->ring[vq->last_avail_idx++ % vq->vring.num];
+
+	/* If their number is silly, that's a fatal mistake. */
+	if (head >= vq->vring.num)
+		errx(1, "Guest says index %u is available", head);
+
+	/* When we start there are none of either input nor output. */
+	*out_num = *in_num = 0;
+
+	i = head;
+	do {
+		/* Grab the first descriptor, and check it's OK. */
+		iov[*out_num + *in_num].iov_len = vq->vring.desc[i].len;
+		iov[*out_num + *in_num].iov_base
+			= check_pointer(vq->vring.desc[i].addr,
+					vq->vring.desc[i].len);
+		/* If this is an input descriptor, increment that count. */
+		if (vq->vring.desc[i].flags & VRING_DESC_F_WRITE)
+			(*in_num)++;
+		else {
+			/* If it's an output descriptor, they're all supposed
+			 * to come before any input descriptors. */
+			if (*in_num)
+				errx(1, "Descriptor has out after in");
+			(*out_num)++;
+		}
+
+		/* If we've got too many, that implies a descriptor loop. */
+		if (*out_num + *in_num > vq->vring.num)
+			errx(1, "Looped descriptor");
+	} while ((i = next_desc(vq, i)) != vq->vring.num);
+
+	return head;
+}
+
+static void virtio_ring_kick(VirtIODevice *vdev, VirtQueue *vq)
+{
+    if (vq->vring.desc)
+	vq->handle_output(vq->opaque, vdev, vq);
+}
+
+static void virtio_update_irq(VirtIODevice *vdev)
+{
+    qemu_set_irq(vdev->pci_dev.irq[0], vdev->isr & 1);
+}
+
+static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
+{
+    VirtIODevice *vdev = to_virtio_device(opaque);
+    ram_addr_t pa;
+
+    addr -= vdev->addr;
+
+    switch (addr) {
+    case VIRTIO_PCI_GUEST_FEATURES:
+	if (vdev->set_features)
+	    vdev->set_features(vdev->opaque, val);
+	vdev->features = val;
+	break;
+    case VIRTIO_PCI_QUEUE_PFN:
+	pa = (ram_addr_t)val << TARGET_PAGE_BITS;
+	vdev->vq[vdev->queue_sel].pfn = val;
+	if (pa < (ram_size - TARGET_PAGE_SIZE))
+	    vring_init(&vdev->vq[vdev->queue_sel], phys_ram_base + pa);
+	break;
+    case VIRTIO_PCI_QUEUE_SEL:
+	if (val < VIRTIO_PCI_QUEUE_MAX)
+	    vdev->queue_sel = val;
+	break;
+    case VIRTIO_PCI_QUEUE_NOTIFY:
+	if (val < VIRTIO_PCI_QUEUE_MAX)
+	    virtio_ring_kick(vdev, &vdev->vq[val]);
+	break;
+    case VIRTIO_PCI_STATUS:
+	vdev->status = val & 0xFF;
+	break;
+    default:
+	if (addr >= VIRTIO_PCI_CONFIG && vdev->set_config)
+	    vdev->set_config(vdev->opaque, addr - VIRTIO_PCI_CONFIG, val);
+	break;
+    }
+}
+
+static uint32_t virtio_ioport_read(void *opaque, uint32_t addr)
+{
+    VirtIODevice *vdev = to_virtio_device(opaque);
+    uint32_t ret = 0xFFFFFFFF;
+
+    addr -= vdev->addr;
+
+    switch (addr) {
+    case VIRTIO_PCI_HOST_FEATURES:
+	ret = vdev->get_features(vdev->opaque);
+	break;
+    case VIRTIO_PCI_GUEST_FEATURES:
+	ret = vdev->features;
+	break;
+    case VIRTIO_PCI_QUEUE_PFN:
+	ret = vdev->vq[vdev->queue_sel].pfn;
+	break;
+    case VIRTIO_PCI_QUEUE_NUM:
+	ret = vdev->vq[vdev->queue_sel].vring.num;
+	break;
+    case VIRTIO_PCI_QUEUE_SEL:
+	ret = vdev->queue_sel;
+	break;
+    case VIRTIO_PCI_STATUS:
+	ret = vdev->status;
+	break;
+    case VIRTIO_PCI_ISR:
+	/* reading from the ISR also clears it. */
+	ret = vdev->isr;
+	vdev->isr = 0;
+	virtio_update_irq(vdev);
+	break;
+    default:
+	if (addr >= VIRTIO_PCI_CONFIG)
+	    ret = vdev->get_config(vdev->opaque, addr - VIRTIO_PCI_CONFIG);
+	break;
+    }
+
+    return ret;
+}
+
+static void virtio_map(PCIDevice *pci_dev, int region_num,
+		       uint32_t addr, uint32_t size, int type)
+{
+    VirtIODevice *vdev = to_virtio_device(pci_dev);
+
+    vdev->addr = addr;
+    register_ioport_write(addr, size, 1, virtio_ioport_write, vdev);
+    register_ioport_write(addr, size, 2, virtio_ioport_write, vdev);
+    register_ioport_write(addr, size, 4, virtio_ioport_write, vdev);
+    register_ioport_read(addr, size, 1, virtio_ioport_read, vdev);
+    register_ioport_read(addr, size, 2, virtio_ioport_read, vdev);
+    register_ioport_read(addr, size, 4, virtio_ioport_read, vdev);
+}
+
+/* Convert an iovec element to the given type.
+ *
+ * This is a fairly ugly trick: we need to know the size of the type and
+ * alignment requirement to check the pointer is kosher.  It's also nice to
+ * have the name of the type in case we report failure.
+ *
+ * Typing those three things all the time is cumbersome and error prone, so we
+ * have a macro which sets them all up and passes to the real function. */
+#define convert(iov, type) \
+	((type *)_convert((iov), sizeof(type), __alignof__(type), #type))
+
+static void *_convert(struct iovec *iov, size_t size, size_t align,
+		      const char *name)
+{
+	if (iov->iov_len != size)
+		errx(1, "Bad iovec size %zu for %s", iov->iov_len, name);
+	if ((unsigned long)iov->iov_base % align != 0)
+		errx(1, "Bad alignment %p for %s", iov->iov_base, name);
+	return iov->iov_base;
+}
+
+static void add_used(VirtQueue *vq, unsigned int head, int len)
+{
+	VRingUsedElem *used;
+
+	/* Get a pointer to the next entry in the used ring. */
+	used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
+	used->id = head;
+	used->len = len;
+	/* Make sure buffer is written before we update index. */
+	wmb();
+	vq->vring.used->idx++;
+}
+
+void virtio_add_queue(VirtIODevice *vdev, unsigned int num, 
+		      void (*handle_output)(void *, VirtIODevice *, VirtQueue *),
+		      void *opaque)
+{
+    int i;
+
+    for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+	if (vdev->vq[i].vring.num == 0)
+	    break;
+    }
+
+    /* FIXME bug_on i == VIRTIO_PCI_QUEUE_MAX */
+
+    vdev->vq[i].vring.num = num;
+    vdev->vq[i].handle_output = handle_output;
+    vdev->vq[i].opaque = opaque;
+}
+
+#include <linux/virtio_blk.h>
+#include <stdbool.h>
+
+#define BLK_MAX_QUEUE_SIZE	127
+
+static bool virtio_blk_handle_request(BlockDriverState *bs,
+				      VirtIODevice *vdev, VirtQueue *vq)
+{
+    struct iovec iov[vq->vring.num];
+    unsigned int head, out_num, in_num, wlen;
+    struct virtio_blk_inhdr *in;
+    struct virtio_blk_outhdr *out;
+
+    head = get_vq_desc(vq, iov, &out_num, &in_num);
+    /* FIXME: if get_vq_desc returns an err, we don't need to know vq */
+    if (head == vq->vring.num) {
+	return false;
+    }
+
+    if (out_num == 0 || in_num == 0) {
+	return false;
+    }
+
+    out = convert(&iov[0], struct virtio_blk_outhdr);
+    in = convert(&iov[out_num+in_num-1], struct virtio_blk_inhdr);
+    off_t off = out->sector;
+
+    if (out->type & VIRTIO_BLK_T_SCSI_CMD) {
+	in->status = VIRTIO_BLK_S_UNSUPP;
+	wlen = sizeof(in);
+    } else if (out->type & VIRTIO_BLK_T_OUT) {
+	int i;
+
+	wlen = sizeof(in);
+	for (i = 0; i < out_num - 1; i++) {
+	    bdrv_write(bs, off, iov[i + 1].iov_base,
+		       iov[i + 1].iov_len / 512);
+	    off += iov[i + 1].iov_len / 512;
+	}
+	in->status = VIRTIO_BLK_S_OK;
+    } else {
+	int i;
+
+	wlen = sizeof(in);
+	for (i = 0; i < in_num - 1; i++) {
+	    bdrv_read(bs, off, iov[i + 1].iov_base,
+		      iov[i + 1].iov_len / 512);
+	    off += iov[i + 1].iov_len / 512;
+	    wlen += iov[i + 1].iov_len;
+	}
+	in->status = VIRTIO_BLK_S_OK;
+    }
+
+    add_used(vq, head, wlen);
+    /* FIXME: the blk code shouldn't have to know implementation details of the
+       virtio layer so this should be abstracted into a ->kick() */
+    vdev->isr = 1;
+    virtio_update_irq(vdev);
+
+    return true;
+}
+
+static void virtio_blk_handle_output(void *opaque, VirtIODevice *vdev,
+				     VirtQueue *vq)
+{
+    BlockDriverState *bs = opaque;
+
+    while (virtio_blk_handle_request(bs, vdev, vq));
+}
+
+static uint32_t virtio_blk_get_config(void *opaque, uint32_t addr)
+{
+    BlockDriverState *bs = opaque;
+    int64_t capacity;
+
+    switch (addr) {
+    case VIRTIO_CONFIG_BLK_F_CAPACITY:
+	bdrv_get_geometry(bs, &capacity);
+	return (uint32_t)capacity;
+    case VIRTIO_CONFIG_BLK_F_CAPACITY + 4:
+	bdrv_get_geometry(bs, &capacity);
+	return (uint32_t)(capacity >> 32);
+    case VIRTIO_CONFIG_BLK_F_SIZE_MAX:
+	return 8192;
+    case VIRTIO_CONFIG_BLK_F_SEG_MAX:
+	return BLK_MAX_QUEUE_SIZE - 2;
+    }
+
+    return 0xFFFFFFFF;
+}
+
+static uint32_t virtio_blk_get_features(void *opaque)
+{
+    return  (1 << VIRTIO_BLK_F_SIZE_MAX) |
+	    (1 << VIRTIO_BLK_F_SEG_MAX);
+}
+
+static void virtio_blk_init(VirtIODevice *vdev, BlockDriverState *bs)
+{
+    vdev->get_config = virtio_blk_get_config;
+    vdev->get_features = virtio_blk_get_features;
+    vdev->opaque = bs;
+
+    virtio_add_queue(vdev, BLK_MAX_QUEUE_SIZE, virtio_blk_handle_output, bs);
+}
+
+VirtIODevice *virtio_init_pci(PCIBus *bus, const char *name,
+			      uint16_t vendorid, uint16_t deviceid)
+{
+    PCIDevice *pci_dev;
+    VirtIODevice *vdev;
+    uint8_t *config;
+    uint16_t virtio_devid = 0x02;
+
+    pci_dev = pci_register_device(bus, name, sizeof(VirtIODevice),
+				  -1, NULL, NULL);
+    vdev = to_virtio_device(pci_dev);
+
+    vdev->status = 0;
+    vdev->isr = 0;
+    vdev->queue_sel = 0;
+    memset(vdev->vq, 0, sizeof(vdev->vq));
+
+    config = pci_dev->config;
+    config[0x00] = vendorid & 0xFF;
+    config[0x01] = (vendorid >> 8) & 0xFF;
+    config[0x02] = deviceid & 0xFF;
+    config[0x03] = (deviceid >> 8) & 0xFF;
+
+    config[0x09] = 0x00;
+    config[0x0a] = 0x00;
+    config[0x0b] = 0x05;
+    config[0x0e] = 0x00;
+
+    config[0x2c] = vendorid & 0xFF;
+    config[0x2d] = (vendorid >> 8) & 0xFF;
+    config[0x2e] = virtio_devid & 0xFF;
+    config[0x2f] = (virtio_devid >> 8) & 0xFF;
+    
+    config[0x3d] = 1;
+
+    vdev->irq = 16;
+    vdev->name = name;
+
+    pci_register_io_region(pci_dev, 0, 128,
+			   PCI_ADDRESS_SPACE_IO, virtio_map);
+    
+    if (1) {
+	BlockDriverState *bs = bdrv_new("vda");
+	if (bdrv_open(bs, "/home/anthony/images/linux.img", BDRV_O_SNAPSHOT))
+	    exit(1);
+	virtio_blk_init(vdev, bs);
+    }
+
+    return vdev;
+}
diff --git a/qemu/vl.h b/qemu/vl.h
index 01aeabc..985035e 100644
--- a/qemu/vl.h
+++ b/qemu/vl.h
@@ -1392,6 +1392,19 @@ typedef struct ADBDevice ADBDevice;
 void pci_hypercall_init(PCIBus *bus);
 void vmchannel_init(CharDriverState *hd, uint32_t deviceid, uint32_t index);
 
+/* virtio.c */
+
+typedef struct VirtQueue VirtQueue;
+typedef struct VirtIODevice VirtIODevice;
+
+VirtIODevice *virtio_init_pci(PCIBus *bus, const char *name,
+			      uint16_t vendorid, uint16_t deviceid);
+void virtio_set_id(VirtIODevice *vdev, uint16_t vendor, uint16_t device);
+void virtio_add_queue(VirtIODevice *vdev, unsigned int num, 
+		      void (*handle_output)(void *, VirtIODevice *, VirtQueue *),
+		      void *opaque);
+void virtio_add_config(VirtIODevice *vdev, int type, void *data, size_t size);
+
 /* buf = NULL means polling */
 typedef int ADBDeviceRequest(ADBDevice *d, uint8_t *buf_out,
                               const uint8_t *buf, int len);

-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found] ` <11944902733951-git-send-email-aliguori-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2007-11-08  6:24   ` Avi Kivity
       [not found]     ` <4732ABA0.5090603-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-11-09  0:25   ` Dor Laor
  1 sibling, 1 reply; 17+ messages in thread
From: Avi Kivity @ 2007-11-08  6:24 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
> +    case VIRTIO_PCI_QUEUE_NOTIFY:
> +	if (val < VIRTIO_PCI_QUEUE_MAX)
> +	    virtio_ring_kick(vdev, &vdev->vq[val]);
> +	break;
>   

I see you're not using hypercalls for this, presumably for compatibility
with -no-kvm.  Well I think I have a solution: advertise vmcall,
vmmcall, pio to some port, and int $some_vector as hypercall feature
bits in cpuid (for kvm, kvm, qemu, and kvm-lite respectively).  Early
setup code could patch the instruction as appropriate (I hear code
patching is now taught in second grade).

(kvm could advertise all four, or maybe just the first two)

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]     ` <4732ABA0.5090603-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-11-08 13:57       ` Anthony Liguori
       [not found]         ` <473315DB.9030803-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Anthony Liguori @ 2007-11-08 13:57 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Anthony Liguori wrote:
>   
>> +    case VIRTIO_PCI_QUEUE_NOTIFY:
>> +	if (val < VIRTIO_PCI_QUEUE_MAX)
>> +	    virtio_ring_kick(vdev, &vdev->vq[val]);
>> +	break;
>>   
>>     
>
> I see you're not using hypercalls for this, presumably for compatibility
> with -no-kvm.

More than just that.  By stick to PIO, we are compatible with just about 
any VMM.  For instance, we get Xen support for free.  If we used 
hypercalls, even if we agreed on a way to determine which number to use 
and how to make those calls, it would still be difficult to implement in 
something like Xen.

>   Well I think I have a solution: advertise vmcall,
> vmmcall, pio to some port, and int $some_vector as hypercall feature
> bits in cpuid (for kvm, kvm, qemu, and kvm-lite respectively).  Early
> setup code could patch the instruction as appropriate (I hear code
> patching is now taught in second grade).
>   

That ties our device to our particular hypercall implementation.  If we 
were going to do this, I'd prefer to advertise it in the device I 
think.  I really would need to look at the performance though of vmcall 
and an edge triggered interrupt.  It would have to be pretty compelling 
to warrant the additional complexity I think.

Regards,

Anthony Liguori

> (kvm could advertise all four, or maybe just the first two)
>
>   


-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]         ` <473315DB.9030803-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2007-11-08 14:02           ` Avi Kivity
       [not found]             ` <4733170B.70206-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Avi Kivity @ 2007-11-08 14:02 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
> Avi Kivity wrote:
>> Anthony Liguori wrote:
>>  
>>> +    case VIRTIO_PCI_QUEUE_NOTIFY:
>>> +    if (val < VIRTIO_PCI_QUEUE_MAX)
>>> +        virtio_ring_kick(vdev, &vdev->vq[val]);
>>> +    break;
>>>       
>>
>> I see you're not using hypercalls for this, presumably for compatibility
>> with -no-kvm.
>
> More than just that.  By stick to PIO, we are compatible with just 
> about any VMM.  For instance, we get Xen support for free.  If we used 
> hypercalls, even if we agreed on a way to determine which number to 
> use and how to make those calls, it would still be difficult to 
> implement in something like Xen.
>

But pio through the config space basically means you're committed to 
handling it in qemu.  We want a more flexible mechanism.

Detecting how to make hypercalls can be left to paravirt_ops.

(for Xen you'd use an event channel; and for kvm the virtio kick hypercall).

>>   Well I think I have a solution: advertise vmcall,
>> vmmcall, pio to some port, and int $some_vector as hypercall feature
>> bits in cpuid (for kvm, kvm, qemu, and kvm-lite respectively).  Early
>> setup code could patch the instruction as appropriate (I hear code
>> patching is now taught in second grade).
>>   
>
> That ties our device to our particular hypercall implementation.  If 
> we were going to do this, I'd prefer to advertise it in the device I 
> think.  I really would need to look at the performance though of 
> vmcall and an edge triggered interrupt.  It would have to be pretty 
> compelling to warrant the additional complexity I think. 

vmcall costs will go down, and we don't want to use different mechanisms 
for high bandwidth and low bandwidth devices.



-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]             ` <4733170B.70206-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-11-08 15:09               ` Anthony Liguori
       [not found]                 ` <473326B4.2080307-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Anthony Liguori @ 2007-11-08 15:09 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Anthony Liguori wrote:
>> Avi Kivity wrote:
>>> Anthony Liguori wrote:
>>>  
>>>> +    case VIRTIO_PCI_QUEUE_NOTIFY:
>>>> +    if (val < VIRTIO_PCI_QUEUE_MAX)
>>>> +        virtio_ring_kick(vdev, &vdev->vq[val]);
>>>> +    break;
>>>>       
>>>
>>> I see you're not using hypercalls for this, presumably for 
>>> compatibility
>>> with -no-kvm.
>>
>> More than just that.  By stick to PIO, we are compatible with just 
>> about any VMM.  For instance, we get Xen support for free.  If we 
>> used hypercalls, even if we agreed on a way to determine which number 
>> to use and how to make those calls, it would still be difficult to 
>> implement in something like Xen.
>>
>
> But pio through the config space basically means you're committed to 
> handling it in qemu.  We want a more flexible mechanism.

There's no reason that the PIO operations couldn't be handled in the 
kernel.  You'll already need some level of cooperation in userspace 
unless you plan on implementing the PCI bus in kernel space too.  It's 
easy enough in the pci_map function in QEMU to just notify the kernel 
that it should listen on a particular PIO range.

> Detecting how to make hypercalls can be left to paravirt_ops.
>
> (for Xen you'd use an event channel; and for kvm the virtio kick 
> hypercall).
>
>>>   Well I think I have a solution: advertise vmcall,
>>> vmmcall, pio to some port, and int $some_vector as hypercall feature
>>> bits in cpuid (for kvm, kvm, qemu, and kvm-lite respectively).  Early
>>> setup code could patch the instruction as appropriate (I hear code
>>> patching is now taught in second grade).
>>>   
>>
>> That ties our device to our particular hypercall implementation.  If 
>> we were going to do this, I'd prefer to advertise it in the device I 
>> think.  I really would need to look at the performance though of 
>> vmcall and an edge triggered interrupt.  It would have to be pretty 
>> compelling to warrant the additional complexity I think. 
>
> vmcall costs will go down, and we don't want to use different 
> mechanisms for high bandwidth and low bandwidth devices.

vmcalls will certainly get faster but I doubt that the cost difference 
between vmcall and pio will ever be greater than a few hundred cycles.  
The only performance sensitive operation here would be the kick and I 
don't think a few hundred cycles in the kick path is ever going to be 
that significant for overall performance.

So why introduce the extra complexity?

Regards,

Anthony Liguori
>
>


-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]                 ` <473326B4.2080307-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2007-11-08 15:19                   ` Avi Kivity
       [not found]                     ` <473328EC.4090905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-11-08 15:31                   ` Avi Kivity
  1 sibling, 1 reply; 17+ messages in thread
From: Avi Kivity @ 2007-11-08 15:19 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
> Avi Kivity wrote:
>   
>> Anthony Liguori wrote:
>>     
>>> Avi Kivity wrote:
>>>       
>>>> Anthony Liguori wrote:
>>>>  
>>>>         
>>>>> +    case VIRTIO_PCI_QUEUE_NOTIFY:
>>>>> +    if (val < VIRTIO_PCI_QUEUE_MAX)
>>>>> +        virtio_ring_kick(vdev, &vdev->vq[val]);
>>>>> +    break;
>>>>>       
>>>>>           
>>>> I see you're not using hypercalls for this, presumably for 
>>>> compatibility
>>>> with -no-kvm.
>>>>         
>>> More than just that.  By stick to PIO, we are compatible with just 
>>> about any VMM.  For instance, we get Xen support for free.  If we 
>>> used hypercalls, even if we agreed on a way to determine which number 
>>> to use and how to make those calls, it would still be difficult to 
>>> implement in something like Xen.
>>>
>>>       
>> But pio through the config space basically means you're committed to 
>> handling it in qemu.  We want a more flexible mechanism.
>>     
>
> There's no reason that the PIO operations couldn't be handled in the 
> kernel.  You'll already need some level of cooperation in userspace 
> unless you plan on implementing the PCI bus in kernel space too.  It's 
> easy enough in the pci_map function in QEMU to just notify the kernel 
> that it should listen on a particular PIO range.
>
>   

This is a config space write, right?  If so, the range is the regular 
0xcf8-0xcff and it has to be very specially handled.

>> Detecting how to make hypercalls can be left to paravirt_ops.
>>
>> (for Xen you'd use an event channel; and for kvm the virtio kick 
>> hypercall).
>>
>>     
>>>>   Well I think I have a solution: advertise vmcall,
>>>> vmmcall, pio to some port, and int $some_vector as hypercall feature
>>>> bits in cpuid (for kvm, kvm, qemu, and kvm-lite respectively).  Early
>>>> setup code could patch the instruction as appropriate (I hear code
>>>> patching is now taught in second grade).
>>>>   
>>>>         
>>> That ties our device to our particular hypercall implementation.  If 
>>> we were going to do this, I'd prefer to advertise it in the device I 
>>> think.  I really would need to look at the performance though of 
>>> vmcall and an edge triggered interrupt.  It would have to be pretty 
>>> compelling to warrant the additional complexity I think. 
>>>       
>> vmcall costs will go down, and we don't want to use different 
>> mechanisms for high bandwidth and low bandwidth devices.
>>     
>
> vmcalls will certainly get faster but I doubt that the cost difference 
> between vmcall and pio will ever be greater than a few hundred cycles.  
> The only performance sensitive operation here would be the kick and I 
> don't think a few hundred cycles in the kick path is ever going to be 
> that significant for overall performance.
>
>   

Why do you think the different will be a few hundred cycles?  And if you 
have a large number of devices, searching the list becomes expensive too.

> So why introduce the extra complexity?
>   

Overall I think it reduces comlexity if we have in-kernel devices.  
Anyway we can add additional signalling methods later.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]                 ` <473326B4.2080307-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2007-11-08 15:19                   ` Avi Kivity
@ 2007-11-08 15:31                   ` Avi Kivity
       [not found]                     ` <47332BB7.2000900-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  1 sibling, 1 reply; 17+ messages in thread
From: Avi Kivity @ 2007-11-08 15:31 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
>>>>  
>>>>> +    case VIRTIO_PCI_QUEUE_NOTIFY:
>>>>> +    if (val < VIRTIO_PCI_QUEUE_MAX)
>>>>> +        virtio_ring_kick(vdev, &vdev->vq[val]);
>>>>> +    break;
>>>>>       
>>>>
>>>> I see you're not using hypercalls for this, presumably for 
>>>> compatibility
>>>> with -no-kvm.
>>>
>>> More than just that.  By stick to PIO, we are compatible with just 
>>> about any VMM.  For instance, we get Xen support for free.  If we 
>>> used hypercalls, even if we agreed on a way to determine which 
>>> number to use and how to make those calls, it would still be 
>>> difficult to implement in something like Xen.
>>>
>>
>> But pio through the config space basically means you're committed to 
>> handling it in qemu.  We want a more flexible mechanism.
>
> There's no reason that the PIO operations couldn't be handled in the 
> kernel.  You'll already need some level of cooperation in userspace 
> unless you plan on implementing the PCI bus in kernel space too.  It's 
> easy enough in the pci_map function in QEMU to just notify the kernel 
> that it should listen on a particular PIO range.

With my new  understanding of what this is all about, I suggest each 
virtqueue having an ID filled in by the host.  This ID is globally 
unique, and is used as an argument for kick.  It would map into a Xen 
domain id + event channel number, a number to be written into a pio port 
for kvm-lite or non-hypercall kvm, the argument for a kick hypercall on 
kvm, or whatever.

This is independent of virtio-pci, which is good.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]                     ` <473328EC.4090905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-11-08 16:22                       ` Anthony Liguori
       [not found]                         ` <473337B9.8040503-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Anthony Liguori @ 2007-11-08 16:22 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
>> There's no reason that the PIO operations couldn't be handled in the 
>> kernel.  You'll already need some level of cooperation in userspace 
>> unless you plan on implementing the PCI bus in kernel space too.  
>> It's easy enough in the pci_map function in QEMU to just notify the 
>> kernel that it should listen on a particular PIO range.
>>
>>   
>
> This is a config space write, right?  If so, the range is the regular 
> 0xcf8-0xcff and it has to be very specially handled.

This is a per-device IO slot and as best as I can tell, the PCI device 
advertises the size of the region and the OS then identifies a range of 
PIO space to use and tells the PCI device about it.  So we would just 
need to implement a generic userspace virtio PCI device in QEMU that did 
an ioctl to the kernel when this happened to tell the kernel what region 
to listen on for a particular device.

>> vmcalls will certainly get faster but I doubt that the cost 
>> difference between vmcall and pio will ever be greater than a few 
>> hundred cycles.  The only performance sensitive operation here would 
>> be the kick and I don't think a few hundred cycles in the kick path 
>> is ever going to be that significant for overall performance.
>>
>>   
>
> Why do you think the different will be a few hundred cycles?

The only difference in hardware between a PIO exit and a vmcall is that 
you don't have write out an exit reason in the VMC[SB].  So the 
performance difference between PIO/vmcall shouldn't be that great (and 
if it were, the difference would probably be obvious today).  That's 
different from, say, a PF exit because with a PF, you also have to 
attempt to resolve it by walking the guest page table before determining 
that you do in fact need to exit.

>   And if you have a large number of devices, searching the list 
> becomes expensive too.

The PIO address space is relatively small.  You could do a radix tree or 
even a direct array lookup if you are concerned about performance.

>> So why introduce the extra complexity?
>>   
>
> Overall I think it reduces comlexity if we have in-kernel devices.  
> Anyway we can add additional signalling methods later.

In-kernel virtio backends add quite a lot of complexity.  Just the 
mechanism to setup the device is complicated enough.  I suspect that 
it'll be necessary down the road for performance but I certainly don't 
think it's a simplification.

Regards,

Anthony Liguori

-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]                     ` <47332BB7.2000900-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-11-08 19:02                       ` Anthony Liguori
  0 siblings, 0 replies; 17+ messages in thread
From: Anthony Liguori @ 2007-11-08 19:02 UTC (permalink / raw)
  To: Avi Kivity; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Avi Kivity wrote:
> Anthony Liguori wrote:
>>>>>  
>>>>>> +    case VIRTIO_PCI_QUEUE_NOTIFY:
>>>>>> +    if (val < VIRTIO_PCI_QUEUE_MAX)
>>>>>> +        virtio_ring_kick(vdev, &vdev->vq[val]);
>>>>>> +    break;
>>>>>>       
>>>>>
>>>>> I see you're not using hypercalls for this, presumably for 
>>>>> compatibility
>>>>> with -no-kvm.
>>>>
>>>> More than just that.  By stick to PIO, we are compatible with just 
>>>> about any VMM.  For instance, we get Xen support for free.  If we 
>>>> used hypercalls, even if we agreed on a way to determine which 
>>>> number to use and how to make those calls, it would still be 
>>>> difficult to implement in something like Xen.
>>>>
>>>
>>> But pio through the config space basically means you're committed to 
>>> handling it in qemu.  We want a more flexible mechanism.
>>
>> There's no reason that the PIO operations couldn't be handled in the 
>> kernel.  You'll already need some level of cooperation in userspace 
>> unless you plan on implementing the PCI bus in kernel space too.  
>> It's easy enough in the pci_map function in QEMU to just notify the 
>> kernel that it should listen on a particular PIO range.
>
> With my new  understanding of what this is all about, I suggest each 
> virtqueue having an ID filled in by the host.  This ID is globally 
> unique, and is used as an argument for kick.  It would map into a Xen 
> domain id + event channel number, a number to be written into a pio 
> port for kvm-lite or non-hypercall kvm, the argument for a kick 
> hypercall on kvm, or whatever.

Yeah, right now, I maintain a virtqueue "selector" within virtio-pci and 
use that for notification.  This index is also exposed in the 
config->find_vq() within virtio.  Changing that to an opaque ID would 
require introducing another mechanism to enumerate the virtqueues since 
you couldn't just start from 0 and keep going until you hit an invalid 
virtqueue.

I'm not sure I'm convinced that you couldn't just hide this "id" notion 
in the virtio-xen implementation if you needed to.

Regards,

Anthony Liguori

> This is independent of virtio-pci, which is good.
>


-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]                         ` <473337B9.8040503-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
@ 2007-11-09  0:13                           ` Dor Laor
       [not found]                             ` <4733A635.1080004-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  2007-11-11  9:23                           ` Avi Kivity
  1 sibling, 1 reply; 17+ messages in thread
From: Dor Laor @ 2007-11-09  0:13 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Avi Kivity


[-- Attachment #1.1: Type: text/plain, Size: 3574 bytes --]

Anthony Liguori wrote:
> Avi Kivity wrote:
>   
>>> There's no reason that the PIO operations couldn't be handled in the 
>>> kernel.  You'll already need some level of cooperation in userspace 
>>> unless you plan on implementing the PCI bus in kernel space too.  
>>> It's easy enough in the pci_map function in QEMU to just notify the 
>>> kernel that it should listen on a particular PIO range.
>>>
>>>   
>>>       
>> This is a config space write, right?  If so, the range is the regular 
>> 0xcf8-0xcff and it has to be very specially handled.
>>     
>
> This is a per-device IO slot and as best as I can tell, the PCI device 
> advertises the size of the region and the OS then identifies a range of 
> PIO space to use and tells the PCI device about it.  So we would just 
> need to implement a generic userspace virtio PCI device in QEMU that did 
> an ioctl to the kernel when this happened to tell the kernel what region 
> to listen on for a particular device.
>
>   
>>> vmcalls will certainly get faster but I doubt that the cost 
>>> difference between vmcall and pio will ever be greater than a few 
>>> hundred cycles.  The only performance sensitive operation here would 
>>> be the kick and I don't think a few hundred cycles in the kick path 
>>> is ever going to be that significant for overall performance.
>>>
>>>   
>>>       
>> Why do you think the different will be a few hundred cycles?
>>     
>
> The only difference in hardware between a PIO exit and a vmcall is that 
> you don't have write out an exit reason in the VMC[SB].  So the 
> performance difference between PIO/vmcall shouldn't be that great (and 
> if it were, the difference would probably be obvious today).  That's 
> different from, say, a PF exit because with a PF, you also have to 
> attempt to resolve it by walking the guest page table before determining 
> that you do in fact need to exit.
>
>   
>>   And if you have a large number of devices, searching the list 
>> becomes expensive too.
>>     
>
> The PIO address space is relatively small.  You could do a radix tree or 
> even a direct array lookup if you are concerned about performance.
>
>   
>>> So why introduce the extra complexity?
>>>   
>>>       
>> Overall I think it reduces comlexity if we have in-kernel devices.  
>> Anyway we can add additional signalling methods later.
>>     
>
> In-kernel virtio backends add quite a lot of complexity.  Just the 
> mechanism to setup the device is complicated enough.  I suspect that 
> it'll be necessary down the road for performance but I certainly don't 
> think it's a simplification.
>
>   
I believe that the network interface will quickly go to the kernel since 
copy takes most of the
cpu time and qemu does not support scatter gather dma at the moment.
Nevertheless using pio seems good enough, Anthony's suggestion of 
notifying the kernel using ioctls
is logical. If we'll run into troubles further on we can add a hypercall 
capability and if exist use hypercalls
instead of pios.
> Regards,
>
> Anthony Liguori
>
>
> -------------------------------------------------------------------------
> This SF.net email is sponsored by: Splunk Inc.
> Still grepping through log files to find problems?  Stop.
> Now Search log events and configuration files using AJAX and a browser.
> Download your FREE copy of Splunk now >> http://get.splunk.com/
> _______________________________________________
> kvm-devel mailing list
> kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
> https://lists.sourceforge.net/lists/listinfo/kvm-devel
>
>   


[-- Attachment #1.2: Type: text/html, Size: 4752 bytes --]

[-- Attachment #2: Type: text/plain, Size: 314 bytes --]

-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

[-- Attachment #3: Type: text/plain, Size: 186 bytes --]

_______________________________________________
kvm-devel mailing list
kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
https://lists.sourceforge.net/lists/listinfo/kvm-devel

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found] ` <11944902733951-git-send-email-aliguori-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2007-11-08  6:24   ` Avi Kivity
@ 2007-11-09  0:25   ` Dor Laor
       [not found]     ` <4733A917.5000303-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  1 sibling, 1 reply; 17+ messages in thread
From: Dor Laor @ 2007-11-09  0:25 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
> This still needs quite a lot of work but I wanted to post it for reference.
>
> Regards,
>
> Anthony Liguori
>
> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
>   
...
Why change Rusty's codding standard? It will be harder to track changes.
> +typedef struct VRingDesc
> +{
> +    uint64_t addr;
> +    uint32_t len;
> +    uint16_t flags;
> +    uint16_t next;
> +} VRingDesc;
> +
> +typedef struct VRingAvail
> +{
> +    uint16_t flags;
> +    uint16_t idx;
> +    uint16_t ring[0];
> +} VRingAvail;
> +
> +typedef struct VRingUsedElem
> +{
> +    uint32_t id;
> +    uint32_t len;
> +} VRingUsedElem;
> +
> +typedef struct VRingUsed
> +{
> +    uint16_t flags;
> +    uint16_t idx;
> +    VRingUsedElem ring[0];
> +} VRingUsed;
> +
> +typedef struct VRing
> +{
> +    unsigned int num;
> +    VRingDesc *desc;
> +    VRingAvail *avail;
> +    VRingUsed *used;
> +} VRing;
> +
....
> +static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
> +{
> +    VirtIODevice *vdev = to_virtio_device(opaque);
> +    ram_addr_t pa;
> +
> +    addr -= vdev->addr;
> +
> +    switch (addr) {
> +    case VIRTIO_PCI_GUEST_FEATURES:
> +	if (vdev->set_features)
> +	    vdev->set_features(vdev->opaque, val);
> +	vdev->features = val;
> +	break;
> +    case VIRTIO_PCI_QUEUE_PFN:
> +	pa = (ram_addr_t)val << TARGET_PAGE_BITS;
> +	vdev->vq[vdev->queue_sel].pfn = val;
>   
Some validity checks are missing, you assume you have the queue_sel.
> +	if (pa < (ram_size - TARGET_PAGE_SIZE))
> +	    vring_init(&vdev->vq[vdev->queue_sel], phys_ram_base + pa);
> +	break;
> +    case VIRTIO_PCI_QUEUE_SEL:
> +	if (val < VIRTIO_PCI_QUEUE_MAX)
> +	    vdev->queue_sel = val;
> +	break;
> +    case VIRTIO_PCI_QUEUE_NOTIFY:
> +	if (val < VIRTIO_PCI_QUEUE_MAX)
> +	    virtio_ring_kick(vdev, &vdev->vq[val]);
> +	break;
> +    case VIRTIO_PCI_STATUS:
> +	vdev->status = val & 0xFF;
>   
we should keep another internal status and it will track the 
initialization of all the above fields (
pfn, queue_sel,..) the device will be active once all of them were 
initialized by the guest
> +	break;
> +    default:
> +	if (addr >= VIRTIO_PCI_CONFIG && vdev->set_config)
> +	    vdev->set_config(vdev->opaque, addr - VIRTIO_PCI_CONFIG, val);
> +	break;
> +    }
> +}
> +
>   

What about having block/net/9p.. in separate files? It will grow over time.
> +#include <linux/virtio_blk.h>
> +#include <stdbool.h>
> +
> +#define BLK_MAX_QUEUE_SIZE	127
> +
> +static bool virtio_blk_handle_request(BlockDriverState *bs,
> +				      VirtIODevice *vdev, VirtQueue *vq)
> +{
> +    struct iovec iov[vq->vring.num];
> +    unsigned int head, out_num, in_num, wlen;
> +    struct virtio_blk_inhdr *in;
> +    struct virtio_blk_outhdr *out;
> +
Great job, Dor.

-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]     ` <4733A917.5000303-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-11-09  1:38       ` Anthony Liguori
  0 siblings, 0 replies; 17+ messages in thread
From: Anthony Liguori @ 2007-11-09  1:38 UTC (permalink / raw)
  To: dor.laor-atKUWr5tajBWk0Htik3J/w
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Dor Laor wrote:
> Anthony Liguori wrote:
>> This still needs quite a lot of work but I wanted to post it for 
>> reference.
>>
>> Regards,
>>
>> Anthony Liguori
>>
>> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
>>   
> ...
> Why change Rusty's codding standard? It will be harder to track changes.

Because Linux kernel coding standards aren't QEMU coding standards.  
Besides, this is supposed to be an ABI so it shouldn't be changing all 
that much :-)

I posted the QEMU bits as soon as I got it working.  I still have a lot 
to do in it however I have addressed some of the things you brought up 
already.

>> +    case VIRTIO_PCI_QUEUE_PFN:
>> +    pa = (ram_addr_t)val << TARGET_PAGE_BITS;
>> +    vdev->vq[vdev->queue_sel].pfn = val;
>>   
> Some validity checks are missing, you assume you have the queue_sel.

Yes, the code is carefully written so that queue_sel is always valid.

>> +    if (pa < (ram_size - TARGET_PAGE_SIZE))
>> +        vring_init(&vdev->vq[vdev->queue_sel], phys_ram_base + pa);
>> +    break;
>> +    case VIRTIO_PCI_QUEUE_SEL:
>> +    if (val < VIRTIO_PCI_QUEUE_MAX)
>> +        vdev->queue_sel = val;
>> +    break;
>> +    case VIRTIO_PCI_QUEUE_NOTIFY:
>> +    if (val < VIRTIO_PCI_QUEUE_MAX)
>> +        virtio_ring_kick(vdev, &vdev->vq[val]);
>> +    break;
>> +    case VIRTIO_PCI_STATUS:
>> +    vdev->status = val & 0xFF;
>>   
> we should keep another internal status and it will track the 
> initialization of all the above fields (
> pfn, queue_sel,..) the device will be active once all of them were 
> initialized by the guest

Hrm, I don't follow.  The only thing that has to be written to by the 
guest is the PFN which also has the effect of activating the queue.

>> +    break;
>> +    default:
>> +    if (addr >= VIRTIO_PCI_CONFIG && vdev->set_config)
>> +        vdev->set_config(vdev->opaque, addr - VIRTIO_PCI_CONFIG, val);
>> +    break;
>> +    }
>> +}
>> +
>>   
>
> What about having block/net/9p.. in separate files? It will grow over 
> time.

Yup, already have that in my own queue.

The latest version queue for the kernel side is at 
http://hg.codemonkey.ws/virtio-pci  (based on the master branch of 
Rusty's virtio tree).  The latest queue for the QEMU side is at 
http://hg.codemonkey.ws/qemu-virtio.  I have a functioning block and 9p 
transport.  I'll continue cleaning up tomorrow and will hopefully post 
another set of patches early next week.

Unfortunately, I uncovered a bug in the in-kernel APIC code today so you 
need to run with -no-kvm-irqchip if you want to use multiple virtio 
devices at once :-/

Regards,

Anthony Liguori

>> +#include <linux/virtio_blk.h>
>> +#include <stdbool.h>
>> +
>> +#define BLK_MAX_QUEUE_SIZE    127
>> +
>> +static bool virtio_blk_handle_request(BlockDriverState *bs,
>> +                      VirtIODevice *vdev, VirtQueue *vq)
>> +{
>> +    struct iovec iov[vq->vring.num];
>> +    unsigned int head, out_num, in_num, wlen;
>> +    struct virtio_blk_inhdr *in;
>> +    struct virtio_blk_outhdr *out;
>> +
> Great job, Dor.


-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]                         ` <473337B9.8040503-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
  2007-11-09  0:13                           ` Dor Laor
@ 2007-11-11  9:23                           ` Avi Kivity
  1 sibling, 0 replies; 17+ messages in thread
From: Avi Kivity @ 2007-11-11  9:23 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Anthony Liguori wrote:
> Avi Kivity wrote:
>>> There's no reason that the PIO operations couldn't be handled in the 
>>> kernel.  You'll already need some level of cooperation in userspace 
>>> unless you plan on implementing the PCI bus in kernel space too.  
>>> It's easy enough in the pci_map function in QEMU to just notify the 
>>> kernel that it should listen on a particular PIO range.
>>>
>>>   
>>
>> This is a config space write, right?  If so, the range is the regular 
>> 0xcf8-0xcff and it has to be very specially handled.
>
> This is a per-device IO slot and as best as I can tell, the PCI device 
> advertises the size of the region and the OS then identifies a range 
> of PIO space to use and tells the PCI device about it.  So we would 
> just need to implement a generic userspace virtio PCI device in QEMU 
> that did an ioctl to the kernel when this happened to tell the kernel 
> what region to listen on for a particular device.
>

I'll just go and read the patches more carefully before making any more 
stupid remarks about the code.


>>> vmcalls will certainly get faster but I doubt that the cost 
>>> difference between vmcall and pio will ever be greater than a few 
>>> hundred cycles.  The only performance sensitive operation here would 
>>> be the kick and I don't think a few hundred cycles in the kick path 
>>> is ever going to be that significant for overall performance.
>>>
>>>   
>>
>> Why do you think the different will be a few hundred cycles?
>
> The only difference in hardware between a PIO exit and a vmcall is 
> that you don't have write out an exit reason in the VMC[SB].  So the 
> performance difference between PIO/vmcall shouldn't be that great (and 
> if it were, the difference would probably be obvious today).  That's 
> different from, say, a PF exit because with a PF, you also have to 
> attempt to resolve it by walking the guest page table before 
> determining that you do in fact need to exit.
>

You have to look at the pio bitmaps with pio.  Point taken though.

>
>>> So why introduce the extra complexity?
>>>   
>>
>> Overall I think it reduces comlexity if we have in-kernel devices.  
>> Anyway we can add additional signalling methods later.
>
> In-kernel virtio backends add quite a lot of complexity.  Just the 
> mechanism to setup the device is complicated enough.  I suspect that 
> it'll be necessary down the road for performance but I certainly don't 
> think it's a simplification.

I didn't mean that in-kernel devices simplify things (they don't), but 
that using hypercalls is simpler for in-kernel than pio.


-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Splunk Inc.
Still grepping through log files to find problems?  Stop.
Now Search log events and configuration files using AJAX and a browser.
Download your FREE copy of Splunk now >> http://get.splunk.com/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]                             ` <4733A635.1080004-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-11-20  8:39                               ` Christian Borntraeger
       [not found]                                 ` <200711200939.19410.borntraeger-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Christian Borntraeger @ 2007-11-20  8:39 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f,
	dor.laor-atKUWr5tajBWk0Htik3J/w
  Cc: Avi Kivity

Am Freitag, 9. November 2007 schrieb Dor Laor:
> I believe that the network interface will quickly go to the kernel since 
> copy takes most of the
> cpu time and qemu does not support scatter gather dma at the moment.
> Nevertheless using pio seems good enough, Anthony's suggestion of 
> notifying the kernel using ioctls
> is logical. If we'll run into troubles further on we can add a hypercall 
> capability and if exist use hypercalls
> instead of pios.

Sorry for being late in this thread.
We (s390) will need a hypercall as we do not have port I/O. I think it should be
possible to default to hypercall on s390 and use pio everywhere else.

Christian

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]                                 ` <200711200939.19410.borntraeger-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
@ 2007-11-20 10:00                                   ` Avi Kivity
       [not found]                                     ` <4742B053.8080301-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Avi Kivity @ 2007-11-20 10:00 UTC (permalink / raw)
  To: Christian Borntraeger; +Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Christian Borntraeger wrote:
> Am Freitag, 9. November 2007 schrieb Dor Laor:
>   
>> I believe that the network interface will quickly go to the kernel since 
>> copy takes most of the
>> cpu time and qemu does not support scatter gather dma at the moment.
>> Nevertheless using pio seems good enough, Anthony's suggestion of 
>> notifying the kernel using ioctls
>> is logical. If we'll run into troubles further on we can add a hypercall 
>> capability and if exist use hypercalls
>> instead of pios.
>>     
>
> Sorry for being late in this thread.
> We (s390) will need a hypercall as we do not have port I/O. I think it should be
> possible to default to hypercall on s390 and use pio everywhere else.
>   

Or be generic: advertise the methods available according to host 
(kvm/x86, qemu/x86, kvm/s390) and let the guest pick.

-- 
error compiling committee.c: too many arguments to function


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]                                     ` <4742B053.8080301-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
@ 2007-11-20 10:17                                       ` Arnd Bergmann
       [not found]                                         ` <200711201117.17900.arnd-r2nGTMty4D4@public.gmane.org>
  0 siblings, 1 reply; 17+ messages in thread
From: Arnd Bergmann @ 2007-11-20 10:17 UTC (permalink / raw)
  To: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
  Cc: Christian Borntraeger, Avi Kivity

On Tuesday 20 November 2007, Avi Kivity wrote:
> 
> >
> > Sorry for being late in this thread.
> > We (s390) will need a hypercall as we do not have port I/O. I think it should be
> > possible to default to hypercall on s390 and use pio everywhere else.
> >   
> 
> Or be generic: advertise the methods available according to host 
> (kvm/x86, qemu/x86, kvm/s390) and let the guest pick.

Not sure if I'm following the reasoning here. Shouldn't the method be
inherent to the virtio bus driver?

When you use a PCI based virtio bus, the natural choice would be PIO
in some way, but you could also have a different virtio implementation
on PCI that uses hcalls. This choice is completely up to virtio-pci.

On s390, you have a different virtio backend altogether, so you always
use DIAG or hcall instead of whatever virtio-pci does.

The virtio-blk and other high-level drivers don't need to care about
what transport the bus uses in the first place.

	Arnd <><

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [RFC] virtio-blk PCI backend
       [not found]                                         ` <200711201117.17900.arnd-r2nGTMty4D4@public.gmane.org>
@ 2007-11-20 11:05                                           ` Carsten Otte
  0 siblings, 0 replies; 17+ messages in thread
From: Carsten Otte @ 2007-11-20 11:05 UTC (permalink / raw)
  To: Arnd Bergmann
  Cc: kvm-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f, Avi Kivity,
	borntrae-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8

Arnd Bergmann wrote:
> Not sure if I'm following the reasoning here. Shouldn't the method be
> inherent to the virtio bus driver?
> 
> When you use a PCI based virtio bus, the natural choice would be PIO
> in some way, but you could also have a different virtio implementation
> on PCI that uses hcalls. This choice is completely up to virtio-pci.
> 
> On s390, you have a different virtio backend altogether, so you always
> use DIAG or hcall instead of whatever virtio-pci does.
> 
> The virtio-blk and other high-level drivers don't need to care about
> what transport the bus uses in the first place.

If you look at HPA's virtio PCI bus in lguest, it uses PCI device 
organization but no other PCI features. That's where we're heading, 
because we don't want a different virtio backend.

-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2005.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2007-11-20 11:05 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-11-08  2:51 [RFC] virtio-blk PCI backend Anthony Liguori
     [not found] ` <11944902733951-git-send-email-aliguori-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2007-11-08  6:24   ` Avi Kivity
     [not found]     ` <4732ABA0.5090603-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-11-08 13:57       ` Anthony Liguori
     [not found]         ` <473315DB.9030803-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2007-11-08 14:02           ` Avi Kivity
     [not found]             ` <4733170B.70206-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-11-08 15:09               ` Anthony Liguori
     [not found]                 ` <473326B4.2080307-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2007-11-08 15:19                   ` Avi Kivity
     [not found]                     ` <473328EC.4090905-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-11-08 16:22                       ` Anthony Liguori
     [not found]                         ` <473337B9.8040503-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
2007-11-09  0:13                           ` Dor Laor
     [not found]                             ` <4733A635.1080004-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-11-20  8:39                               ` Christian Borntraeger
     [not found]                                 ` <200711200939.19410.borntraeger-tA70FqPdS9bQT0dZR+AlfA@public.gmane.org>
2007-11-20 10:00                                   ` Avi Kivity
     [not found]                                     ` <4742B053.8080301-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-11-20 10:17                                       ` Arnd Bergmann
     [not found]                                         ` <200711201117.17900.arnd-r2nGTMty4D4@public.gmane.org>
2007-11-20 11:05                                           ` Carsten Otte
2007-11-11  9:23                           ` Avi Kivity
2007-11-08 15:31                   ` Avi Kivity
     [not found]                     ` <47332BB7.2000900-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-11-08 19:02                       ` Anthony Liguori
2007-11-09  0:25   ` Dor Laor
     [not found]     ` <4733A917.5000303-atKUWr5tajBWk0Htik3J/w@public.gmane.org>
2007-11-09  1:38       ` Anthony Liguori

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox