* [PATCHv5 1/3] mm: export use_mm/unuse_mm to modules [not found] <cover.1251388414.git.mst@redhat.com> @ 2009-08-27 16:06 ` Michael S. Tsirkin 2009-08-28 15:31 ` Gregory Haskins 2009-08-27 16:07 ` [PATCHv5 2/3] mm: reduce atomic use on use_mm fast path Michael S. Tsirkin 2009-08-27 16:07 ` [PATCHv5 3/3] vhost_net: a kernel-level virtio server Michael S. Tsirkin 2 siblings, 1 reply; 83+ messages in thread From: Michael S. Tsirkin @ 2009-08-27 16:06 UTC (permalink / raw) To: netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm vhost net module wants to do copy to/from user from a kernel thread, which needs use_mm (like what fs/aio has). Move that into mm/ and export to modules. Acked-by: Andrew Morton <akpm@linux-foundation.org> Acked-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> --- fs/aio.c | 47 +---------------------------------- include/linux/mmu_context.h | 9 ++++++ mm/Makefile | 2 +- mm/mmu_context.c | 58 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 47 deletions(-) create mode 100644 include/linux/mmu_context.h create mode 100644 mm/mmu_context.c diff --git a/fs/aio.c b/fs/aio.c index d065b2c..fc21c23 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -24,6 +24,7 @@ #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> +#include <linux/mmu_context.h> #include <linux/slab.h> #include <linux/timer.h> #include <linux/aio.h> @@ -34,7 +35,6 @@ #include <asm/kmap_types.h> #include <asm/uaccess.h> -#include <asm/mmu_context.h> #if DEBUG > 1 #define dprintk printk @@ -595,51 +595,6 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) } /* - * use_mm - * Makes the calling kernel thread take on the specified - * mm context. - * Called by the retry thread execute retries within the - * iocb issuer's mm context, so that copy_from/to_user - * operations work seamlessly for aio. 
- * (Note: this routine is intended to be called only - * from a kernel thread context) - */ -static void use_mm(struct mm_struct *mm) -{ - struct mm_struct *active_mm; - struct task_struct *tsk = current; - - task_lock(tsk); - active_mm = tsk->active_mm; - atomic_inc(&mm->mm_count); - tsk->mm = mm; - tsk->active_mm = mm; - switch_mm(active_mm, mm, tsk); - task_unlock(tsk); - - mmdrop(active_mm); -} - -/* - * unuse_mm - * Reverses the effect of use_mm, i.e. releases the - * specified mm context which was earlier taken on - * by the calling kernel thread - * (Note: this routine is intended to be called only - * from a kernel thread context) - */ -static void unuse_mm(struct mm_struct *mm) -{ - struct task_struct *tsk = current; - - task_lock(tsk); - tsk->mm = NULL; - /* active_mm is still 'mm' */ - enter_lazy_tlb(mm, tsk); - task_unlock(tsk); -} - -/* * Queue up a kiocb to be retried. Assumes that the kiocb * has already been marked as kicked, and places it on * the retry run list for the corresponding ioctx, if it diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h new file mode 100644 index 0000000..70fffeb --- /dev/null +++ b/include/linux/mmu_context.h @@ -0,0 +1,9 @@ +#ifndef _LINUX_MMU_CONTEXT_H +#define _LINUX_MMU_CONTEXT_H + +struct mm_struct; + +void use_mm(struct mm_struct *mm); +void unuse_mm(struct mm_struct *mm); + +#endif diff --git a/mm/Makefile b/mm/Makefile index 5e0bd64..46c3892 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o mm_init.o $(mmu-y) + page_isolation.o mm_init.o mmu_context.o $(mmu-y) obj-y += init-mm.o obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o diff --git a/mm/mmu_context.c b/mm/mmu_context.c new file mode 100644 index 0000000..9989c2f --- /dev/null +++ b/mm/mmu_context.c 
@@ -0,0 +1,58 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * + * See ../COPYING for licensing terms. + */ + +#include <linux/mm.h> +#include <linux/mmu_context.h> +#include <linux/module.h> +#include <linux/sched.h> + +#include <asm/mmu_context.h> + +/* + * use_mm + * Makes the calling kernel thread take on the specified + * mm context. + * Called by the retry thread execute retries within the + * iocb issuer's mm context, so that copy_from/to_user + * operations work seamlessly for aio. + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void use_mm(struct mm_struct *mm) +{ + struct mm_struct *active_mm; + struct task_struct *tsk = current; + + task_lock(tsk); + active_mm = tsk->active_mm; + atomic_inc(&mm->mm_count); + tsk->mm = mm; + tsk->active_mm = mm; + switch_mm(active_mm, mm, tsk); + task_unlock(tsk); + + mmdrop(active_mm); +} +EXPORT_SYMBOL_GPL(use_mm); + +/* + * unuse_mm + * Reverses the effect of use_mm, i.e. releases the + * specified mm context which was earlier taken on + * by the calling kernel thread + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void unuse_mm(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + tsk->mm = NULL; + /* active_mm is still 'mm' */ + enter_lazy_tlb(mm, tsk); + task_unlock(tsk); +} +EXPORT_SYMBOL_GPL(unuse_mm); -- 1.6.2.5 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 83+ messages in thread
* Re: [PATCHv5 1/3] mm: export use_mm/unuse_mm to modules 2009-08-27 16:06 ` [PATCHv5 1/3] mm: export use_mm/unuse_mm to modules Michael S. Tsirkin @ 2009-08-28 15:31 ` Gregory Haskins 0 siblings, 0 replies; 83+ messages in thread From: Gregory Haskins @ 2009-08-28 15:31 UTC (permalink / raw) To: Michael S. Tsirkin, akpm Cc: netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, hpa, Rusty Russell, s.hetze [-- Attachment #1: Type: text/plain, Size: 634 bytes --] Michael S. Tsirkin wrote: > vhost net module wants to do copy to/from user from a kernel thread, > which needs use_mm (like what fs/aio has). Move that into mm/ and > export to modules. Michael, Andrew, I am just curious: Is there any technical reason why a kthread cannot have a long-term use_mm() in effect? (Assuming this makes sense for the design, of course). For the cases where we know the kthread will always service the same context (such as with venettap/vhost), it may make sense to do a use_mm() at init time and just leave it until the thread exits. Will this break anything? Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* [PATCHv5 2/3] mm: reduce atomic use on use_mm fast path [not found] <cover.1251388414.git.mst@redhat.com> 2009-08-27 16:06 ` [PATCHv5 1/3] mm: export use_mm/unuse_mm to modules Michael S. Tsirkin @ 2009-08-27 16:07 ` Michael S. Tsirkin 2009-08-27 16:07 ` [PATCHv5 3/3] vhost_net: a kernel-level virtio server Michael S. Tsirkin 2 siblings, 0 replies; 83+ messages in thread From: Michael S. Tsirkin @ 2009-08-27 16:07 UTC (permalink / raw) To: netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm When the mm being switched to matches the active mm, we don't need to increment and then drop the mm count. Making that conditional reduces contention on that cache line on SMP systems. Acked-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> --- mm/mmu_context.c | 9 ++++++--- 1 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/mmu_context.c b/mm/mmu_context.c index 9989c2f..0777654 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -27,13 +27,16 @@ void use_mm(struct mm_struct *mm) task_lock(tsk); active_mm = tsk->active_mm; - atomic_inc(&mm->mm_count); + if (active_mm != mm) { + atomic_inc(&mm->mm_count); + tsk->active_mm = mm; + } tsk->mm = mm; - tsk->active_mm = mm; switch_mm(active_mm, mm, tsk); task_unlock(tsk); - mmdrop(active_mm); + if (active_mm != mm) + mmdrop(active_mm); } EXPORT_SYMBOL_GPL(use_mm); -- 1.6.2.5 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 83+ messages in thread
* [PATCHv5 3/3] vhost_net: a kernel-level virtio server [not found] <cover.1251388414.git.mst@redhat.com> 2009-08-27 16:06 ` [PATCHv5 1/3] mm: export use_mm/unuse_mm to modules Michael S. Tsirkin 2009-08-27 16:07 ` [PATCHv5 2/3] mm: reduce atomic use on use_mm fast path Michael S. Tsirkin @ 2009-08-27 16:07 ` Michael S. Tsirkin 2009-09-03 18:39 ` Ira W. Snyder 2009-09-25 17:01 ` Ira W. Snyder 2 siblings, 2 replies; 83+ messages in thread From: Michael S. Tsirkin @ 2009-08-27 16:07 UTC (permalink / raw) To: netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm What it is: vhost net is a character device that can be used to reduce the number of system calls involved in virtio networking. Existing virtio net code is used in the guest without modification. There's similarity with vringfd, with some differences and reduced scope - uses eventfd for signalling - structures can be moved around in memory at any time (good for migration) - support memory table and not just an offset (needed for kvm) common virtio related code has been put in a separate file vhost.c and can be made into a separate module if/when more backends appear. I used Rusty's lguest.c as the source for developing this part : this supplied me with witty comments I wouldn't be able to write myself. What it is not: vhost net is not a bus, and not a generic new system call. No assumptions are made on how guest performs hypercalls. Userspace hypervisors are supported as well as kvm. How it works: Basically, we connect virtio frontend (configured by userspace) to a backend. The backend could be a network device, or a tun-like device. In this version I only support raw socket as a backend, which can be bound to e.g. SR IOV, or to macvlan device. Backend is also configured by userspace, including vlan/mac etc. Status: This works for me, and I haven't seen any crashes. 
I have done some light benchmarking (with v4), compared to userspace, I see improved latency (as I save up to 4 system calls per packet) but not bandwidth/CPU (as TSO and interrupt mitigation are not supported). For ping benchmark (where there's no TSO) throughput is also improved. Features that I plan to look at in the future: - tap support - TSO - interrupt mitigation - zero copy Acked-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Michael S. Tsirkin <mst@redhat.com> --- MAINTAINERS | 10 + arch/x86/kvm/Kconfig | 1 + drivers/Makefile | 1 + drivers/vhost/Kconfig | 11 + drivers/vhost/Makefile | 2 + drivers/vhost/net.c | 475 ++++++++++++++++++++++++++++++ drivers/vhost/vhost.c | 688 ++++++++++++++++++++++++++++++++++++++++++++ drivers/vhost/vhost.h | 122 ++++++++ include/linux/Kbuild | 1 + include/linux/miscdevice.h | 1 + include/linux/vhost.h | 101 +++++++ 11 files changed, 1413 insertions(+), 0 deletions(-) create mode 100644 drivers/vhost/Kconfig create mode 100644 drivers/vhost/Makefile create mode 100644 drivers/vhost/net.c create mode 100644 drivers/vhost/vhost.c create mode 100644 drivers/vhost/vhost.h create mode 100644 include/linux/vhost.h diff --git a/MAINTAINERS b/MAINTAINERS index b1114cf..de4587f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5431,6 +5431,16 @@ S: Maintained F: Documentation/filesystems/vfat.txt F: fs/fat/ +VIRTIO HOST (VHOST) +P: Michael S. Tsirkin +M: mst@redhat.com +L: kvm@vger.kernel.org +L: virtualization@lists.osdl.org +L: netdev@vger.kernel.org +S: Maintained +F: drivers/vhost/ +F: include/linux/vhost.h + VIA RHINE NETWORK DRIVER M: Roger Luethi <rl@hellgate.ch> S: Maintained diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b84e571..94f44d9 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -64,6 +64,7 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. 
+source drivers/vhost/Kconfig source drivers/lguest/Kconfig source drivers/virtio/Kconfig diff --git a/drivers/Makefile b/drivers/Makefile index bc4205d..1551ae1 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -105,6 +105,7 @@ obj-$(CONFIG_HID) += hid/ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ +obj-$(CONFIG_VHOST_NET) += vhost/ obj-$(CONFIG_VIRTIO) += virtio/ obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_STAGING) += staging/ diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig new file mode 100644 index 0000000..d955406 --- /dev/null +++ b/drivers/vhost/Kconfig @@ -0,0 +1,11 @@ +config VHOST_NET + tristate "Host kernel accelerator for virtio net" + depends on NET && EVENTFD + ---help--- + This kernel module can be loaded in host kernel to accelerate + guest networking with virtio_net. Not to be confused with virtio_net + module itself which needs to be loaded in guest kernel. + + To compile this driver as a module, choose M here: the module will + be called vhost_net. + diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile new file mode 100644 index 0000000..72dd020 --- /dev/null +++ b/drivers/vhost/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_VHOST_NET) += vhost_net.o +vhost_net-y := vhost.o net.o diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c new file mode 100644 index 0000000..2210eaa --- /dev/null +++ b/drivers/vhost/net.c @@ -0,0 +1,475 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * virtio-net server in host kernel. 
+ */ + +#include <linux/compat.h> +#include <linux/eventfd.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <linux/mmu_context.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/workqueue.h> +#include <linux/rcupdate.h> +#include <linux/file.h> + +#include <linux/net.h> +#include <linux/if_packet.h> +#include <linux/if_arp.h> + +#include <net/sock.h> + +#include "vhost.h" + +enum { + VHOST_NET_VQ_RX = 0, + VHOST_NET_VQ_TX = 1, + VHOST_NET_VQ_MAX = 2, +}; + +struct vhost_net { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; + /* We use a kind of RCU to access sock pointer. + * All readers access it from workqueue, which makes it possible to + * flush the workqueue instead of synchronize_rcu. Therefore readers do + * not need to call rcu_read_lock/rcu_read_unlock: the beginning of + * work item execution acts instead of rcu_read_lock() and the end of + * work item execution acts instead of rcu_read_unlock(). + * Writers use device mutex. */ + struct socket *sock; + struct vhost_poll poll[VHOST_NET_VQ_MAX]; +}; + +/* Pop first len bytes from iovec. Return number of segments used. */ +static int move_iovec_hdr(struct iovec *from, struct iovec *to, + size_t len, int iov_count) +{ + int seg = 0; + size_t size; + while (len && seg < iov_count) { + size = min(from->iov_len, len); + to->iov_base = from->iov_base; + to->iov_len = size; + from->iov_len -= size; + from->iov_base += size; + len -= size; + ++from; + ++to; + ++seg; + } + return seg; +} + +/* Expects to be always run from workqueue - which acts as + * read-side critical section for our kind of RCU. 
*/ +static void handle_tx(struct vhost_net *net) +{ + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; + unsigned head, out, in, s; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, + .msg_controllen = 0, + .msg_iov = vq->iov, + .msg_flags = MSG_DONTWAIT, + }; + size_t len; + int err; + struct socket *sock = rcu_dereference(net->sock); + if (!sock || !sock_writeable(sock->sk)) + return; + + use_mm(net->dev.mm); + mutex_lock(&vq->mutex); + for (;;) { + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, &out, &in); + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (head == vq->num) + break; + if (in) { + vq_err(vq, "Unexpected descriptor format for TX: " + "out %d, int %d\n", out, in); + break; + } + /* Skip header. TODO: support TSO. */ + s = move_iovec_hdr(vq->iov, vq->hdr, + sizeof(struct virtio_net_hdr), out); + msg.msg_iovlen = out; + len = iov_length(vq->iov, out); + /* Sanity check */ + if (!len) { + vq_err(vq, "Unexpected header len for TX: " + "%ld expected %zd\n", + iov_length(vq->hdr, s), + sizeof(struct virtio_net_hdr)); + break; + } + /* TODO: Check specific error and bomb out unless ENOBUFS? */ + err = sock->ops->sendmsg(NULL, sock, &msg, len); + if (err < 0) { + vhost_discard_vq_desc(vq); + break; + } + if (err != len) + pr_err("Truncated TX packet: " + " len %d != %zd\n", err, len); + vhost_add_used_and_trigger(&net->dev, vq, head, 0); + } + + mutex_unlock(&vq->mutex); + unuse_mm(net->dev.mm); +} + +/* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. */ +static void handle_rx(struct vhost_net *net) +{ + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; + unsigned head, out, in, s; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, /* FIXME: get and handle RX aux data. 
*/ + .msg_controllen = 0, + .msg_iov = vq->iov, + .msg_flags = MSG_DONTWAIT, + }; + + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + + size_t len; + int err; + struct socket *sock = rcu_dereference(net->sock); + if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) + return; + + use_mm(net->dev.mm); + mutex_lock(&vq->mutex); + vhost_no_notify(vq); + + for (;;) { + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, &out, &in); + /* OK, now we need to know about added descriptors. */ + if (head == vq->num && vhost_notify(vq)) + /* They could have slipped one in as we were doing that: + * check again. */ + continue; + /* Nothing new? Wait for eventfd to tell us they refilled. */ + if (head == vq->num) + break; + /* We don't need to be notified again. */ + vhost_no_notify(vq); + if (out) { + vq_err(vq, "Unexpected descriptor format for RX: " + "out %d, int %d\n", + out, in); + break; + } + /* Skip header. TODO: support TSO/mergeable rx buffers. */ + s = move_iovec_hdr(vq->iov, vq->hdr, sizeof hdr, in); + msg.msg_iovlen = in; + len = iov_length(vq->iov, in); + /* Sanity check */ + if (!len) { + vq_err(vq, "Unexpected header len for RX: " + "%zd expected %zd\n", + iov_length(vq->hdr, s), sizeof hdr); + break; + } + err = sock->ops->recvmsg(NULL, sock, &msg, + len, MSG_DONTWAIT | MSG_TRUNC); + /* TODO: Check specific error and bomb out unless EAGAIN? */ + if (err < 0) { + vhost_discard_vq_desc(vq); + break; + } + /* TODO: Should check and handle checksum. 
*/ + if (err > len) { + pr_err("Discarded truncated rx packet: " + " len %d > %zd\n", err, len); + vhost_discard_vq_desc(vq); + continue; + } + len = err; + err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, sizeof hdr); + if (err) { + vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n", + vq->iov->iov_base, err); + break; + } + vhost_add_used_and_trigger(&net->dev, vq, head, + len + sizeof hdr); + } + + mutex_unlock(&vq->mutex); + unuse_mm(net->dev.mm); +} + +static void handle_tx_kick(struct work_struct *work) +{ + struct vhost_virtqueue *vq; + struct vhost_net *net; + vq = container_of(work, struct vhost_virtqueue, poll.work); + net = container_of(vq->dev, struct vhost_net, dev); + handle_tx(net); +} + +static void handle_rx_kick(struct work_struct *work) +{ + struct vhost_virtqueue *vq; + struct vhost_net *net; + vq = container_of(work, struct vhost_virtqueue, poll.work); + net = container_of(vq->dev, struct vhost_net, dev); + handle_rx(net); +} + +static void handle_tx_net(struct work_struct *work) +{ + struct vhost_net *net; + net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); + handle_tx(net); +} + +static void handle_rx_net(struct work_struct *work) +{ + struct vhost_net *net; + net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); + handle_rx(net); +} + +static int vhost_net_open(struct inode *inode, struct file *f) +{ + struct vhost_net *n = kzalloc(sizeof *n, GFP_KERNEL); + int r; + if (!n) + return -ENOMEM; + f->private_data = n; + n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; + n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; + r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX); + if (r < 0) { + kfree(n); + return r; + } + + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); + return 0; +} + +static struct socket *vhost_net_stop(struct vhost_net *n) +{ + struct socket *sock = n->sock; + 
rcu_assign_pointer(n->sock, NULL); + if (sock) { + vhost_poll_flush(n->poll + VHOST_NET_VQ_TX); + vhost_poll_flush(n->poll + VHOST_NET_VQ_RX); + } + return sock; +} + +static int vhost_net_release(struct inode *inode, struct file *f) +{ + struct vhost_net *n = f->private_data; + struct socket *sock; + + sock = vhost_net_stop(n); + vhost_dev_cleanup(&n->dev); + if (sock) + fput(sock->file); + kfree(n); + return 0; +} + +static void vhost_net_flush(struct vhost_net *n) +{ + vhost_poll_flush(n->poll + VHOST_NET_VQ_TX); + vhost_poll_flush(n->poll + VHOST_NET_VQ_RX); + vhost_poll_flush(&n->dev.vqs[VHOST_NET_VQ_TX].poll); + vhost_poll_flush(&n->dev.vqs[VHOST_NET_VQ_RX].poll); +} + +static long vhost_net_set_socket(struct vhost_net *n, int fd) +{ + struct { + struct sockaddr_ll sa; + char buf[MAX_ADDR_LEN]; + } uaddr; + struct socket *sock, *oldsock = NULL; + int uaddr_len = sizeof uaddr, r; + + mutex_lock(&n->dev.mutex); + r = vhost_dev_check_owner(&n->dev); + if (r) + goto done; + + if (fd == -1) { + /* Disconnect from socket and device. 
*/ + oldsock = vhost_net_stop(n); + goto done; + } + + sock = sockfd_lookup(fd, &r); + if (!sock) { + r = -ENOTSOCK; + goto done; + } + + /* Parameter checking */ + if (sock->sk->sk_type != SOCK_RAW) { + r = -ESOCKTNOSUPPORT; + goto done; + } + + r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, + &uaddr_len, 0); + if (r) + goto done; + + if (uaddr.sa.sll_family != AF_PACKET) { + r = -EPFNOSUPPORT; + goto done; + } + + /* start polling new socket */ + if (sock == oldsock) + goto done; + + if (oldsock) { + vhost_poll_stop(n->poll + VHOST_NET_VQ_TX); + vhost_poll_stop(n->poll + VHOST_NET_VQ_RX); + } + oldsock = n->sock; + rcu_assign_pointer(n->sock, sock); + vhost_poll_start(n->poll + VHOST_NET_VQ_TX, sock->file); + vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file); +done: + mutex_unlock(&n->dev.mutex); + if (oldsock) { + vhost_net_flush(n); + fput(oldsock->file); + } + return r; +} + +static long vhost_net_reset_owner(struct vhost_net *n) +{ + struct socket *sock = NULL; + long r; + mutex_lock(&n->dev.mutex); + r = vhost_dev_check_owner(&n->dev); + if (r) + goto done; + sock = vhost_net_stop(n); + r = vhost_dev_reset_owner(&n->dev); +done: + mutex_unlock(&n->dev.mutex); + if (sock) + fput(sock->file); + return r; +} + +static void vhost_net_set_features(struct vhost_net *n, u64 features) +{ + mutex_unlock(&n->dev.mutex); + n->dev.acked_features = features; + mutex_unlock(&n->dev.mutex); + vhost_net_flush(n); +} + +static long vhost_net_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_net *n = f->private_data; + void __user *argp = (void __user *)arg; + u32 __user *featurep = argp; + int __user *fdp = argp; + u64 features; + int fd, r; + switch (ioctl) { + case VHOST_NET_SET_SOCKET: + r = get_user(fd, fdp); + if (r < 0) + return r; + return vhost_net_set_socket(n, fd); + case VHOST_GET_FEATURES: + features = VHOST_FEATURES; + return put_user(features, featurep); + case VHOST_ACK_FEATURES: + r = get_user(features, 
featurep); + /* No features for now */ + if (r < 0) + return r; + if (features & ~VHOST_FEATURES) + return -EOPNOTSUPP; + vhost_net_set_features(n, features); + return 0; + case VHOST_RESET_OWNER: + return vhost_net_reset_owner(n); + default: + return vhost_dev_ioctl(&n->dev, ioctl, arg); + } +} + +#ifdef CONFIG_COMPAT +static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); +} +#endif + +const static struct file_operations vhost_net_fops = { + .owner = THIS_MODULE, + .release = vhost_net_release, + .unlocked_ioctl = vhost_net_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vhost_net_compat_ioctl, +#endif + .open = vhost_net_open, +}; + +static struct miscdevice vhost_net_misc = { + VHOST_NET_MINOR, + "vhost-net", + &vhost_net_fops, +}; + +int vhost_net_init(void) +{ + int r = vhost_init(); + if (r) + goto err_init; + r = misc_register(&vhost_net_misc); + if (r) + goto err_reg; + return 0; +err_reg: + vhost_cleanup(); +err_init: + return r; + +} +module_init(vhost_net_init); + +void vhost_net_exit(void) +{ + misc_deregister(&vhost_net_misc); + vhost_cleanup(); +} +module_exit(vhost_net_exit); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Michael S. Tsirkin"); +MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c new file mode 100644 index 0000000..6925cc1 --- /dev/null +++ b/drivers/vhost/vhost.c @@ -0,0 +1,688 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2006 Rusty Russell IBM Corporation + * + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * Inspiration, some code, and most witty comments come from + * Documentation/lguest/lguest.c, by Rusty Russell + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Generic code for virtio server in host kernel. 
+ */ + +#include <linux/eventfd.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <linux/mm.h> +#include <linux/miscdevice.h> +#include <linux/mutex.h> +#include <linux/workqueue.h> +#include <linux/rcupdate.h> +#include <linux/poll.h> +#include <linux/file.h> + +#include <linux/net.h> +#include <linux/if_packet.h> +#include <linux/if_arp.h> + +#include <net/sock.h> + +#include "vhost.h" + +enum { + VHOST_MEMORY_MAX_NREGIONS = 64, +}; + +static struct workqueue_struct *vhost_workqueue; + +static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct vhost_poll *poll; + poll = container_of(pt, struct vhost_poll, table); + + poll->wqh = wqh; + add_wait_queue(wqh, &poll->wait); +} + +static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + struct vhost_poll *poll; + poll = container_of(wait, struct vhost_poll, wait); + if (!((unsigned long)key & poll->mask)) + return 0; + + queue_work(vhost_workqueue, &poll->work); + return 0; +} + +/* Init poll structure */ +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, + unsigned long mask) +{ + INIT_WORK(&poll->work, func); + init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); + init_poll_funcptr(&poll->table, vhost_poll_func); + poll->mask = mask; +} + +/* Start polling a file. We add ourselves to file's wait queue. The caller must + * keep a reference to a file until after vhost_poll_stop is called. */ +void vhost_poll_start(struct vhost_poll *poll, struct file *file) +{ + unsigned long mask; + mask = file->f_op->poll(file, &poll->table); + if (mask) + vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask); +} + +/* Stop polling a file. After this function returns, it becomes safe to drop the + * file reference. You must also flush afterwards. */ +void vhost_poll_stop(struct vhost_poll *poll) +{ + remove_wait_queue(poll->wqh, &poll->wait); +} + +/* Flush any work that has been scheduled. 
When calling this, don't hold any + * locks that are also used by the callback. */ +void vhost_poll_flush(struct vhost_poll *poll) +{ + flush_work(&poll->work); +} + +long vhost_dev_init(struct vhost_dev *dev, + struct vhost_virtqueue *vqs, int nvqs) +{ + int i; + dev->vqs = vqs; + dev->nvqs = nvqs; + mutex_init(&dev->mutex); + + for (i = 0; i < dev->nvqs; ++i) { + dev->vqs[i].dev = dev; + mutex_init(&dev->vqs[i].mutex); + if (dev->vqs[i].handle_kick) + vhost_poll_init(&dev->vqs[i].poll, + dev->vqs[i].handle_kick, + POLLIN); + } + return 0; +} + +/* Caller should have device mutex */ +long vhost_dev_check_owner(struct vhost_dev *dev) +{ + /* Are you the owner? If not, I don't think you mean to do that */ + return dev->mm == current->mm ? 0 : -EPERM; +} + +/* Caller should have device mutex */ +static long vhost_dev_set_owner(struct vhost_dev *dev) +{ + /* Is there an owner already? */ + if (dev->mm) + return -EBUSY; + /* No owner, become one */ + dev->mm = get_task_mm(current); + return 0; +} + +/* Caller should have device mutex */ +long vhost_dev_reset_owner(struct vhost_dev *dev) +{ + struct vhost_memory *memory; + + /* Restore memory to default 1:1 mapping. 
*/ + memory = kmalloc(offsetof(struct vhost_memory, regions) + + 2 * sizeof *memory->regions, GFP_KERNEL); + if (!memory) + return -ENOMEM; + + vhost_dev_cleanup(dev); + + memory->nregions = 2; + memory->regions[0].guest_phys_addr = 1; + memory->regions[0].userspace_addr = 1; + memory->regions[0].memory_size = ~0ULL; + memory->regions[1].guest_phys_addr = 0; + memory->regions[1].userspace_addr = 0; + memory->regions[1].memory_size = 1; + dev->memory = memory; + return 0; +} + +/* Caller should have device mutex */ +void vhost_dev_cleanup(struct vhost_dev *dev) +{ + int i; + for (i = 0; i < dev->nvqs; ++i) { + if (dev->vqs[i].kick && dev->vqs[i].handle_kick) { + vhost_poll_stop(&dev->vqs[i].poll); + vhost_poll_flush(&dev->vqs[i].poll); + } + if (dev->vqs[i].error_ctx) + eventfd_ctx_put(dev->vqs[i].error_ctx); + if (dev->vqs[i].error) + fput(dev->vqs[i].error); + if (dev->vqs[i].kick) + fput(dev->vqs[i].kick); + if (dev->vqs[i].call_ctx) + eventfd_ctx_put(dev->vqs[i].call_ctx); + if (dev->vqs[i].call) + fput(dev->vqs[i].call); + dev->vqs[i].error_ctx = NULL; + dev->vqs[i].error = NULL; + dev->vqs[i].kick = NULL; + dev->vqs[i].call_ctx = NULL; + dev->vqs[i].call = NULL; + } + /* No one will access memory at this point */ + kfree(dev->memory); + dev->memory = NULL; + if (dev->mm) + mmput(dev->mm); + dev->mm = NULL; +} + +static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) +{ + struct vhost_memory mem, *newmem, *oldmem; + unsigned long size = offsetof(struct vhost_memory, regions); + long r; + r = copy_from_user(&mem, m, size); + if (r) + return r; + if (mem.padding) + return -EOPNOTSUPP; + if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS) + return -E2BIG; + newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL); + if (!newmem) + return -ENOMEM; + + memcpy(newmem, &mem, size); + r = copy_from_user(newmem->regions, m->regions, + mem.nregions * sizeof *m->regions); + if (r) { + kfree(newmem); + return r; + } + oldmem = d->memory; + 
rcu_assign_pointer(d->memory, newmem); + synchronize_rcu(); + kfree(oldmem); + return 0; +} + +static int init_used(struct vhost_virtqueue *vq) +{ + int r = put_user(vq->used_flags, &vq->used->flags); + if (r) + return r; + return get_user(vq->last_used_idx, &vq->used->idx); +} + +static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) +{ + struct file *eventfp, *filep = NULL, + *pollstart = NULL, *pollstop = NULL; + struct eventfd_ctx *ctx = NULL; + u32 __user *idxp = argp; + struct vhost_virtqueue *vq; + struct vhost_vring_state s; + struct vhost_vring_file f; + struct vhost_vring_addr a; + u32 idx; + long r; + + r = get_user(idx, idxp); + if (r < 0) + return r; + if (idx > d->nvqs) + return -ENOBUFS; + + vq = d->vqs + idx; + + mutex_lock(&vq->mutex); + + switch (ioctl) { + case VHOST_SET_VRING_NUM: + r = copy_from_user(&s, argp, sizeof s); + if (r < 0) + break; + if (s.num > 0xffff) { + r = -EINVAL; + break; + } + vq->num = s.num; + break; + case VHOST_SET_VRING_BASE: + r = copy_from_user(&s, argp, sizeof s); + if (r < 0) + break; + if (s.num > 0xffff) { + r = -EINVAL; + break; + } + vq->avail_idx = vq->last_avail_idx = s.num; + break; + case VHOST_GET_VRING_BASE: + s.index = idx; + s.num = vq->last_avail_idx; + r = copy_to_user(argp, &s, sizeof s); + break; + case VHOST_SET_VRING_DESC: + r = copy_from_user(&a, argp, sizeof a); + if (r < 0) + break; + if (a.padding) { + r = -EOPNOTSUPP; + break; + } + if ((u64)(long)a.user_addr != a.user_addr) { + r = -EFAULT; + break; + } + vq->desc = (void __user *)(long)a.user_addr; + break; + case VHOST_SET_VRING_AVAIL: + r = copy_from_user(&a, argp, sizeof a); + if (r < 0) + break; + if (a.padding) { + r = -EOPNOTSUPP; + break; + } + if ((u64)(long)a.user_addr != a.user_addr) { + r = -EFAULT; + break; + } + vq->avail = (void __user *)(long)a.user_addr; + /* Forget the cached index value. 
*/ + vq->avail_idx = vq->last_avail_idx; + break; + case VHOST_SET_VRING_USED: + r = copy_from_user(&a, argp, sizeof a); + if (r < 0) + break; + if (a.padding) { + r = -EOPNOTSUPP; + break; + } + if ((u64)(long)a.user_addr != a.user_addr) { + r = -EFAULT; + break; + } + vq->used = (void __user *)(long)a.user_addr; + r = init_used(vq); + if (r) + break; + break; + case VHOST_SET_VRING_KICK: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->kick) { + pollstop = filep = vq->kick; + pollstart = vq->kick = eventfp; + } else + filep = eventfp; + break; + case VHOST_SET_VRING_CALL: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->call) { + filep = vq->call; + ctx = vq->call_ctx; + vq->call = eventfp; + vq->call_ctx = eventfp ? + eventfd_ctx_fileget(eventfp) : NULL; + } else + filep = eventfp; + break; + case VHOST_SET_VRING_ERR: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->error) { + filep = vq->error; + vq->error = eventfp; + ctx = vq->error_ctx; + vq->error_ctx = eventfp ? 
+ eventfd_ctx_fileget(eventfp) : NULL; + } else + filep = eventfp; + break; + default: + r = -ENOIOCTLCMD; + } + + if (pollstop && vq->handle_kick) + vhost_poll_stop(&vq->poll); + + if (ctx) + eventfd_ctx_put(ctx); + if (filep) + fput(filep); + + if (pollstart && vq->handle_kick) + vhost_poll_start(&vq->poll, vq->kick); + + mutex_unlock(&vq->mutex); + + if (pollstop && vq->handle_kick) + vhost_poll_flush(&vq->poll); + return 0; +} + +long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + long r; + + mutex_lock(&d->mutex); + /* If you are not the owner, you can become one */ + if (ioctl == VHOST_SET_OWNER) { + r = vhost_dev_set_owner(d); + goto done; + } + + /* You must be the owner to do anything else */ + r = vhost_dev_check_owner(d); + if (r) + goto done; + + switch (ioctl) { + case VHOST_SET_MEM_TABLE: + r = vhost_set_memory(d, argp); + break; + default: + r = vhost_set_vring(d, ioctl, argp); + break; + } +done: + mutex_unlock(&d->mutex); + return r; +} + +static const struct vhost_memory_region *find_region(struct vhost_memory *mem, + __u64 addr, __u32 len) +{ + struct vhost_memory_region *reg; + int i; + /* linear search is not brilliant, but we really have on the order of 6 + * regions in practice */ + for (i = 0; i < mem->nregions; ++i) { + reg = mem->regions + i; + if (reg->guest_phys_addr <= addr && + reg->guest_phys_addr + reg->memory_size - 1 >= addr) + return reg; + } + return NULL; +} + +int translate_desc(struct vhost_dev *dev, u64 addr, u32 len, + struct iovec iov[], int iov_size) +{ + const struct vhost_memory_region *reg; + struct vhost_memory *mem; + struct iovec *_iov; + u64 s = 0; + int ret = 0; + + rcu_read_lock(); + + mem = rcu_dereference(dev->memory); + while ((u64)len > s) { + u64 size; + if (ret >= iov_size) { + ret = -ENOBUFS; + break; + } + reg = find_region(mem, addr, len); + if (!reg) { + ret = -EFAULT; + break; + } + _iov = iov + ret; + size = reg->memory_size - 
addr + reg->guest_phys_addr; + _iov->iov_len = min((u64)len, size); + _iov->iov_base = (void *) + (reg->userspace_addr + addr - reg->guest_phys_addr); + s += size; + addr += size; + ++ret; + } + + rcu_read_unlock(); + return ret; +} + +/* Each buffer in the virtqueues is actually a chain of descriptors. This + * function returns the next descriptor in the chain, or vq->vring.num if we're + * at the end. */ +static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc) +{ + unsigned int next; + + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc->flags & VRING_DESC_F_NEXT)) + return vq->num; + + /* Check they're not leading us off end of descriptors. */ + next = desc->next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + /* We will use the result as an index in an array, so most + * architectures only need a compiler barrier here. */ + read_barrier_depends(); + + if (next >= vq->num) { + vq_err(vq, "Desc next is %u > %u", next, vq->num); + return vq->num; + } + + return next; +} + +/* This looks in the virtqueue and for the first available buffer, and converts + * it to an iovec for convenient access. Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function returns the descriptor number found, or vq->num (which + * is never a valid descriptor number) if none was found. */ +unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num) +{ + struct vring_desc desc; + unsigned int i, head; + u16 last_avail_idx; + int ret; + + /* Check it isn't doing very strange things with descriptor numbers. 
*/ + last_avail_idx = vq->last_avail_idx; + if (get_user(vq->avail_idx, &vq->avail->idx)) { + vq_err(vq, "Failed to access avail idx at %p\n", + &vq->avail->idx); + return vq->num; + } + + if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) { + vq_err(vq, "Guest moved used index from %u to %u", + last_avail_idx, vq->avail_idx); + return vq->num; + } + + /* If there's nothing new since last we looked, return invalid. */ + if (vq->avail_idx == last_avail_idx) + return vq->num; + + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) { + vq_err(vq, "Failed to read head: idx %d address %p\n", + last_avail_idx, + &vq->avail->ring[last_avail_idx % vq->num]); + return vq->num; + } + + /* If their number is silly, that's an error. */ + if (head >= vq->num) { + vq_err(vq, "Guest says index %u > %u is available", + head, vq->num); + return vq->num; + } + + vq->last_avail_idx++; + + /* When we start there are none of either input nor output. */ + *out_num = *in_num = 0; + + i = head; + do { + unsigned iov_count = *in_num + *out_num; + if (copy_from_user(&desc, vq->desc + i, sizeof desc)) { + vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", + i, vq->desc + i); + return vq->num; + } + ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count, + VHOST_NET_MAX_SG - iov_count); + if (ret < 0) { + vq_err(vq, "Translation failure %d descriptor idx %d\n", + ret, i); + return vq->num; + } + /* If this is an input descriptor, increment that count. */ + if (desc.flags & VRING_DESC_F_WRITE) + *in_num += ret; + else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (*in_num) { + vq_err(vq, "Descriptor has out after in: " + "idx %d\n", i); + return vq->num; + } + *out_num += ret; + } + } while ((i = next_desc(vq, &desc)) != vq->num); + return head; +} + +/* Reverse the effect of vhost_get_vq_desc. 
Useful for error handling. */ +void vhost_discard_vq_desc(struct vhost_virtqueue *vq) +{ + vq->last_avail_idx--; /* undo the last_avail_idx++ done by vhost_get_vq_desc */ +} + +/* After we've used one of their buffers, we tell them about it. We'll then + * want to send them an interrupt, using vq->call. + * + * Writes head and len into the used ring slot at last_used_idx % num, then + * stores last_used_idx + 1 to the used index. Returns 0 on success or + * -EFAULT if any of the three user-memory writes fails; last_used_idx is + * only advanced on success. */ +int vhost_add_used(struct vhost_virtqueue *vq, + unsigned int head, int len) +{ + struct vring_used_elem *used; + + /* The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. */ + used = &vq->used->ring[vq->last_used_idx % vq->num]; + if (put_user(head, &used->id)) { + vq_err(vq, "Failed to write used id"); + return -EFAULT; + } + if (put_user(len, &used->len)) { + vq_err(vq, "Failed to write used len"); + return -EFAULT; + } + /* Make sure buffer is written before we update index. */ + wmb(); + if (put_user(vq->last_used_idx + 1, &vq->used->idx)) { + vq_err(vq, "Failed to increment used idx"); + return -EFAULT; + } + vq->last_used_idx++; + return 0; +} + +/* This actually sends the interrupt for this virtqueue: it reads the avail + * ring flags from user memory and, unless the guest suppressed interrupts, + * signals vq->call_ctx. Nothing is signalled if the flags cannot be read + * or no call eventfd context is set. */ +void vhost_trigger_irq(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ + __u16 flags = 0; + if (get_user(flags, &vq->avail->flags)) { + vq_err(vq, "Failed to get flags"); + return; + } + + /* If they don't want an interrupt, don't send one, unless empty. + * "Empty" only overrides the suppression when VIRTIO_F_NOTIFY_ON_EMPTY + * was acked and avail_idx equals last_avail_idx. */ + if ((flags & VRING_AVAIL_F_NO_INTERRUPT) && + (!vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) || + vq->avail_idx != vq->last_avail_idx)) + return; + + /* Send the Guest an interrupt tell them we used something up. */ + if (vq->call_ctx) + eventfd_signal(vq->call_ctx, 1); +} + +/* And here's the combo meal deal. Supersize me! + * Adds the used entry, then triggers the interrupt. The result of + * vhost_add_used() is not checked, so the interrupt is attempted even if + * the used ring update failed. */ +void vhost_add_used_and_trigger(struct vhost_dev *dev, + struct vhost_virtqueue *vq, + unsigned int head, int len) +{ + vhost_add_used(vq, head, len); + vhost_trigger_irq(dev, vq); +} + +/* OK, now we need to know about added descriptors.
*/ +bool vhost_notify(struct vhost_virtqueue *vq) +{ + int r; + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) + return false; + vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; + r = put_user(vq->used_flags, &vq->used->flags); + if (r) + vq_err(vq, "Failed to disable notification: %d\n", r); + /* They could have slipped one in as we were doing that: make + * sure it's written, tell caller it needs to check again. */ + mb(); + return true; +} + +/* We don't need to be notified again. */ +void vhost_no_notify(struct vhost_virtqueue *vq) +{ + int r; + if (vq->used_flags & VRING_USED_F_NO_NOTIFY) + return; + vq->used_flags |= VRING_USED_F_NO_NOTIFY; + r = put_user(vq->used_flags, &vq->used->flags); + if (r) + vq_err(vq, "Failed to enable notification: %d\n", r); +} + +int vhost_init(void) +{ + vhost_workqueue = create_workqueue("vhost"); + if (!vhost_workqueue) + return -ENOMEM; + return 0; +} + +void vhost_cleanup(void) +{ + destroy_workqueue(vhost_workqueue); +} diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h new file mode 100644 index 0000000..8e13d06 --- /dev/null +++ b/drivers/vhost/vhost.h @@ -0,0 +1,122 @@ +#ifndef _VHOST_H +#define _VHOST_H + +#include <linux/eventfd.h> +#include <linux/vhost.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/workqueue.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/skbuff.h> +#include <linux/uio.h> +#include <linux/virtio_config.h> + +struct vhost_device; + +enum { + VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2, +}; + +/* Poll a file (eventfd or socket) */ +/* Note: there's nothing vhost specific about this structure. */ +struct vhost_poll { + poll_table table; + wait_queue_head_t *wqh; + wait_queue_t wait; + /* struct which will handle all actual work. 
*/ + struct work_struct work; + unsigned long mask; +}; + +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, + unsigned long mask); +void vhost_poll_start(struct vhost_poll *poll, struct file *file); +void vhost_poll_stop(struct vhost_poll *poll); +void vhost_poll_flush(struct vhost_poll *poll); + +/* The virtqueue structure describes a queue attached to a device. */ +struct vhost_virtqueue { + struct vhost_dev *dev; + + /* The actual ring of buffers. */ + struct mutex mutex; + unsigned int num; + struct vring_desc __user *desc; + struct vring_avail __user *avail; + struct vring_used __user *used; + struct file *kick; + struct file *call; + struct file *error; + struct eventfd_ctx *call_ctx; + struct eventfd_ctx *error_ctx; + + struct vhost_poll poll; + + /* The routine to call when the Guest pings us, or timeout. */ + work_func_t handle_kick; + + /* Last available index we saw. */ + u16 last_avail_idx; + + /* Caches available index value from user. */ + u16 avail_idx; + + /* Last index we used. */ + u16 last_used_idx; + + /* Used flags */ + u16 used_flags; + + struct iovec iov[VHOST_NET_MAX_SG]; + struct iovec hdr[VHOST_NET_MAX_SG]; +}; + +struct vhost_dev { + /* Readers use RCU to access memory table pointer. 
+ * Writers use mutex below.*/ + struct vhost_memory *memory; + struct mm_struct *mm; + struct vhost_virtqueue *vqs; + int nvqs; + struct mutex mutex; + unsigned acked_features; +}; + +long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); +long vhost_dev_check_owner(struct vhost_dev *); +long vhost_dev_reset_owner(struct vhost_dev *); +void vhost_dev_cleanup(struct vhost_dev *); +long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg); + +unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num); +void vhost_discard_vq_desc(struct vhost_virtqueue *); + +int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); +void vhost_trigger_irq(struct vhost_dev *, struct vhost_virtqueue *); +void vhost_add_used_and_trigger(struct vhost_dev *, struct vhost_virtqueue *, + unsigned int head, int len); +void vhost_no_notify(struct vhost_virtqueue *); +bool vhost_notify(struct vhost_virtqueue *); + +int vhost_init(void); +void vhost_cleanup(void); + +#define vq_err(vq, fmt, ...) 
do { \ + pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ + if ((vq)->error_ctx) \ + eventfd_signal((vq)->error_ctx, 1);\ + } while (0) + +enum { + VHOST_FEATURES = 1 << VIRTIO_F_NOTIFY_ON_EMPTY, +}; + +static inline int vhost_has_feature(struct vhost_dev *dev, int bit) +{ + return dev->acked_features & (1 << bit); +} + +#endif diff --git a/include/linux/Kbuild b/include/linux/Kbuild index dec2f18..975df9a 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -360,6 +360,7 @@ unifdef-y += uio.h unifdef-y += unistd.h unifdef-y += usbdevice_fs.h unifdef-y += utsname.h +unifdef-y += vhost.h unifdef-y += videodev2.h unifdef-y += videodev.h unifdef-y += virtio_config.h diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 0521177..781a8bb 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -30,6 +30,7 @@ #define HPET_MINOR 228 #define FUSE_MINOR 229 #define KVM_MINOR 232 +#define VHOST_NET_MINOR 233 #define MISC_DYNAMIC_MINOR 255 struct device; diff --git a/include/linux/vhost.h b/include/linux/vhost.h new file mode 100644 index 0000000..3f441a9 --- /dev/null +++ b/include/linux/vhost.h @@ -0,0 +1,101 @@ +#ifndef _LINUX_VHOST_H +#define _LINUX_VHOST_H +/* Userspace interface for in-kernel virtio accelerators. */ + +/* vhost is used to reduce the number of system calls involved in virtio. + * + * Existing virtio net code is used in the guest without modification. + * + * This header includes interface used by userspace hypervisor for + * device configuration. 
+ */ + +#include <linux/types.h> +#include <linux/compiler.h> +#include <linux/ioctl.h> +#include <linux/virtio_config.h> +#include <linux/virtio_ring.h> + +struct vhost_vring_state { + unsigned int index; /* which virtqueue the ioctl applies to */ + unsigned int num; +}; + +struct vhost_vring_file { + unsigned int index; /* which virtqueue the ioctl applies to */ + int fd; /* eventfd descriptor, or -1 to unset */ +}; + +struct vhost_vring_addr { + unsigned int index; /* which virtqueue the ioctl applies to */ + unsigned int padding; /* must be 0: non-zero is rejected with EOPNOTSUPP */ + __u64 user_addr; +}; + +struct vhost_memory_region { + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; + __u64 padding; /* read/write protection? */ +}; + +struct vhost_memory { + __u32 nregions; + __u32 padding; /* must be 0: non-zero is rejected with EOPNOTSUPP */ + struct vhost_memory_region regions[0]; /* nregions entries follow the header */ +}; + +/* ioctls */ + +#define VHOST_VIRTIO 0xAF + +/* Features bitmask for forward compatibility. Transport bits are used for + * vhost specific features. */ +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) +#define VHOST_ACK_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) + +/* Set current process as the (exclusive) owner of this file descriptor. This + * must be called before any other vhost command. Further calls to + * VHOST_SET_OWNER fail until VHOST_RESET_OWNER is called. */ +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +/* Give up ownership, and reset the device to default values. + * Allows subsequent call to VHOST_SET_OWNER to succeed. */ +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) + +/* Set up/modify memory layout */ +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) + +/* Ring setup. These parameters can not be modified while ring is running + * (bound to a device).
*/ +/* Set number of descriptors in ring */ +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +/* Start of array of descriptors (virtually contiguous) */ +#define VHOST_SET_VRING_DESC _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +/* Used structure address */ +#define VHOST_SET_VRING_USED _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_addr) +/* Available structure address */ +#define VHOST_SET_VRING_AVAIL _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_addr) +/* Base value where queue looks for available descriptors */ +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) +/* Get accessor: reads index, writes value in num */ +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x14, struct vhost_vring_state) + +/* The following ioctls use eventfd file descriptors to signal and poll + * for events. Passing -1 as the fd unsets the eventfd. */ + +/* Set eventfd to poll for added buffers */ +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +/* Set eventfd to signal when buffers have been used */ +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +/* Set eventfd to signal an error */ +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) + +/* VHOST_NET specific defines */ + +/* Attach virtio net device to a raw socket. The socket must be already + * bound to an ethernet device, this device will be used for transmit. + * Pass -1 to unbind from the socket and the transmit device. + * This can be used to stop the device (e.g. for migration). */ +#define VHOST_NET_SET_SOCKET _IOW(VHOST_VIRTIO, 0x30, int) + +#endif -- 1.6.2.5 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply related [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-08-27 16:07 ` [PATCHv5 3/3] vhost_net: a kernel-level virtio server Michael S. Tsirkin @ 2009-09-03 18:39 ` Ira W. Snyder 2009-09-07 10:15 ` Michael S. Tsirkin 2009-09-25 17:01 ` Ira W. Snyder 1 sibling, 1 reply; 83+ messages in thread From: Ira W. Snyder @ 2009-09-03 18:39 UTC (permalink / raw) To: Michael S. Tsirkin Cc: netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, gregory.haskins, Rusty Russell, s.hetze On Thu, Aug 27, 2009 at 07:07:50PM +0300, Michael S. Tsirkin wrote: > What it is: vhost net is a character device that can be used to reduce > the number of system calls involved in virtio networking. > Existing virtio net code is used in the guest without modification. > > There's similarity with vringfd, with some differences and reduced scope > - uses eventfd for signalling > - structures can be moved around in memory at any time (good for migration) > - support memory table and not just an offset (needed for kvm) > > common virtio related code has been put in a separate file vhost.c and > can be made into a separate module if/when more backends appear. I used > Rusty's lguest.c as the source for developing this part : this supplied > me with witty comments I wouldn't be able to write myself. > > What it is not: vhost net is not a bus, and not a generic new system > call. No assumptions are made on how guest performs hypercalls. > Userspace hypervisors are supported as well as kvm. > > How it works: Basically, we connect virtio frontend (configured by > userspace) to a backend. The backend could be a network device, or a > tun-like device. In this version I only support raw socket as a backend, > which can be bound to e.g. SR IOV, or to macvlan device. Backend is > also configured by userspace, including vlan/mac etc. > > Status: > This works for me, and I haven't see any crashes. 
> I have done some light benchmarking (with v4), compared to userspace, I > see improved latency (as I save up to 4 system calls per packet) but not > bandwidth/CPU (as TSO and interrupt mitigation are not supported). For > ping benchmark (where there's no TSO) throughput is also improved. > > Features that I plan to look at in the future: > - tap support > - TSO > - interrupt mitigation > - zero copy > Hello Michael, I've started looking at vhost with the intention of using it over PCI to connect physical machines together. The part that I am struggling with the most is figuring out which parts of the rings are in the host's memory, and which parts are in the guest's memory. If I understand everything correctly, the rings are all userspace addresses, which means that they can be moved around in physical memory, and get pushed out to swap. AFAIK, this is impossible to handle when connecting two physical systems, you'd need the rings available in IO memory (PCI memory), so you can ioreadXX() them instead. To the best of my knowledge, I shouldn't be using copy_to_user() on an __iomem address. Also, having them migrate around in memory would be a bad thing. Also, I'm having trouble figuring out how the packet contents are actually copied from one system to the other. Could you point this out for me? Is there somewhere I can find the userspace code (kvm, qemu, lguest, etc.) needed for interacting with the vhost misc device so I can get a better idea of how userspace is supposed to work? (Feature negotiation, etc.) Thanks, Ira > Acked-by: Arnd Bergmann <arnd@arndb.de> > Signed-off-by: Michael S.
Tsirkin <mst@redhat.com> > > --- > MAINTAINERS | 10 + > arch/x86/kvm/Kconfig | 1 + > drivers/Makefile | 1 + > drivers/vhost/Kconfig | 11 + > drivers/vhost/Makefile | 2 + > drivers/vhost/net.c | 475 ++++++++++++++++++++++++++++++ > drivers/vhost/vhost.c | 688 ++++++++++++++++++++++++++++++++++++++++++++ > drivers/vhost/vhost.h | 122 ++++++++ > include/linux/Kbuild | 1 + > include/linux/miscdevice.h | 1 + > include/linux/vhost.h | 101 +++++++ > 11 files changed, 1413 insertions(+), 0 deletions(-) > create mode 100644 drivers/vhost/Kconfig > create mode 100644 drivers/vhost/Makefile > create mode 100644 drivers/vhost/net.c > create mode 100644 drivers/vhost/vhost.c > create mode 100644 drivers/vhost/vhost.h > create mode 100644 include/linux/vhost.h > > diff --git a/MAINTAINERS b/MAINTAINERS > index b1114cf..de4587f 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -5431,6 +5431,16 @@ S: Maintained > F: Documentation/filesystems/vfat.txt > F: fs/fat/ > > +VIRTIO HOST (VHOST) > +P: Michael S. Tsirkin > +M: mst@redhat.com > +L: kvm@vger.kernel.org > +L: virtualization@lists.osdl.org > +L: netdev@vger.kernel.org > +S: Maintained > +F: drivers/vhost/ > +F: include/linux/vhost.h > + > VIA RHINE NETWORK DRIVER > M: Roger Luethi <rl@hellgate.ch> > S: Maintained > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig > index b84e571..94f44d9 100644 > --- a/arch/x86/kvm/Kconfig > +++ b/arch/x86/kvm/Kconfig > @@ -64,6 +64,7 @@ config KVM_AMD > > # OK, it's a little counter-intuitive to do this, but it puts it neatly under > # the virtualization menu. 
> +source drivers/vhost/Kconfig > source drivers/lguest/Kconfig > source drivers/virtio/Kconfig > > diff --git a/drivers/Makefile b/drivers/Makefile > index bc4205d..1551ae1 100644 > --- a/drivers/Makefile > +++ b/drivers/Makefile > @@ -105,6 +105,7 @@ obj-$(CONFIG_HID) += hid/ > obj-$(CONFIG_PPC_PS3) += ps3/ > obj-$(CONFIG_OF) += of/ > obj-$(CONFIG_SSB) += ssb/ > +obj-$(CONFIG_VHOST_NET) += vhost/ > obj-$(CONFIG_VIRTIO) += virtio/ > obj-$(CONFIG_VLYNQ) += vlynq/ > obj-$(CONFIG_STAGING) += staging/ > diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig > new file mode 100644 > index 0000000..d955406 > --- /dev/null > +++ b/drivers/vhost/Kconfig > @@ -0,0 +1,11 @@ > +config VHOST_NET > + tristate "Host kernel accelerator for virtio net" > + depends on NET && EVENTFD > + ---help--- > + This kernel module can be loaded in host kernel to accelerate > + guest networking with virtio_net. Not to be confused with virtio_net > + module itself which needs to be loaded in guest kernel. > + > + To compile this driver as a module, choose M here: the module will > + be called vhost_net. > + > diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile > new file mode 100644 > index 0000000..72dd020 > --- /dev/null > +++ b/drivers/vhost/Makefile > @@ -0,0 +1,2 @@ > +obj-$(CONFIG_VHOST_NET) += vhost_net.o > +vhost_net-y := vhost.o net.o > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > new file mode 100644 > index 0000000..2210eaa > --- /dev/null > +++ b/drivers/vhost/net.c > @@ -0,0 +1,475 @@ > +/* Copyright (C) 2009 Red Hat, Inc. > + * Author: Michael S. Tsirkin <mst@redhat.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. > + * > + * virtio-net server in host kernel. 
> + */ > + > +#include <linux/compat.h> > +#include <linux/eventfd.h> > +#include <linux/vhost.h> > +#include <linux/virtio_net.h> > +#include <linux/mmu_context.h> > +#include <linux/miscdevice.h> > +#include <linux/module.h> > +#include <linux/mutex.h> > +#include <linux/workqueue.h> > +#include <linux/rcupdate.h> > +#include <linux/file.h> > + > +#include <linux/net.h> > +#include <linux/if_packet.h> > +#include <linux/if_arp.h> > + > +#include <net/sock.h> > + > +#include "vhost.h" > + > +enum { > + VHOST_NET_VQ_RX = 0, > + VHOST_NET_VQ_TX = 1, > + VHOST_NET_VQ_MAX = 2, > +}; > + > +struct vhost_net { > + struct vhost_dev dev; > + struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; > + /* We use a kind of RCU to access sock pointer. > + * All readers access it from workqueue, which makes it possible to > + * flush the workqueue instead of synchronize_rcu. Therefore readers do > + * not need to call rcu_read_lock/rcu_read_unlock: the beginning of > + * work item execution acts instead of rcu_read_lock() and the end of > + * work item execution acts instead of rcu_read_unlock(). > + * Writers use device mutex. */ > + struct socket *sock; > + struct vhost_poll poll[VHOST_NET_VQ_MAX]; > +}; > + > +/* Pop first len bytes from iovec. Return number of segments used. > + * Each popped piece is described in the next entry of to, while the > + * matching entry of from is shrunk and advanced past it. Stops after > + * iov_count segments even if fewer than len bytes were popped. */ > +static int move_iovec_hdr(struct iovec *from, struct iovec *to, > + size_t len, int iov_count) > +{ > + int seg = 0; > + size_t size; > + while (len && seg < iov_count) { > + size = min(from->iov_len, len); > + to->iov_base = from->iov_base; > + to->iov_len = size; > + from->iov_len -= size; > + from->iov_base += size; > + len -= size; > + ++from; > + ++to; > + ++seg; > + } > + return seg; > +} > + > +/* Expects to be always run from workqueue - which acts as > + * read-side critical section for our kind of RCU.
*/
> +static void handle_tx(struct vhost_net *net)
> +{
> +	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
> +	unsigned head, out, in, s;
> +	struct msghdr msg = {
> +		.msg_name = NULL,
> +		.msg_namelen = 0,
> +		.msg_control = NULL,
> +		.msg_controllen = 0,
> +		.msg_iov = vq->iov,
> +		.msg_flags = MSG_DONTWAIT,
> +	};
> +	size_t len;
> +	int err;
> +	struct socket *sock = rcu_dereference(net->sock);
> +	/* Nothing to do without a socket or while it is not writeable. */
> +	if (!sock || !sock_writeable(sock->sk))
> +		return;
> +
> +	/* The ring lives in the owner's address space. */
> +	use_mm(net->dev.mm);
> +	mutex_lock(&vq->mutex);
> +	for (;;) {
> +		head = vhost_get_vq_desc(&net->dev, vq, vq->iov, &out, &in);
> +		/* Nothing new?  Wait for eventfd to tell us they refilled. */
> +		if (head == vq->num)
> +			break;
> +		if (in) {
> +			vq_err(vq, "Unexpected descriptor format for TX: "
> +			       "out %u, in %u\n", out, in);
> +			break;
> +		}
> +		/* Skip header. TODO: support TSO. */
> +		s = move_iovec_hdr(vq->iov, vq->hdr,
> +				   sizeof(struct virtio_net_hdr), out);
> +		msg.msg_iovlen = out;
> +		len = iov_length(vq->iov, out);
> +		/* Sanity check */
> +		if (!len) {
> +			/* iov_length() returns size_t: use the same %zd
> +			 * conversion as the RX path. */
> +			vq_err(vq, "Unexpected header len for TX: "
> +			       "%zd expected %zd\n",
> +			       iov_length(vq->hdr, s),
> +			       sizeof(struct virtio_net_hdr));
> +			break;
> +		}
> +		/* TODO: Check specific error and bomb out unless ENOBUFS? */
> +		err = sock->ops->sendmsg(NULL, sock, &msg, len);
> +		if (err < 0) {
> +			/* Give the descriptor back so it is retried later. */
> +			vhost_discard_vq_desc(vq);
> +			break;
> +		}
> +		if (err != len)
> +			pr_err("Truncated TX packet: "
> +			       " len %d != %zd\n", err, len);
> +		vhost_add_used_and_trigger(&net->dev, vq, head, 0);
> +	}
> +
> +	mutex_unlock(&vq->mutex);
> +	unuse_mm(net->dev.mm);
> +}
> +
> +/* Expects to be always run from workqueue - which acts as
> + * read-side critical section for our kind of RCU.
*/
> +static void handle_rx(struct vhost_net *net)
> +{
> +	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
> +	unsigned head, out, in, s;
> +	struct msghdr msg = {
> +		.msg_name = NULL,
> +		.msg_namelen = 0,
> +		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
> +		.msg_controllen = 0,
> +		.msg_iov = vq->iov,
> +		.msg_flags = MSG_DONTWAIT,
> +	};
> +
> +	struct virtio_net_hdr hdr = {
> +		.flags = 0,
> +		.gso_type = VIRTIO_NET_HDR_GSO_NONE
> +	};
> +
> +	size_t len;
> +	int err;
> +	struct socket *sock = rcu_dereference(net->sock);
> +	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
> +		return;
> +
> +	use_mm(net->dev.mm);
> +	mutex_lock(&vq->mutex);
> +	vhost_no_notify(vq);
> +
> +	for (;;) {
> +		head = vhost_get_vq_desc(&net->dev, vq, vq->iov, &out, &in);
> +		/* OK, now we need to know about added descriptors. */
> +		if (head == vq->num && vhost_notify(vq))
> +			/* They could have slipped one in as we were doing that:
> +			 * check again. */
> +			continue;
> +		/* Nothing new?  Wait for eventfd to tell us they refilled. */
> +		if (head == vq->num)
> +			break;
> +		/* We don't need to be notified again. */
> +		vhost_no_notify(vq);
> +		if (out) {
> +			vq_err(vq, "Unexpected descriptor format for RX: "
> +			       "out %u, in %u\n",
> +			       out, in);
> +			break;
> +		}
> +		/* Skip header. TODO: support TSO/mergeable rx buffers. */
> +		s = move_iovec_hdr(vq->iov, vq->hdr, sizeof hdr, in);
> +		msg.msg_iovlen = in;
> +		len = iov_length(vq->iov, in);
> +		/* Sanity check */
> +		if (!len) {
> +			vq_err(vq, "Unexpected header len for RX: "
> +			       "%zd expected %zd\n",
> +			       iov_length(vq->hdr, s), sizeof hdr);
> +			break;
> +		}
> +		err = sock->ops->recvmsg(NULL, sock, &msg,
> +					 len, MSG_DONTWAIT | MSG_TRUNC);
> +		/* TODO: Check specific error and bomb out unless EAGAIN? */
> +		if (err < 0) {
> +			vhost_discard_vq_desc(vq);
> +			break;
> +		}
> +		/* TODO: Should check and handle checksum. */
> +		if (err > len) {
> +			pr_err("Discarded truncated rx packet: "
> +			       " len %d > %zd\n", err, len);
> +			vhost_discard_vq_desc(vq);
> +			continue;
> +		}
> +		len = err;
> +		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, sizeof hdr);
> +		if (err) {
> +			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
> +			       vq->iov->iov_base, err);
> +			break;
> +		}
> +		vhost_add_used_and_trigger(&net->dev, vq, head,
> +					   len + sizeof hdr);
> +	}
> +
> +	mutex_unlock(&vq->mutex);
> +	unuse_mm(net->dev.mm);
> +}
> +
> +/* Work handlers: recover the vhost_net from the embedded work item and run
> + * the TX or RX path.  The _kick variants are driven by the virtqueue kick
> + * eventfd, the _net variants by the socket poll. */
> +static void handle_tx_kick(struct work_struct *work)
> +{
> +	struct vhost_virtqueue *vq;
> +	struct vhost_net *net;
> +	vq = container_of(work, struct vhost_virtqueue, poll.work);
> +	net = container_of(vq->dev, struct vhost_net, dev);
> +	handle_tx(net);
> +}
> +
> +static void handle_rx_kick(struct work_struct *work)
> +{
> +	struct vhost_virtqueue *vq;
> +	struct vhost_net *net;
> +	vq = container_of(work, struct vhost_virtqueue, poll.work);
> +	net = container_of(vq->dev, struct vhost_net, dev);
> +	handle_rx(net);
> +}
> +
> +static void handle_tx_net(struct work_struct *work)
> +{
> +	struct vhost_net *net;
> +	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work);
> +	handle_tx(net);
> +}
> +
> +static void handle_rx_net(struct work_struct *work)
> +{
> +	struct vhost_net *net;
> +	net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work);
> +	handle_rx(net);
> +}
> +
> +/* Allocate and initialise a vhost_net for a newly opened file. */
> +static int vhost_net_open(struct inode *inode, struct file *f)
> +{
> +	struct vhost_net *n = kzalloc(sizeof *n, GFP_KERNEL);
> +	int r;
> +	if (!n)
> +		return -ENOMEM;
> +	f->private_data = n;
> +	n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
> +	n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
> +	r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX);
> +	if (r < 0) {
> +		kfree(n);
> +		return r;
> +	}
> +
> +	vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT);
> +	vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN);
> +	return 0;
> +}
> +
> +/* Detach the socket from the device and flush the socket pollers.  Returns
> + * the old socket (or NULL); the caller owns its file reference. */
> +static struct socket *vhost_net_stop(struct vhost_net *n)
> +{
> +	struct socket *sock = n->sock;
> +	rcu_assign_pointer(n->sock, NULL);
> +	if (sock) {
> +		vhost_poll_flush(n->poll + VHOST_NET_VQ_TX);
> +		vhost_poll_flush(n->poll + VHOST_NET_VQ_RX);
> +	}
> +	return sock;
> +}
> +
> +static int vhost_net_release(struct inode *inode, struct file *f)
> +{
> +	struct vhost_net *n = f->private_data;
> +	struct socket *sock;
> +
> +	sock = vhost_net_stop(n);
> +	vhost_dev_cleanup(&n->dev);
> +	if (sock)
> +		fput(sock->file);
> +	kfree(n);
> +	return 0;
> +}
> +
> +/* Flush all four pollers: both socket pollers and both virtqueue pollers. */
> +static void vhost_net_flush(struct vhost_net *n)
> +{
> +	vhost_poll_flush(n->poll + VHOST_NET_VQ_TX);
> +	vhost_poll_flush(n->poll + VHOST_NET_VQ_RX);
> +	vhost_poll_flush(&n->dev.vqs[VHOST_NET_VQ_TX].poll);
> +	vhost_poll_flush(&n->dev.vqs[VHOST_NET_VQ_RX].poll);
> +}
> +
> +/* Attach the raw AF_PACKET socket behind fd as the backend, or detach the
> + * current one when fd is -1.  Returns 0 or a negative errno.  The reference
> + * taken by sockfd_lookup() is kept in n->sock on success and dropped on
> + * every other path. */
> +static long vhost_net_set_socket(struct vhost_net *n, int fd)
> +{
> +	struct {
> +		struct sockaddr_ll sa;
> +		char  buf[MAX_ADDR_LEN];
> +	} uaddr;
> +	struct socket *sock, *oldsock = NULL;
> +	int uaddr_len = sizeof uaddr, r;
> +
> +	mutex_lock(&n->dev.mutex);
> +	r = vhost_dev_check_owner(&n->dev);
> +	if (r)
> +		goto done;
> +
> +	if (fd == -1) {
> +		/* Disconnect from socket and device. */
> +		oldsock = vhost_net_stop(n);
> +		goto done;
> +	}
> +
> +	sock = sockfd_lookup(fd, &r);
> +	if (!sock) {
> +		r = -ENOTSOCK;
> +		goto done;
> +	}
> +
> +	/* Parameter checking */
> +	if (sock->sk->sk_type != SOCK_RAW) {
> +		r = -ESOCKTNOSUPPORT;
> +		goto put_sock;
> +	}
> +
> +	r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa,
> +			       &uaddr_len, 0);
> +	if (r)
> +		goto put_sock;
> +
> +	if (uaddr.sa.sll_family != AF_PACKET) {
> +		r = -EPFNOSUPPORT;
> +		goto put_sock;
> +	}
> +
> +	/* Load the current socket before comparing against it. */
> +	oldsock = n->sock;
> +	if (sock == oldsock)
> +		/* Same socket again: nothing to switch.  The done path drops
> +		 * the extra reference sockfd_lookup() just took. */
> +		goto done;
> +
> +	/* Stop polling the old socket before the pollers are reused. */
> +	if (oldsock) {
> +		vhost_poll_stop(n->poll + VHOST_NET_VQ_TX);
> +		vhost_poll_stop(n->poll + VHOST_NET_VQ_RX);
> +	}
> +	/* start polling new socket */
> +	rcu_assign_pointer(n->sock, sock);
> +	vhost_poll_start(n->poll + VHOST_NET_VQ_TX, sock->file);
> +	vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file);
> +	goto done;
> +
> +put_sock:
> +	/* Validation failed: release the sockfd_lookup() reference. */
> +	fput(sock->file);
> +done:
> +	mutex_unlock(&n->dev.mutex);
> +	if (oldsock) {
> +		vhost_net_flush(n);
> +		fput(oldsock->file);
> +	}
> +	return r;
> +}
> +
> +/* Detach the socket and reset device ownership.  Owner only. */
> +static long vhost_net_reset_owner(struct vhost_net *n)
> +{
> +	struct socket *sock = NULL;
> +	long r;
> +	mutex_lock(&n->dev.mutex);
> +	r = vhost_dev_check_owner(&n->dev);
> +	if (r)
> +		goto done;
> +	sock = vhost_net_stop(n);
> +	r = vhost_dev_reset_owner(&n->dev);
> +done:
> +	mutex_unlock(&n->dev.mutex);
> +	if (sock)
> +		fput(sock->file);
> +	return r;
> +}
> +
> +/* Record the feature bits acked by userspace, then flush the pollers so
> + * that work items already running finish before the caller continues. */
> +static void vhost_net_set_features(struct vhost_net *n, u64 features)
> +{
> +	/* Take the device mutex; the original unlocked it twice instead. */
> +	mutex_lock(&n->dev.mutex);
> +	n->dev.acked_features = features;
> +	mutex_unlock(&n->dev.mutex);
> +	vhost_net_flush(n);
> +}
> +
> +static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
> +			    unsigned long arg)
> +{
> +	struct vhost_net *n = f->private_data;
> +	void __user *argp = (void __user *)arg;
> +	u32 __user *featurep = argp;
> +	int __user *fdp = argp;
> +	u64 features;
> +	int fd, r;
> +	switch (ioctl) {
> +	case VHOST_NET_SET_SOCKET:
> +		r = get_user(fd, fdp);
> +		if (r < 0)
> +			return r;
> +		return
vhost_net_set_socket(n, fd); > + case VHOST_GET_FEATURES: > + features = VHOST_FEATURES; > + return put_user(features, featurep); > + case VHOST_ACK_FEATURES: > + r = get_user(features, featurep); > + /* No features for now */ > + if (r < 0) > + return r; > + if (features & ~VHOST_FEATURES) > + return -EOPNOTSUPP; > + vhost_net_set_features(n, features); > + return 0; > + case VHOST_RESET_OWNER: > + return vhost_net_reset_owner(n); > + default: > + return vhost_dev_ioctl(&n->dev, ioctl, arg); > + } > +} > + > +#ifdef CONFIG_COMPAT > +static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, > + unsigned long arg) > +{ > + return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); > +} > +#endif > + > +const static struct file_operations vhost_net_fops = { > + .owner = THIS_MODULE, > + .release = vhost_net_release, > + .unlocked_ioctl = vhost_net_ioctl, > +#ifdef CONFIG_COMPAT > + .compat_ioctl = vhost_net_compat_ioctl, > +#endif > + .open = vhost_net_open, > +}; > + > +static struct miscdevice vhost_net_misc = { > + VHOST_NET_MINOR, > + "vhost-net", > + &vhost_net_fops, > +}; > + > +int vhost_net_init(void) > +{ > + int r = vhost_init(); > + if (r) > + goto err_init; > + r = misc_register(&vhost_net_misc); > + if (r) > + goto err_reg; > + return 0; > +err_reg: > + vhost_cleanup(); > +err_init: > + return r; > + > +} > +module_init(vhost_net_init); > + > +void vhost_net_exit(void) > +{ > + misc_deregister(&vhost_net_misc); > + vhost_cleanup(); > +} > +module_exit(vhost_net_exit); > + > +MODULE_VERSION("0.0.1"); > +MODULE_LICENSE("GPL v2"); > +MODULE_AUTHOR("Michael S. Tsirkin"); > +MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > new file mode 100644 > index 0000000..6925cc1 > --- /dev/null > +++ b/drivers/vhost/vhost.c > @@ -0,0 +1,688 @@ > +/* Copyright (C) 2009 Red Hat, Inc. > + * Copyright (C) 2006 Rusty Russell IBM Corporation > + * > + * Author: Michael S. 
Tsirkin <mst@redhat.com> > + * > + * Inspiration, some code, and most witty comments come from > + * Documentation/lguest/lguest.c, by Rusty Russell > + * > + * This work is licensed under the terms of the GNU GPL, version 2. > + * > + * Generic code for virtio server in host kernel. > + */ > + > +#include <linux/eventfd.h> > +#include <linux/vhost.h> > +#include <linux/virtio_net.h> > +#include <linux/mm.h> > +#include <linux/miscdevice.h> > +#include <linux/mutex.h> > +#include <linux/workqueue.h> > +#include <linux/rcupdate.h> > +#include <linux/poll.h> > +#include <linux/file.h> > + > +#include <linux/net.h> > +#include <linux/if_packet.h> > +#include <linux/if_arp.h> > + > +#include <net/sock.h> > + > +#include "vhost.h" > + > +enum { > + VHOST_MEMORY_MAX_NREGIONS = 64, > +}; > + > +static struct workqueue_struct *vhost_workqueue; > + > +static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, > + poll_table *pt) > +{ > + struct vhost_poll *poll; > + poll = container_of(pt, struct vhost_poll, table); > + > + poll->wqh = wqh; > + add_wait_queue(wqh, &poll->wait); > +} > + > +static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, > + void *key) > +{ > + struct vhost_poll *poll; > + poll = container_of(wait, struct vhost_poll, wait); > + if (!((unsigned long)key & poll->mask)) > + return 0; > + > + queue_work(vhost_workqueue, &poll->work); > + return 0; > +} > + > +/* Init poll structure */ > +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, > + unsigned long mask) > +{ > + INIT_WORK(&poll->work, func); > + init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); > + init_poll_funcptr(&poll->table, vhost_poll_func); > + poll->mask = mask; > +} > + > +/* Start polling a file. We add ourselves to file's wait queue. The caller must > + * keep a reference to a file until after vhost_poll_stop is called. 
*/ > +void vhost_poll_start(struct vhost_poll *poll, struct file *file) > +{ > + unsigned long mask; > + mask = file->f_op->poll(file, &poll->table); > + if (mask) > + vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask); > +} > + > +/* Stop polling a file. After this function returns, it becomes safe to drop the > + * file reference. You must also flush afterwards. */ > +void vhost_poll_stop(struct vhost_poll *poll) > +{ > + remove_wait_queue(poll->wqh, &poll->wait); > +} > + > +/* Flush any work that has been scheduled. When calling this, don't hold any > + * locks that are also used by the callback. */ > +void vhost_poll_flush(struct vhost_poll *poll) > +{ > + flush_work(&poll->work); > +} > + > +long vhost_dev_init(struct vhost_dev *dev, > + struct vhost_virtqueue *vqs, int nvqs) > +{ > + int i; > + dev->vqs = vqs; > + dev->nvqs = nvqs; > + mutex_init(&dev->mutex); > + > + for (i = 0; i < dev->nvqs; ++i) { > + dev->vqs[i].dev = dev; > + mutex_init(&dev->vqs[i].mutex); > + if (dev->vqs[i].handle_kick) > + vhost_poll_init(&dev->vqs[i].poll, > + dev->vqs[i].handle_kick, > + POLLIN); > + } > + return 0; > +} > + > +/* Caller should have device mutex */ > +long vhost_dev_check_owner(struct vhost_dev *dev) > +{ > + /* Are you the owner? If not, I don't think you mean to do that */ > + return dev->mm == current->mm ? 0 : -EPERM; > +} > + > +/* Caller should have device mutex */ > +static long vhost_dev_set_owner(struct vhost_dev *dev) > +{ > + /* Is there an owner already? */ > + if (dev->mm) > + return -EBUSY; > + /* No owner, become one */ > + dev->mm = get_task_mm(current); > + return 0; > +} > + > +/* Caller should have device mutex */ > +long vhost_dev_reset_owner(struct vhost_dev *dev) > +{ > + struct vhost_memory *memory; > + > + /* Restore memory to default 1:1 mapping. 
*/ > + memory = kmalloc(offsetof(struct vhost_memory, regions) + > + 2 * sizeof *memory->regions, GFP_KERNEL); > + if (!memory) > + return -ENOMEM; > + > + vhost_dev_cleanup(dev); > + > + memory->nregions = 2; > + memory->regions[0].guest_phys_addr = 1; > + memory->regions[0].userspace_addr = 1; > + memory->regions[0].memory_size = ~0ULL; > + memory->regions[1].guest_phys_addr = 0; > + memory->regions[1].userspace_addr = 0; > + memory->regions[1].memory_size = 1; > + dev->memory = memory; > + return 0; > +} > + > +/* Caller should have device mutex */ > +void vhost_dev_cleanup(struct vhost_dev *dev) > +{ > + int i; > + for (i = 0; i < dev->nvqs; ++i) { > + if (dev->vqs[i].kick && dev->vqs[i].handle_kick) { > + vhost_poll_stop(&dev->vqs[i].poll); > + vhost_poll_flush(&dev->vqs[i].poll); > + } > + if (dev->vqs[i].error_ctx) > + eventfd_ctx_put(dev->vqs[i].error_ctx); > + if (dev->vqs[i].error) > + fput(dev->vqs[i].error); > + if (dev->vqs[i].kick) > + fput(dev->vqs[i].kick); > + if (dev->vqs[i].call_ctx) > + eventfd_ctx_put(dev->vqs[i].call_ctx); > + if (dev->vqs[i].call) > + fput(dev->vqs[i].call); > + dev->vqs[i].error_ctx = NULL; > + dev->vqs[i].error = NULL; > + dev->vqs[i].kick = NULL; > + dev->vqs[i].call_ctx = NULL; > + dev->vqs[i].call = NULL; > + } > + /* No one will access memory at this point */ > + kfree(dev->memory); > + dev->memory = NULL; > + if (dev->mm) > + mmput(dev->mm); > + dev->mm = NULL; > +} > + > +static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) > +{ > + struct vhost_memory mem, *newmem, *oldmem; > + unsigned long size = offsetof(struct vhost_memory, regions); > + long r; > + r = copy_from_user(&mem, m, size); > + if (r) > + return r; > + if (mem.padding) > + return -EOPNOTSUPP; > + if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS) > + return -E2BIG; > + newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL); > + if (!newmem) > + return -ENOMEM; > + > + memcpy(newmem, &mem, size); > + r = 
copy_from_user(newmem->regions, m->regions, > + mem.nregions * sizeof *m->regions); > + if (r) { > + kfree(newmem); > + return r; > + } > + oldmem = d->memory; > + rcu_assign_pointer(d->memory, newmem); > + synchronize_rcu(); > + kfree(oldmem); > + return 0; > +} > + > +static int init_used(struct vhost_virtqueue *vq) > +{ > + int r = put_user(vq->used_flags, &vq->used->flags); > + if (r) > + return r; > + return get_user(vq->last_used_idx, &vq->used->idx); > +} > + > +static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) > +{ > + struct file *eventfp, *filep = NULL, > + *pollstart = NULL, *pollstop = NULL; > + struct eventfd_ctx *ctx = NULL; > + u32 __user *idxp = argp; > + struct vhost_virtqueue *vq; > + struct vhost_vring_state s; > + struct vhost_vring_file f; > + struct vhost_vring_addr a; > + u32 idx; > + long r; > + > + r = get_user(idx, idxp); > + if (r < 0) > + return r; > + if (idx > d->nvqs) > + return -ENOBUFS; > + > + vq = d->vqs + idx; > + > + mutex_lock(&vq->mutex); > + > + switch (ioctl) { > + case VHOST_SET_VRING_NUM: > + r = copy_from_user(&s, argp, sizeof s); > + if (r < 0) > + break; > + if (s.num > 0xffff) { > + r = -EINVAL; > + break; > + } > + vq->num = s.num; > + break; > + case VHOST_SET_VRING_BASE: > + r = copy_from_user(&s, argp, sizeof s); > + if (r < 0) > + break; > + if (s.num > 0xffff) { > + r = -EINVAL; > + break; > + } > + vq->avail_idx = vq->last_avail_idx = s.num; > + break; > + case VHOST_GET_VRING_BASE: > + s.index = idx; > + s.num = vq->last_avail_idx; > + r = copy_to_user(argp, &s, sizeof s); > + break; > + case VHOST_SET_VRING_DESC: > + r = copy_from_user(&a, argp, sizeof a); > + if (r < 0) > + break; > + if (a.padding) { > + r = -EOPNOTSUPP; > + break; > + } > + if ((u64)(long)a.user_addr != a.user_addr) { > + r = -EFAULT; > + break; > + } > + vq->desc = (void __user *)(long)a.user_addr; > + break; > + case VHOST_SET_VRING_AVAIL: > + r = copy_from_user(&a, argp, sizeof a); > + if (r < 0) > + break; > 
+ if (a.padding) { > + r = -EOPNOTSUPP; > + break; > + } > + if ((u64)(long)a.user_addr != a.user_addr) { > + r = -EFAULT; > + break; > + } > + vq->avail = (void __user *)(long)a.user_addr; > + /* Forget the cached index value. */ > + vq->avail_idx = vq->last_avail_idx; > + break; > + case VHOST_SET_VRING_USED: > + r = copy_from_user(&a, argp, sizeof a); > + if (r < 0) > + break; > + if (a.padding) { > + r = -EOPNOTSUPP; > + break; > + } > + if ((u64)(long)a.user_addr != a.user_addr) { > + r = -EFAULT; > + break; > + } > + vq->used = (void __user *)(long)a.user_addr; > + r = init_used(vq); > + if (r) > + break; > + break; > + case VHOST_SET_VRING_KICK: > + r = copy_from_user(&f, argp, sizeof f); > + if (r < 0) > + break; > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); > + if (IS_ERR(eventfp)) > + return PTR_ERR(eventfp); > + if (eventfp != vq->kick) { > + pollstop = filep = vq->kick; > + pollstart = vq->kick = eventfp; > + } else > + filep = eventfp; > + break; > + case VHOST_SET_VRING_CALL: > + r = copy_from_user(&f, argp, sizeof f); > + if (r < 0) > + break; > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); > + if (IS_ERR(eventfp)) > + return PTR_ERR(eventfp); > + if (eventfp != vq->call) { > + filep = vq->call; > + ctx = vq->call_ctx; > + vq->call = eventfp; > + vq->call_ctx = eventfp ? > + eventfd_ctx_fileget(eventfp) : NULL; > + } else > + filep = eventfp; > + break; > + case VHOST_SET_VRING_ERR: > + r = copy_from_user(&f, argp, sizeof f); > + if (r < 0) > + break; > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); > + if (IS_ERR(eventfp)) > + return PTR_ERR(eventfp); > + if (eventfp != vq->error) { > + filep = vq->error; > + vq->error = eventfp; > + ctx = vq->error_ctx; > + vq->error_ctx = eventfp ? 
> + eventfd_ctx_fileget(eventfp) : NULL; > + } else > + filep = eventfp; > + break; > + default: > + r = -ENOIOCTLCMD; > + } > + > + if (pollstop && vq->handle_kick) > + vhost_poll_stop(&vq->poll); > + > + if (ctx) > + eventfd_ctx_put(ctx); > + if (filep) > + fput(filep); > + > + if (pollstart && vq->handle_kick) > + vhost_poll_start(&vq->poll, vq->kick); > + > + mutex_unlock(&vq->mutex); > + > + if (pollstop && vq->handle_kick) > + vhost_poll_flush(&vq->poll); > + return 0; > +} > + > +long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg) > +{ > + void __user *argp = (void __user *)arg; > + long r; > + > + mutex_lock(&d->mutex); > + /* If you are not the owner, you can become one */ > + if (ioctl == VHOST_SET_OWNER) { > + r = vhost_dev_set_owner(d); > + goto done; > + } > + > + /* You must be the owner to do anything else */ > + r = vhost_dev_check_owner(d); > + if (r) > + goto done; > + > + switch (ioctl) { > + case VHOST_SET_MEM_TABLE: > + r = vhost_set_memory(d, argp); > + break; > + default: > + r = vhost_set_vring(d, ioctl, argp); > + break; > + } > +done: > + mutex_unlock(&d->mutex); > + return r; > +} > + > +static const struct vhost_memory_region *find_region(struct vhost_memory *mem, > + __u64 addr, __u32 len) > +{ > + struct vhost_memory_region *reg; > + int i; > + /* linear search is not brilliant, but we really have on the order of 6 > + * regions in practice */ > + for (i = 0; i < mem->nregions; ++i) { > + reg = mem->regions + i; > + if (reg->guest_phys_addr <= addr && > + reg->guest_phys_addr + reg->memory_size - 1 >= addr) > + return reg; > + } > + return NULL; > +} > + > +int translate_desc(struct vhost_dev *dev, u64 addr, u32 len, > + struct iovec iov[], int iov_size) > +{ > + const struct vhost_memory_region *reg; > + struct vhost_memory *mem; > + struct iovec *_iov; > + u64 s = 0; > + int ret = 0; > + > + rcu_read_lock(); > + > + mem = rcu_dereference(dev->memory); > + while ((u64)len > s) { > + u64 size; > + if (ret 
>= iov_size) { > + ret = -ENOBUFS; > + break; > + } > + reg = find_region(mem, addr, len); > + if (!reg) { > + ret = -EFAULT; > + break; > + } > + _iov = iov + ret; > + size = reg->memory_size - addr + reg->guest_phys_addr; > + _iov->iov_len = min((u64)len, size); > + _iov->iov_base = (void *) > + (reg->userspace_addr + addr - reg->guest_phys_addr); > + s += size; > + addr += size; > + ++ret; > + } > + > + rcu_read_unlock(); > + return ret; > +} > + > +/* Each buffer in the virtqueues is actually a chain of descriptors. This > + * function returns the next descriptor in the chain, or vq->vring.num if we're > + * at the end. */ > +static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc) > +{ > + unsigned int next; > + > + /* If this descriptor says it doesn't chain, we're done. */ > + if (!(desc->flags & VRING_DESC_F_NEXT)) > + return vq->num; > + > + /* Check they're not leading us off end of descriptors. */ > + next = desc->next; > + /* Make sure compiler knows to grab that: we don't want it changing! */ > + /* We will use the result as an index in an array, so most > + * architectures only need a compiler barrier here. */ > + read_barrier_depends(); > + > + if (next >= vq->num) { > + vq_err(vq, "Desc next is %u > %u", next, vq->num); > + return vq->num; > + } > + > + return next; > +} > + > +/* This looks in the virtqueue and for the first available buffer, and converts > + * it to an iovec for convenient access. Since descriptors consist of some > + * number of output then some number of input descriptors, it's actually two > + * iovecs, but we pack them into one and note how many of each there were. > + * > + * This function returns the descriptor number found, or vq->num (which > + * is never a valid descriptor number) if none was found. 
*/ > +unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, > + struct iovec iov[], > + unsigned int *out_num, unsigned int *in_num) > +{ > + struct vring_desc desc; > + unsigned int i, head; > + u16 last_avail_idx; > + int ret; > + > + /* Check it isn't doing very strange things with descriptor numbers. */ > + last_avail_idx = vq->last_avail_idx; > + if (get_user(vq->avail_idx, &vq->avail->idx)) { > + vq_err(vq, "Failed to access avail idx at %p\n", > + &vq->avail->idx); > + return vq->num; > + } > + > + if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) { > + vq_err(vq, "Guest moved used index from %u to %u", > + last_avail_idx, vq->avail_idx); > + return vq->num; > + } > + > + /* If there's nothing new since last we looked, return invalid. */ > + if (vq->avail_idx == last_avail_idx) > + return vq->num; > + > + /* Grab the next descriptor number they're advertising, and increment > + * the index we've seen. */ > + if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) { > + vq_err(vq, "Failed to read head: idx %d address %p\n", > + last_avail_idx, > + &vq->avail->ring[last_avail_idx % vq->num]); > + return vq->num; > + } > + > + /* If their number is silly, that's an error. */ > + if (head >= vq->num) { > + vq_err(vq, "Guest says index %u > %u is available", > + head, vq->num); > + return vq->num; > + } > + > + vq->last_avail_idx++; > + > + /* When we start there are none of either input nor output. 
*/ > + *out_num = *in_num = 0; > + > + i = head; > + do { > + unsigned iov_count = *in_num + *out_num; > + if (copy_from_user(&desc, vq->desc + i, sizeof desc)) { > + vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", > + i, vq->desc + i); > + return vq->num; > + } > + ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count, > + VHOST_NET_MAX_SG - iov_count); > + if (ret < 0) { > + vq_err(vq, "Translation failure %d descriptor idx %d\n", > + ret, i); > + return vq->num; > + } > + /* If this is an input descriptor, increment that count. */ > + if (desc.flags & VRING_DESC_F_WRITE) > + *in_num += ret; > + else { > + /* If it's an output descriptor, they're all supposed > + * to come before any input descriptors. */ > + if (*in_num) { > + vq_err(vq, "Descriptor has out after in: " > + "idx %d\n", i); > + return vq->num; > + } > + *out_num += ret; > + } > + } while ((i = next_desc(vq, &desc)) != vq->num); > + return head; > +} > + > +/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ > +void vhost_discard_vq_desc(struct vhost_virtqueue *vq) > +{ > + vq->last_avail_idx--; > +} > + > +/* After we've used one of their buffers, we tell them about it. We'll then > + * want to send them an interrupt, using vq->call. */ > +int vhost_add_used(struct vhost_virtqueue *vq, > + unsigned int head, int len) > +{ > + struct vring_used_elem *used; > + > + /* The virtqueue contains a ring of used buffers. Get a pointer to the > + * next entry in that used ring. */ > + used = &vq->used->ring[vq->last_used_idx % vq->num]; > + if (put_user(head, &used->id)) { > + vq_err(vq, "Failed to write used id"); > + return -EFAULT; > + } > + if (put_user(len, &used->len)) { > + vq_err(vq, "Failed to write used len"); > + return -EFAULT; > + } > + /* Make sure buffer is written before we update index. 
*/ > + wmb(); > + if (put_user(vq->last_used_idx + 1, &vq->used->idx)) { > + vq_err(vq, "Failed to increment used idx"); > + return -EFAULT; > + } > + vq->last_used_idx++; > + return 0; > +} > + > +/* This actually sends the interrupt for this virtqueue */ > +void vhost_trigger_irq(struct vhost_dev *dev, struct vhost_virtqueue *vq) > +{ > + __u16 flags = 0; > + if (get_user(flags, &vq->avail->flags)) { > + vq_err(vq, "Failed to get flags"); > + return; > + } > + > + /* If they don't want an interrupt, don't send one, unless empty. */ > + if ((flags & VRING_AVAIL_F_NO_INTERRUPT) && > + (!vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) || > + vq->avail_idx != vq->last_avail_idx)) > + return; > + > + /* Send the Guest an interrupt tell them we used something up. */ > + if (vq->call_ctx) > + eventfd_signal(vq->call_ctx, 1); > +} > + > +/* And here's the combo meal deal. Supersize me! */ > +void vhost_add_used_and_trigger(struct vhost_dev *dev, > + struct vhost_virtqueue *vq, > + unsigned int head, int len) > +{ > + vhost_add_used(vq, head, len); > + vhost_trigger_irq(dev, vq); > +} > + > +/* OK, now we need to know about added descriptors. */ > +bool vhost_notify(struct vhost_virtqueue *vq) > +{ > + int r; > + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) > + return false; > + vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; > + r = put_user(vq->used_flags, &vq->used->flags); > + if (r) > + vq_err(vq, "Failed to disable notification: %d\n", r); > + /* They could have slipped one in as we were doing that: make > + * sure it's written, tell caller it needs to check again. */ > + mb(); > + return true; > +} > + > +/* We don't need to be notified again. 
*/ > +void vhost_no_notify(struct vhost_virtqueue *vq) > +{ > + int r; > + if (vq->used_flags & VRING_USED_F_NO_NOTIFY) > + return; > + vq->used_flags |= VRING_USED_F_NO_NOTIFY; > + r = put_user(vq->used_flags, &vq->used->flags); > + if (r) > + vq_err(vq, "Failed to enable notification: %d\n", r); > +} > + > +int vhost_init(void) > +{ > + vhost_workqueue = create_workqueue("vhost"); > + if (!vhost_workqueue) > + return -ENOMEM; > + return 0; > +} > + > +void vhost_cleanup(void) > +{ > + destroy_workqueue(vhost_workqueue); > +} > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > new file mode 100644 > index 0000000..8e13d06 > --- /dev/null > +++ b/drivers/vhost/vhost.h > @@ -0,0 +1,122 @@ > +#ifndef _VHOST_H > +#define _VHOST_H > + > +#include <linux/eventfd.h> > +#include <linux/vhost.h> > +#include <linux/mm.h> > +#include <linux/mutex.h> > +#include <linux/workqueue.h> > +#include <linux/poll.h> > +#include <linux/file.h> > +#include <linux/skbuff.h> > +#include <linux/uio.h> > +#include <linux/virtio_config.h> > + > +struct vhost_device; > + > +enum { > + VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2, > +}; > + > +/* Poll a file (eventfd or socket) */ > +/* Note: there's nothing vhost specific about this structure. */ > +struct vhost_poll { > + poll_table table; > + wait_queue_head_t *wqh; > + wait_queue_t wait; > + /* struct which will handle all actual work. */ > + struct work_struct work; > + unsigned long mask; > +}; > + > +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, > + unsigned long mask); > +void vhost_poll_start(struct vhost_poll *poll, struct file *file); > +void vhost_poll_stop(struct vhost_poll *poll); > +void vhost_poll_flush(struct vhost_poll *poll); > + > +/* The virtqueue structure describes a queue attached to a device. */ > +struct vhost_virtqueue { > + struct vhost_dev *dev; > + > + /* The actual ring of buffers. 
*/ > + struct mutex mutex; > + unsigned int num; > + struct vring_desc __user *desc; > + struct vring_avail __user *avail; > + struct vring_used __user *used; > + struct file *kick; > + struct file *call; > + struct file *error; > + struct eventfd_ctx *call_ctx; > + struct eventfd_ctx *error_ctx; > + > + struct vhost_poll poll; > + > + /* The routine to call when the Guest pings us, or timeout. */ > + work_func_t handle_kick; > + > + /* Last available index we saw. */ > + u16 last_avail_idx; > + > + /* Caches available index value from user. */ > + u16 avail_idx; > + > + /* Last index we used. */ > + u16 last_used_idx; > + > + /* Used flags */ > + u16 used_flags; > + > + struct iovec iov[VHOST_NET_MAX_SG]; > + struct iovec hdr[VHOST_NET_MAX_SG]; > +}; > + > +struct vhost_dev { > + /* Readers use RCU to access memory table pointer. > + * Writers use mutex below.*/ > + struct vhost_memory *memory; > + struct mm_struct *mm; > + struct vhost_virtqueue *vqs; > + int nvqs; > + struct mutex mutex; > + unsigned acked_features; > +}; > + > +long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); > +long vhost_dev_check_owner(struct vhost_dev *); > +long vhost_dev_reset_owner(struct vhost_dev *); > +void vhost_dev_cleanup(struct vhost_dev *); > +long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg); > + > +unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *, > + struct iovec iov[], > + unsigned int *out_num, unsigned int *in_num); > +void vhost_discard_vq_desc(struct vhost_virtqueue *); > + > +int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); > +void vhost_trigger_irq(struct vhost_dev *, struct vhost_virtqueue *); > +void vhost_add_used_and_trigger(struct vhost_dev *, struct vhost_virtqueue *, > + unsigned int head, int len); > +void vhost_no_notify(struct vhost_virtqueue *); > +bool vhost_notify(struct vhost_virtqueue *); > + > +int vhost_init(void); > +void vhost_cleanup(void); > 
+ > +#define vq_err(vq, fmt, ...) do { \ > + pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ > + if ((vq)->error_ctx) \ > + eventfd_signal((vq)->error_ctx, 1);\ > + } while (0) > + > +enum { > + VHOST_FEATURES = 1 << VIRTIO_F_NOTIFY_ON_EMPTY, > +}; > + > +static inline int vhost_has_feature(struct vhost_dev *dev, int bit) > +{ > + return dev->acked_features & (1 << bit); > +} > + > +#endif > diff --git a/include/linux/Kbuild b/include/linux/Kbuild > index dec2f18..975df9a 100644 > --- a/include/linux/Kbuild > +++ b/include/linux/Kbuild > @@ -360,6 +360,7 @@ unifdef-y += uio.h > unifdef-y += unistd.h > unifdef-y += usbdevice_fs.h > unifdef-y += utsname.h > +unifdef-y += vhost.h > unifdef-y += videodev2.h > unifdef-y += videodev.h > unifdef-y += virtio_config.h > diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h > index 0521177..781a8bb 100644 > --- a/include/linux/miscdevice.h > +++ b/include/linux/miscdevice.h > @@ -30,6 +30,7 @@ > #define HPET_MINOR 228 > #define FUSE_MINOR 229 > #define KVM_MINOR 232 > +#define VHOST_NET_MINOR 233 > #define MISC_DYNAMIC_MINOR 255 > > struct device; > diff --git a/include/linux/vhost.h b/include/linux/vhost.h > new file mode 100644 > index 0000000..3f441a9 > --- /dev/null > +++ b/include/linux/vhost.h > @@ -0,0 +1,101 @@ > +#ifndef _LINUX_VHOST_H > +#define _LINUX_VHOST_H > +/* Userspace interface for in-kernel virtio accelerators. */ > + > +/* vhost is used to reduce the number of system calls involved in virtio. > + * > + * Existing virtio net code is used in the guest without modification. > + * > + * This header includes interface used by userspace hypervisor for > + * device configuration. 
> + */ > + > +#include <linux/types.h> > +#include <linux/compiler.h> > +#include <linux/ioctl.h> > +#include <linux/virtio_config.h> > +#include <linux/virtio_ring.h> > + > +struct vhost_vring_state { > + unsigned int index; > + unsigned int num; > +}; > + > +struct vhost_vring_file { > + unsigned int index; > + int fd; > +}; > + > +struct vhost_vring_addr { > + unsigned int index; > + unsigned int padding; > + __u64 user_addr; > +}; > + > +struct vhost_memory_region { > + __u64 guest_phys_addr; > + __u64 memory_size; /* bytes */ > + __u64 userspace_addr; > + __u64 padding; /* read/write protection? */ > +}; > + > +struct vhost_memory { > + __u32 nregions; > + __u32 padding; > + struct vhost_memory_region regions[0]; > +}; > + > +/* ioctls */ > + > +#define VHOST_VIRTIO 0xAF > + > +/* Features bitmask for forward compatibility. Transport bits are used for > + * vhost specific features. */ > +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) > +#define VHOST_ACK_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) > + > +/* Set current process as the (exclusive) owner of this file descriptor. This > + * must be called before any other vhost command. Further calls to > + * VHOST_OWNER_SET fail until VHOST_OWNER_RESET is called. */ > +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) > +/* Give up ownership, and reset the device to default values. > + * Allows subsequent call to VHOST_OWNER_SET to succeed. */ > +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) > + > +/* Set up/modify memory layout */ > +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) > + > +/* Ring setup. These parameters can not be modified while ring is running > + * (bound to a device). 
*/ > +/* Set number of descriptors in ring */ > +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) > +/* Start of array of descriptors (virtually contiguous) */ > +#define VHOST_SET_VRING_DESC _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) > +/* Used structure address */ > +#define VHOST_SET_VRING_USED _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_addr) > +/* Available structure address */ > +#define VHOST_SET_VRING_AVAIL _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_addr) > +/* Base value where queue looks for available descriptors */ > +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) > +/* Get accessor: reads index, writes value in num */ > +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x14, struct vhost_vring_state) > + > +/* The following ioctls use eventfd file descriptors to signal and poll > + * for events. */ > + > +/* Set eventfd to poll for added buffers */ > +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) > +/* Set eventfd to signal when buffers have beed used */ > +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) > +/* Set eventfd to signal an error */ > +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) > + > +/* VHOST_NET specific defines */ > + > +/* Attach virtio net device to a raw socket. The socket must be already > + * bound to an ethernet device, this device will be used for transmit. > + * Pass -1 to unbind from the socket and the transmit device. > + * This can be used to stop the device (e.g. for migration). */ > +#define VHOST_NET_SET_SOCKET _IOW(VHOST_VIRTIO, 0x30, int) > + > +#endif > -- > 1.6.2.5 > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-03 18:39 ` Ira W. Snyder @ 2009-09-07 10:15 ` Michael S. Tsirkin 2009-09-08 17:20 ` Ira W. Snyder 0 siblings, 1 reply; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-07 10:15 UTC (permalink / raw) To: Ira W. Snyder Cc: netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, gregory.haskins, Rusty Russell, s.hetze On Thu, Sep 03, 2009 at 11:39:45AM -0700, Ira W. Snyder wrote: > On Thu, Aug 27, 2009 at 07:07:50PM +0300, Michael S. Tsirkin wrote: > > What it is: vhost net is a character device that can be used to reduce > > the number of system calls involved in virtio networking. > > Existing virtio net code is used in the guest without modification. > > > > There's similarity with vringfd, with some differences and reduced scope > > - uses eventfd for signalling > > - structures can be moved around in memory at any time (good for migration) > > - support memory table and not just an offset (needed for kvm) > > > > common virtio related code has been put in a separate file vhost.c and > > can be made into a separate module if/when more backends appear. I used > > Rusty's lguest.c as the source for developing this part : this supplied > > me with witty comments I wouldn't be able to write myself. > > > > What it is not: vhost net is not a bus, and not a generic new system > > call. No assumptions are made on how guest performs hypercalls. > > Userspace hypervisors are supported as well as kvm. > > > > How it works: Basically, we connect virtio frontend (configured by > > userspace) to a backend. The backend could be a network device, or a > > tun-like device. In this version I only support raw socket as a backend, > > which can be bound to e.g. SR IOV, or to macvlan device. Backend is > > also configured by userspace, including vlan/mac etc. > > > > Status: > > This works for me, and I haven't see any crashes. 
> > I have done some light benchmarking (with v4), compared to userspace, I > > see improved latency (as I save up to 4 system calls per packet) but not > > bandwidth/CPU (as TSO and interrupt mitigation are not supported). For > > ping benchmark (where there's no TSO) troughput is also improved. > > > > Features that I plan to look at in the future: > > - tap support > > - TSO > > - interrupt mitigation > > - zero copy > > > > Hello Michael, > > I've started looking at vhost with the intention of using it over PCI to > connect physical machines together. > > The part that I am struggling with the most is figuring out which parts > of the rings are in the host's memory, and which parts are in the > guest's memory. All rings are in guest's memory, to match existing virtio code. vhost assumes that the memory space of the hypervisor userspace process covers the whole of guest memory. And there's a translation table. Ring addresses are userspace addresses, they do not undergo translation. > If I understand everything correctly, the rings are all userspace > addresses, which means that they can be moved around in physical memory, > and get pushed out to swap. Unless they are locked, yes. > AFAIK, this is impossible to handle when > connecting two physical systems, you'd need the rings available in IO > memory (PCI memory), so you can ioreadXX() them instead. To the best of > my knowledge, I shouldn't be using copy_to_user() on an __iomem address. > Also, having them migrate around in memory would be a bad thing. > > Also, I'm having trouble figuring out how the packet contents are > actually copied from one system to the other. Could you point this out > for me? The code in net/packet/af_packet.c does it when vhost calls sendmsg. > Is there somewhere I can find the userspace code (kvm, qemu, lguest, > etc.) code needed for interacting with the vhost misc device so I can > get a better idea of how userspace is supposed to work? Look in archives for kvm@vger.kernel.org. 
The subject is qemu-kvm: vhost net. > (Features > negotiation, etc.) > > Thanks, > Ira That's not yet implemented as there are no features yet. I'm working on tap support, which will add a feature bit. Overall, qemu does an ioctl to query supported features, and then acks them with another ioctl. I'm also trying to avoid duplicating functionality available elsewhere. So, to check e.g. TSO support, you'd just look at the underlying hardware device you are binding to. -- MST -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-07 10:15 ` Michael S. Tsirkin @ 2009-09-08 17:20 ` Ira W. Snyder 2009-09-08 20:14 ` Michael S. Tsirkin 2009-09-11 16:00 ` Gregory Haskins 0 siblings, 2 replies; 83+ messages in thread From: Ira W. Snyder @ 2009-09-08 17:20 UTC (permalink / raw) To: Michael S. Tsirkin Cc: netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, gregory.haskins, Rusty Russell, s.hetze On Mon, Sep 07, 2009 at 01:15:37PM +0300, Michael S. Tsirkin wrote: > On Thu, Sep 03, 2009 at 11:39:45AM -0700, Ira W. Snyder wrote: > > On Thu, Aug 27, 2009 at 07:07:50PM +0300, Michael S. Tsirkin wrote: > > > What it is: vhost net is a character device that can be used to reduce > > > the number of system calls involved in virtio networking. > > > Existing virtio net code is used in the guest without modification. > > > > > > There's similarity with vringfd, with some differences and reduced scope > > > - uses eventfd for signalling > > > - structures can be moved around in memory at any time (good for migration) > > > - support memory table and not just an offset (needed for kvm) > > > > > > common virtio related code has been put in a separate file vhost.c and > > > can be made into a separate module if/when more backends appear. I used > > > Rusty's lguest.c as the source for developing this part : this supplied > > > me with witty comments I wouldn't be able to write myself. > > > > > > What it is not: vhost net is not a bus, and not a generic new system > > > call. No assumptions are made on how guest performs hypercalls. > > > Userspace hypervisors are supported as well as kvm. > > > > > > How it works: Basically, we connect virtio frontend (configured by > > > userspace) to a backend. The backend could be a network device, or a > > > tun-like device. In this version I only support raw socket as a backend, > > > which can be bound to e.g. SR IOV, or to macvlan device. 
Backend is > > > also configured by userspace, including vlan/mac etc. > > > > > > Status: > > > This works for me, and I haven't see any crashes. > > > I have done some light benchmarking (with v4), compared to userspace, I > > > see improved latency (as I save up to 4 system calls per packet) but not > > > bandwidth/CPU (as TSO and interrupt mitigation are not supported). For > > > ping benchmark (where there's no TSO) troughput is also improved. > > > > > > Features that I plan to look at in the future: > > > - tap support > > > - TSO > > > - interrupt mitigation > > > - zero copy > > > > > > > Hello Michael, > > > > I've started looking at vhost with the intention of using it over PCI to > > connect physical machines together. > > > > The part that I am struggling with the most is figuring out which parts > > of the rings are in the host's memory, and which parts are in the > > guest's memory. > > All rings are in guest's memory, to match existing virtio code. Ok, this makes sense. > vhost > assumes that the memory space of the hypervisor userspace process covers > the whole of guest memory. Is this necessary? Why? The assumption seems very wrong when you're doing data transport between two physical systems via PCI. I know vhost has not been designed for this specific situation, but it is good to be looking toward other possible uses. > And there's a translation table. > Ring addresses are userspace addresses, they do not undergo translation. > > > If I understand everything correctly, the rings are all userspace > > addresses, which means that they can be moved around in physical memory, > > and get pushed out to swap. > > Unless they are locked, yes. > > > AFAIK, this is impossible to handle when > > connecting two physical systems, you'd need the rings available in IO > > memory (PCI memory), so you can ioreadXX() them instead. To the best of > > my knowledge, I shouldn't be using copy_to_user() on an __iomem address. 
> > Also, having them migrate around in memory would be a bad thing. > > > > Also, I'm having trouble figuring out how the packet contents are > > actually copied from one system to the other. Could you point this out > > for me? > > The code in net/packet/af_packet.c does it when vhost calls sendmsg. > Ok. The sendmsg() implementation uses memcpy_fromiovec(). Is it possible to make this use a DMA engine instead? I know this was suggested in an earlier thread. > > Is there somewhere I can find the userspace code (kvm, qemu, lguest, > > etc.) code needed for interacting with the vhost misc device so I can > > get a better idea of how userspace is supposed to work? > > Look in archives for kvm@vger.kernel.org. the subject is qemu-kvm: vhost net. > > > (Features > > negotiation, etc.) > > > > That's not yet implemented as there are no features yet. I'm working on > tap support, which will add a feature bit. Overall, qemu does an ioctl > to query supported features, and then acks them with another ioctl. I'm > also trying to avoid duplicating functionality available elsewhere. So > that to check e.g. TSO support, you'd just look at the underlying > hardware device you are binding to. > Ok. Do you have plans to support the VIRTIO_NET_F_MRG_RXBUF feature in the future? I found that this made an enormous improvement in throughput on my virtio-net <-> virtio-net system. Perhaps it isn't needed with vhost-net. Thanks for replying, Ira -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-08 17:20 ` Ira W. Snyder @ 2009-09-08 20:14 ` Michael S. Tsirkin 2009-09-11 15:17 ` Xin, Xiaohui 2009-09-11 16:00 ` Gregory Haskins 1 sibling, 1 reply; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-08 20:14 UTC (permalink / raw) To: Ira W. Snyder Cc: netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, gregory.haskins, Rusty Russell, s.hetze On Tue, Sep 08, 2009 at 10:20:35AM -0700, Ira W. Snyder wrote: > On Mon, Sep 07, 2009 at 01:15:37PM +0300, Michael S. Tsirkin wrote: > > On Thu, Sep 03, 2009 at 11:39:45AM -0700, Ira W. Snyder wrote: > > > On Thu, Aug 27, 2009 at 07:07:50PM +0300, Michael S. Tsirkin wrote: > > > > What it is: vhost net is a character device that can be used to reduce > > > > the number of system calls involved in virtio networking. > > > > Existing virtio net code is used in the guest without modification. > > > > > > > > There's similarity with vringfd, with some differences and reduced scope > > > > - uses eventfd for signalling > > > > - structures can be moved around in memory at any time (good for migration) > > > > - support memory table and not just an offset (needed for kvm) > > > > > > > > common virtio related code has been put in a separate file vhost.c and > > > > can be made into a separate module if/when more backends appear. I used > > > > Rusty's lguest.c as the source for developing this part : this supplied > > > > me with witty comments I wouldn't be able to write myself. > > > > > > > > What it is not: vhost net is not a bus, and not a generic new system > > > > call. No assumptions are made on how guest performs hypercalls. > > > > Userspace hypervisors are supported as well as kvm. > > > > > > > > How it works: Basically, we connect virtio frontend (configured by > > > > userspace) to a backend. The backend could be a network device, or a > > > > tun-like device. 
In this version I only support raw socket as a backend, > > > > which can be bound to e.g. SR IOV, or to macvlan device. Backend is > > > > also configured by userspace, including vlan/mac etc. > > > > > > > > Status: > > > > This works for me, and I haven't see any crashes. > > > > I have done some light benchmarking (with v4), compared to userspace, I > > > > see improved latency (as I save up to 4 system calls per packet) but not > > > > bandwidth/CPU (as TSO and interrupt mitigation are not supported). For > > > > ping benchmark (where there's no TSO) troughput is also improved. > > > > > > > > Features that I plan to look at in the future: > > > > - tap support > > > > - TSO > > > > - interrupt mitigation > > > > - zero copy > > > > > > > > > > Hello Michael, > > > > > > I've started looking at vhost with the intention of using it over PCI to > > > connect physical machines together. > > > > > > The part that I am struggling with the most is figuring out which parts > > > of the rings are in the host's memory, and which parts are in the > > > guest's memory. > > > > All rings are in guest's memory, to match existing virtio code. > > Ok, this makes sense. > > > vhost > > assumes that the memory space of the hypervisor userspace process covers > > the whole of guest memory. > > Is this necessary? Why? Because with virtio ring can give us arbitrary guest addresses. If guest was limited to using a subset of addresses, hypervisor would only have to map these. > The assumption seems very wrong when you're > doing data transport between two physical systems via PCI. > I know vhost has not been designed for this specific situation, but it > is good to be looking toward other possible uses. > > > And there's a translation table. > > Ring addresses are userspace addresses, they do not undergo translation. 
> > > > > If I understand everything correctly, the rings are all userspace > > > addresses, which means that they can be moved around in physical memory, > > > and get pushed out to swap. > > > > Unless they are locked, yes. > > > > > AFAIK, this is impossible to handle when > > > connecting two physical systems, you'd need the rings available in IO > > > memory (PCI memory), so you can ioreadXX() them instead. To the best of > > > my knowledge, I shouldn't be using copy_to_user() on an __iomem address. > > > Also, having them migrate around in memory would be a bad thing. > > > > > > Also, I'm having trouble figuring out how the packet contents are > > > actually copied from one system to the other. Could you point this out > > > for me? > > > > The code in net/packet/af_packet.c does it when vhost calls sendmsg. > > > > Ok. The sendmsg() implementation uses memcpy_fromiovec(). Is it possible > to make this use a DMA engine instead? Maybe. > I know this was suggested in an earlier thread. Yes, it might even give some performance benefit with e.g. I/O AT. > > > Is there somewhere I can find the userspace code (kvm, qemu, lguest, > > > etc.) code needed for interacting with the vhost misc device so I can > > > get a better idea of how userspace is supposed to work? > > > > Look in archives for kvm@vger.kernel.org. the subject is qemu-kvm: vhost net. > > > > > (Features > > > negotiation, etc.) > > > > > > > That's not yet implemented as there are no features yet. I'm working on > > tap support, which will add a feature bit. Overall, qemu does an ioctl > > to query supported features, and then acks them with another ioctl. I'm > > also trying to avoid duplicating functionality available elsewhere. So > > that to check e.g. TSO support, you'd just look at the underlying > > hardware device you are binding to. > > > > Ok. Do you have plans to support the VIRTIO_NET_F_MRG_RXBUF feature in > the future? 
I found that this made an enormous improvement in throughput > on my virtio-net <-> virtio-net system. Perhaps it isn't needed with > vhost-net. Yes, I'm working on it. > Thanks for replying, > Ira -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* RE: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-08 20:14 ` Michael S. Tsirkin @ 2009-09-11 15:17 ` Xin, Xiaohui 2009-09-13 5:46 ` Michael S. Tsirkin 0 siblings, 1 reply; 83+ messages in thread From: Xin, Xiaohui @ 2009-09-11 15:17 UTC (permalink / raw) To: Michael S. Tsirkin, Ira W. Snyder Cc: netdev@vger.kernel.org, virtualization@lists.linux-foundation.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org, akpm@linux-foundation.org, hpa@zytor.com, gregory.haskins@gmail.com, Rusty Russell, s.hetze@linux-ag.com Michael, We are very interested in your patch and want to have a try with it. I have collected your 3 patches in kernel side and 4 patches in queue side. The patches are listed here: PATCHv5-1-3-mm-export-use_mm-unuse_mm-to-modules.patch PATCHv5-2-3-mm-reduce-atomic-use-on-use_mm-fast-path.patch PATCHv5-3-3-vhost_net-a-kernel-level-virtio-server.patch PATCHv3-1-4-qemu-kvm-move-virtio-pci[1].o-to-near-pci.o.patch PATCHv3-2-4-virtio-move-features-to-an-inline-function.patch PATCHv3-3-4-qemu-kvm-vhost-net-implementation.patch PATCHv3-4-4-qemu-kvm-add-compat-eventfd.patch I applied the kernel patches on v2.6.31-rc4 and the qemu patches on latest kvm qemu. But seems there are some patches are needed at least irqfd and ioeventfd patches on current qemu. I cannot create a kvm guest with "-net nic,model=virtio,vhost=vethX". May you kindly advice us the patch lists all exactly to make it work? Thanks a lot. :-) Thanks Xiaohui -----Original Message----- From: kvm-owner@vger.kernel.org [mailto:kvm-owner@vger.kernel.org] On Behalf Of Michael S. Tsirkin Sent: Wednesday, September 09, 2009 4:14 AM To: Ira W. 
Snyder Cc: netdev@vger.kernel.org; virtualization@lists.linux-foundation.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org; mingo@elte.hu; linux-mm@kvack.org; akpm@linux-foundation.org; hpa@zytor.com; gregory.haskins@gmail.com; Rusty Russell; s.hetze@linux-ag.com Subject: Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server On Tue, Sep 08, 2009 at 10:20:35AM -0700, Ira W. Snyder wrote: > On Mon, Sep 07, 2009 at 01:15:37PM +0300, Michael S. Tsirkin wrote: > > On Thu, Sep 03, 2009 at 11:39:45AM -0700, Ira W. Snyder wrote: > > > On Thu, Aug 27, 2009 at 07:07:50PM +0300, Michael S. Tsirkin wrote: > > > > What it is: vhost net is a character device that can be used to reduce > > > > the number of system calls involved in virtio networking. > > > > Existing virtio net code is used in the guest without modification. > > > > > > > > There's similarity with vringfd, with some differences and reduced scope > > > > - uses eventfd for signalling > > > > - structures can be moved around in memory at any time (good for migration) > > > > - support memory table and not just an offset (needed for kvm) > > > > > > > > common virtio related code has been put in a separate file vhost.c and > > > > can be made into a separate module if/when more backends appear. I used > > > > Rusty's lguest.c as the source for developing this part : this supplied > > > > me with witty comments I wouldn't be able to write myself. > > > > > > > > What it is not: vhost net is not a bus, and not a generic new system > > > > call. No assumptions are made on how guest performs hypercalls. > > > > Userspace hypervisors are supported as well as kvm. > > > > > > > > How it works: Basically, we connect virtio frontend (configured by > > > > userspace) to a backend. The backend could be a network device, or a > > > > tun-like device. In this version I only support raw socket as a backend, > > > > which can be bound to e.g. SR IOV, or to macvlan device. 
Backend is > > > > also configured by userspace, including vlan/mac etc. > > > > > > > > Status: > > > > This works for me, and I haven't see any crashes. > > > > I have done some light benchmarking (with v4), compared to userspace, I > > > > see improved latency (as I save up to 4 system calls per packet) but not > > > > bandwidth/CPU (as TSO and interrupt mitigation are not supported). For > > > > ping benchmark (where there's no TSO) troughput is also improved. > > > > > > > > Features that I plan to look at in the future: > > > > - tap support > > > > - TSO > > > > - interrupt mitigation > > > > - zero copy > > > > > > > > > > Hello Michael, > > > > > > I've started looking at vhost with the intention of using it over PCI to > > > connect physical machines together. > > > > > > The part that I am struggling with the most is figuring out which parts > > > of the rings are in the host's memory, and which parts are in the > > > guest's memory. > > > > All rings are in guest's memory, to match existing virtio code. > > Ok, this makes sense. > > > vhost > > assumes that the memory space of the hypervisor userspace process covers > > the whole of guest memory. > > Is this necessary? Why? Because with virtio ring can give us arbitrary guest addresses. If guest was limited to using a subset of addresses, hypervisor would only have to map these. > The assumption seems very wrong when you're > doing data transport between two physical systems via PCI. > I know vhost has not been designed for this specific situation, but it > is good to be looking toward other possible uses. > > > And there's a translation table. > > Ring addresses are userspace addresses, they do not undergo translation. > > > > > If I understand everything correctly, the rings are all userspace > > > addresses, which means that they can be moved around in physical memory, > > > and get pushed out to swap. > > > > Unless they are locked, yes. 
> > > > > AFAIK, this is impossible to handle when > > > connecting two physical systems, you'd need the rings available in IO > > > memory (PCI memory), so you can ioreadXX() them instead. To the best of > > > my knowledge, I shouldn't be using copy_to_user() on an __iomem address. > > > Also, having them migrate around in memory would be a bad thing. > > > > > > Also, I'm having trouble figuring out how the packet contents are > > > actually copied from one system to the other. Could you point this out > > > for me? > > > > The code in net/packet/af_packet.c does it when vhost calls sendmsg. > > > > Ok. The sendmsg() implementation uses memcpy_fromiovec(). Is it possible > to make this use a DMA engine instead? Maybe. > I know this was suggested in an earlier thread. Yes, it might even give some performance benefit with e.g. I/O AT. > > > Is there somewhere I can find the userspace code (kvm, qemu, lguest, > > > etc.) code needed for interacting with the vhost misc device so I can > > > get a better idea of how userspace is supposed to work? > > > > Look in archives for kvm@vger.kernel.org. the subject is qemu-kvm: vhost net. > > > > > (Features > > > negotiation, etc.) > > > > > > > That's not yet implemented as there are no features yet. I'm working on > > tap support, which will add a feature bit. Overall, qemu does an ioctl > > to query supported features, and then acks them with another ioctl. I'm > > also trying to avoid duplicating functionality available elsewhere. So > > that to check e.g. TSO support, you'd just look at the underlying > > hardware device you are binding to. > > > > Ok. Do you have plans to support the VIRTIO_NET_F_MRG_RXBUF feature in > the future? I found that this made an enormous improvement in throughput > on my virtio-net <-> virtio-net system. Perhaps it isn't needed with > vhost-net. Yes, I'm working on it. 
> Thanks for replying, > Ira -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-11 15:17 ` Xin, Xiaohui @ 2009-09-13 5:46 ` Michael S. Tsirkin 2009-09-14 5:57 ` Xin, Xiaohui 0 siblings, 1 reply; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-13 5:46 UTC (permalink / raw) To: Xin, Xiaohui Cc: Ira W. Snyder, netdev@vger.kernel.org, virtualization@lists.linux-foundation.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org, akpm@linux-foundation.org, hpa@zytor.com, gregory.haskins@gmail.com, Rusty Russell, s.hetze@linux-ag.com, avi On Fri, Sep 11, 2009 at 11:17:33PM +0800, Xin, Xiaohui wrote: > Michael, > We are very interested in your patch and want to have a try with it. > I have collected your 3 patches in kernel side and 4 patches in queue side. > The patches are listed here: > > PATCHv5-1-3-mm-export-use_mm-unuse_mm-to-modules.patch > PATCHv5-2-3-mm-reduce-atomic-use-on-use_mm-fast-path.patch > PATCHv5-3-3-vhost_net-a-kernel-level-virtio-server.patch > > PATCHv3-1-4-qemu-kvm-move-virtio-pci[1].o-to-near-pci.o.patch > PATCHv3-2-4-virtio-move-features-to-an-inline-function.patch > PATCHv3-3-4-qemu-kvm-vhost-net-implementation.patch > PATCHv3-4-4-qemu-kvm-add-compat-eventfd.patch > > I applied the kernel patches on v2.6.31-rc4 and the qemu patches on latest kvm qemu. > But seems there are some patches are needed at least irqfd and ioeventfd patches on > current qemu. I cannot create a kvm guest with "-net nic,model=virtio,vhost=vethX". > > May you kindly advice us the patch lists all exactly to make it work? > Thanks a lot. :-) > > Thanks > Xiaohui The irqfd/ioeventfd patches are part of Avi's kvm.git tree: git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm.git I expect them to be merged by 2.6.32-rc1 - right, Avi? -- MST -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* RE: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-13 5:46 ` Michael S. Tsirkin @ 2009-09-14 5:57 ` Xin, Xiaohui 2009-09-14 7:05 ` Michael S. Tsirkin 0 siblings, 1 reply; 83+ messages in thread From: Xin, Xiaohui @ 2009-09-14 5:57 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Ira W. Snyder, netdev@vger.kernel.org, virtualization@lists.linux-foundation.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org, akpm@linux-foundation.org, hpa@zytor.com, gregory.haskins@gmail.com, Rusty Russell, s.hetze@linux-ag.com, avi@redhat.com >The irqfd/ioeventfd patches are part of Avi's kvm.git tree: >git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm.git > >I expect them to be merged by 2.6.32-rc1 - right, Avi? Michael, I think I have the kernel patch for kvm_irqfd and kvm_ioeventfd, but missed the qemu side patch for irqfd and ioeventfd. I met the compile error when I compiled virtio-pci.c file in qemu-kvm like this: /root/work/vmdq/vhost/qemu-kvm/hw/virtio-pci.c:384: error: `KVM_IRQFD` undeclared (first use in this function) /root/work/vmdq/vhost/qemu-kvm/hw/virtio-pci.c:400: error: `KVM_IOEVENTFD` undeclared (first use in this function) Which qemu tree or patch do you use for kvm_irqfd and kvm_ioeventfd? Thanks Xiaohui -----Original Message----- From: Michael S. Tsirkin [mailto:mst@redhat.com] Sent: Sunday, September 13, 2009 1:46 PM To: Xin, Xiaohui Cc: Ira W. Snyder; netdev@vger.kernel.org; virtualization@lists.linux-foundation.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org; mingo@elte.hu; linux-mm@kvack.org; akpm@linux-foundation.org; hpa@zytor.com; gregory.haskins@gmail.com; Rusty Russell; s.hetze@linux-ag.com; avi@redhat.com Subject: Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server On Fri, Sep 11, 2009 at 11:17:33PM +0800, Xin, Xiaohui wrote: > Michael, > We are very interested in your patch and want to have a try with it. 
> I have collected your 3 patches in kernel side and 4 patches in queue side. > The patches are listed here: > > PATCHv5-1-3-mm-export-use_mm-unuse_mm-to-modules.patch > PATCHv5-2-3-mm-reduce-atomic-use-on-use_mm-fast-path.patch > PATCHv5-3-3-vhost_net-a-kernel-level-virtio-server.patch > > PATCHv3-1-4-qemu-kvm-move-virtio-pci[1].o-to-near-pci.o.patch > PATCHv3-2-4-virtio-move-features-to-an-inline-function.patch > PATCHv3-3-4-qemu-kvm-vhost-net-implementation.patch > PATCHv3-4-4-qemu-kvm-add-compat-eventfd.patch > > I applied the kernel patches on v2.6.31-rc4 and the qemu patches on latest kvm qemu. > But seems there are some patches are needed at least irqfd and ioeventfd patches on > current qemu. I cannot create a kvm guest with "-net nic,model=virtio,vhost=vethX". > > May you kindly advice us the patch lists all exactly to make it work? > Thanks a lot. :-) > > Thanks > Xiaohui The irqfd/ioeventfd patches are part of Avi's kvm.git tree: git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm.git I expect them to be merged by 2.6.32-rc1 - right, Avi? -- MST ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-14 5:57 ` Xin, Xiaohui @ 2009-09-14 7:05 ` Michael S. Tsirkin 0 siblings, 0 replies; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-14 7:05 UTC (permalink / raw) To: Xin, Xiaohui Cc: Ira W. Snyder, netdev@vger.kernel.org, virtualization@lists.linux-foundation.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org, akpm@linux-foundation.org, hpa@zytor.com, gregory.haskins@gmail.com, Rusty Russell, s.hetze@linux-ag.com, avi@redhat.com On Mon, Sep 14, 2009 at 01:57:06PM +0800, Xin, Xiaohui wrote: > >The irqfd/ioeventfd patches are part of Avi's kvm.git tree: > >git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm.git > > > >I expect them to be merged by 2.6.32-rc1 - right, Avi? > > Michael, > > I think I have the kernel patch for kvm_irqfd and kvm_ioeventfd, but missed the qemu side patch for irqfd and ioeventfd. > > I met the compile error when I compiled virtio-pci.c file in qemu-kvm like this: > > /root/work/vmdq/vhost/qemu-kvm/hw/virtio-pci.c:384: error: `KVM_IRQFD` undeclared (first use in this function) > /root/work/vmdq/vhost/qemu-kvm/hw/virtio-pci.c:400: error: `KVM_IOEVENTFD` undeclared (first use in this function) > > Which qemu tree or patch do you use for kvm_irqfd and kvm_ioeventfd? I'm using the headers from upstream kernel. I'll send a patch for that. > Thanks > Xiaohui > > -----Original Message----- > From: Michael S. Tsirkin [mailto:mst@redhat.com] > Sent: Sunday, September 13, 2009 1:46 PM > To: Xin, Xiaohui > Cc: Ira W. 
Snyder; netdev@vger.kernel.org; virtualization@lists.linux-foundation.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org; mingo@elte.hu; linux-mm@kvack.org; akpm@linux-foundation.org; hpa@zytor.com; gregory.haskins@gmail.com; Rusty Russell; s.hetze@linux-ag.com; avi@redhat.com > Subject: Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server > > On Fri, Sep 11, 2009 at 11:17:33PM +0800, Xin, Xiaohui wrote: > > Michael, > > We are very interested in your patch and want to have a try with it. > > I have collected your 3 patches in kernel side and 4 patches in queue side. > > The patches are listed here: > > > > PATCHv5-1-3-mm-export-use_mm-unuse_mm-to-modules.patch > > PATCHv5-2-3-mm-reduce-atomic-use-on-use_mm-fast-path.patch > > PATCHv5-3-3-vhost_net-a-kernel-level-virtio-server.patch > > > > PATCHv3-1-4-qemu-kvm-move-virtio-pci[1].o-to-near-pci.o.patch > > PATCHv3-2-4-virtio-move-features-to-an-inline-function.patch > > PATCHv3-3-4-qemu-kvm-vhost-net-implementation.patch > > PATCHv3-4-4-qemu-kvm-add-compat-eventfd.patch > > > > I applied the kernel patches on v2.6.31-rc4 and the qemu patches on latest kvm qemu. > > But seems there are some patches are needed at least irqfd and ioeventfd patches on > > current qemu. I cannot create a kvm guest with "-net nic,model=virtio,vhost=vethX". > > > > May you kindly advice us the patch lists all exactly to make it work? > > Thanks a lot. :-) > > > > Thanks > > Xiaohui > > > The irqfd/ioeventfd patches are part of Avi's kvm.git tree: > git://git.kernel.org/pub/scm/linux/kernel/git/avi/kvm.git > > I expect them to be merged by 2.6.32-rc1 - right, Avi? > > -- > MST -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-08 17:20 ` Ira W. Snyder 2009-09-08 20:14 ` Michael S. Tsirkin @ 2009-09-11 16:00 ` Gregory Haskins 2009-09-11 16:14 ` Gregory Haskins 2009-09-13 12:01 ` Michael S. Tsirkin 1 sibling, 2 replies; 83+ messages in thread From: Gregory Haskins @ 2009-09-11 16:00 UTC (permalink / raw) To: Ira W. Snyder Cc: Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze [-- Attachment #1: Type: text/plain, Size: 3146 bytes --] Ira W. Snyder wrote: > On Mon, Sep 07, 2009 at 01:15:37PM +0300, Michael S. Tsirkin wrote: >> On Thu, Sep 03, 2009 at 11:39:45AM -0700, Ira W. Snyder wrote: >>> On Thu, Aug 27, 2009 at 07:07:50PM +0300, Michael S. Tsirkin wrote: >>>> What it is: vhost net is a character device that can be used to reduce >>>> the number of system calls involved in virtio networking. >>>> Existing virtio net code is used in the guest without modification. >>>> >>>> There's similarity with vringfd, with some differences and reduced scope >>>> - uses eventfd for signalling >>>> - structures can be moved around in memory at any time (good for migration) >>>> - support memory table and not just an offset (needed for kvm) >>>> >>>> common virtio related code has been put in a separate file vhost.c and >>>> can be made into a separate module if/when more backends appear. I used >>>> Rusty's lguest.c as the source for developing this part : this supplied >>>> me with witty comments I wouldn't be able to write myself. >>>> >>>> What it is not: vhost net is not a bus, and not a generic new system >>>> call. No assumptions are made on how guest performs hypercalls. >>>> Userspace hypervisors are supported as well as kvm. >>>> >>>> How it works: Basically, we connect virtio frontend (configured by >>>> userspace) to a backend. The backend could be a network device, or a >>>> tun-like device. 
In this version I only support raw socket as a backend, >>>> which can be bound to e.g. SR IOV, or to macvlan device. Backend is >>>> also configured by userspace, including vlan/mac etc. >>>> >>>> Status: >>>> This works for me, and I haven't see any crashes. >>>> I have done some light benchmarking (with v4), compared to userspace, I >>>> see improved latency (as I save up to 4 system calls per packet) but not >>>> bandwidth/CPU (as TSO and interrupt mitigation are not supported). For >>>> ping benchmark (where there's no TSO) troughput is also improved. >>>> >>>> Features that I plan to look at in the future: >>>> - tap support >>>> - TSO >>>> - interrupt mitigation >>>> - zero copy >>>> >>> Hello Michael, >>> >>> I've started looking at vhost with the intention of using it over PCI to >>> connect physical machines together. >>> >>> The part that I am struggling with the most is figuring out which parts >>> of the rings are in the host's memory, and which parts are in the >>> guest's memory. >> All rings are in guest's memory, to match existing virtio code. > > Ok, this makes sense. > >> vhost >> assumes that the memory space of the hypervisor userspace process covers >> the whole of guest memory. > > Is this necessary? Why? The assumption seems very wrong when you're > doing data transport between two physical systems via PCI. FWIW: VBUS handles this situation via the "memctx" abstraction. IOW, the memory is not assumed to be a userspace address. Rather, it is a memctx-specific address, which can be userspace, or any other type (including hardware, dma-engine, etc). As long as the memctx knows how to translate it, it will work. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-11 16:00 ` Gregory Haskins @ 2009-09-11 16:14 ` Gregory Haskins 2009-09-13 12:01 ` Michael S. Tsirkin 1 sibling, 0 replies; 83+ messages in thread From: Gregory Haskins @ 2009-09-11 16:14 UTC (permalink / raw) To: Ira W. Snyder Cc: Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze [-- Attachment #1: Type: text/plain, Size: 1352 bytes --] Gregory Haskins wrote: [snip] > > FWIW: VBUS handles this situation via the "memctx" abstraction. IOW, > the memory is not assumed to be a userspace address. Rather, it is a > memctx-specific address, which can be userspace, or any other type > (including hardware, dma-engine, etc). As long as the memctx knows how > to translate it, it will work. > citations: Here is a packet import (from the perspective of the host side "venet" device model, similar to Michaels "vhost") http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=kernel/vbus/devices/venet-tap.c;h=ee091c47f06e9bb8487a45e72d493273fe08329f;hb=ded8ce2005a85c174ba93ee26f8d67049ef11025#l535 Here is the KVM specific memctx: http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=kernel/vbus/kvm.c;h=56e2c5682a7ca8432c159377b0f7389cf34cbc1b;hb=ded8ce2005a85c174ba93ee26f8d67049ef11025#l188 and http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=virt/kvm/xinterface.c;h=0cccb6095ca2a51bad01f7ba2137fdd9111b63d3;hb=ded8ce2005a85c174ba93ee26f8d67049ef11025#l289 You could alternatively define a memctx for your environment which knows how to deal with your PPC boards PCI based memory, and the devices would all "just work". Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-11 16:00 ` Gregory Haskins 2009-09-11 16:14 ` Gregory Haskins @ 2009-09-13 12:01 ` Michael S. Tsirkin 2009-09-14 16:08 ` Gregory Haskins 1 sibling, 1 reply; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-13 12:01 UTC (permalink / raw) To: Gregory Haskins Cc: Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze On Fri, Sep 11, 2009 at 12:00:21PM -0400, Gregory Haskins wrote: > FWIW: VBUS handles this situation via the "memctx" abstraction. IOW, > the memory is not assumed to be a userspace address. Rather, it is a > memctx-specific address, which can be userspace, or any other type > (including hardware, dma-engine, etc). As long as the memctx knows how > to translate it, it will work. How would permissions be handled? it's easy to allow an app to pass in virtual addresses in its own address space. But we can't let the guest specify physical addresses. -- MST -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-13 12:01 ` Michael S. Tsirkin @ 2009-09-14 16:08 ` Gregory Haskins 2009-09-14 16:47 ` Michael S. Tsirkin 2009-09-14 16:53 ` Michael S. Tsirkin 0 siblings, 2 replies; 83+ messages in thread From: Gregory Haskins @ 2009-09-14 16:08 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze [-- Attachment #1: Type: text/plain, Size: 1453 bytes --] Michael S. Tsirkin wrote: > On Fri, Sep 11, 2009 at 12:00:21PM -0400, Gregory Haskins wrote: >> FWIW: VBUS handles this situation via the "memctx" abstraction. IOW, >> the memory is not assumed to be a userspace address. Rather, it is a >> memctx-specific address, which can be userspace, or any other type >> (including hardware, dma-engine, etc). As long as the memctx knows how >> to translate it, it will work. > > How would permissions be handled? Same as anything else, really. Read on for details. > it's easy to allow an app to pass in virtual addresses in its own address space. Agreed, and this is what I do. The guest always passes its own physical addresses (using things like __pa() in linux). This address passed is memctx specific, but generally would fall into the category of "virtual-addresses" from the host's perspective. For a KVM/AlacrityVM guest example, the addresses are GPAs, accessed internally to the context via a gfn_to_hva conversion (you can see this occurring in the citation links I sent) For Ira's example, the addresses would represent a physical address on the PCI boards, and would follow any kind of relevant rules for converting a "GPA" to a host accessible address (even if indirectly, via a dma controller). > But we can't let the guest specify physical addresses. Agreed. Neither your proposal nor mine operate this way afaict.
HTH Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-14 16:08 ` Gregory Haskins @ 2009-09-14 16:47 ` Michael S. Tsirkin 2009-09-14 19:14 ` Gregory Haskins 2009-09-15 12:32 ` Avi Kivity 2009-09-14 16:53 ` Michael S. Tsirkin 1 sibling, 2 replies; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-14 16:47 UTC (permalink / raw) To: Gregory Haskins Cc: Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze On Mon, Sep 14, 2009 at 12:08:55PM -0400, Gregory Haskins wrote: > For Ira's example, the addresses would represent a physical address on > the PCI boards, and would follow any kind of relevant rules for > converting a "GPA" to a host accessible address (even if indirectly, via > a dma controller). I don't think limiting addresses to PCI physical addresses will work well. From what I remember, Ira's x86 can not initiate burst transactions on PCI, and it's the ppc that initiates all DMA. > > > But we can't let the guest specify physical addresses. > > Agreed. Neither your proposal nor mine operate this way afaict. But this seems to be what Ira needs. > HTH > > Kind Regards, > -Greg > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-14 16:47 ` Michael S. Tsirkin @ 2009-09-14 19:14 ` Gregory Haskins 2009-09-15 12:35 ` Avi Kivity 2009-09-15 12:32 ` Avi Kivity 1 sibling, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-14 19:14 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 1921 bytes --] Michael S. Tsirkin wrote: > On Mon, Sep 14, 2009 at 12:08:55PM -0400, Gregory Haskins wrote: >> For Ira's example, the addresses would represent a physical address on >> the PCI boards, and would follow any kind of relevant rules for >> converting a "GPA" to a host accessible address (even if indirectly, via >> a dma controller). > > I don't think limiting addresses to PCI physical addresses will work > well. The only "limit" is imposed by the memctx. If a given context needs to meet certain requirements beyond PCI physical addresses, it would presumably be designed that way. > From what I rememeber, Ira's x86 can not initiate burst > transactions on PCI, and it's the ppc that initiates all DMA. The only requirement is that the "guest" "owns" the memory. IOW: As with virtio/vhost, the guest can access the pointers in the ring directly but the host must pass through a translation function. Your translation is direct: you use a slots/hva scheme. My translation is abstracted, which means it can support slots/hva (such as in alacrityvm) or some other scheme as long as the general model of "guest owned" holds true. > >>> But we can't let the guest specify physical addresses. >> Agreed. Neither your proposal nor mine operate this way afaict. > > But this seems to be what Ira needs. So what he could do then is implement the memctx to integrate with the ppc side dma controller. E.g. 
"translation" in his box means a protocol from the x86 to the ppc to initiate the dma cycle. This could be exposed as a dma facility in the register file of the ppc boards, for instance. To reiterate, as long as the model is such that the ppc boards are considered the "owner" (direct access, no translation needed) I believe it will work. If the pointers are expected to be owned by the host, then my model doesn't work well either. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-14 19:14 ` Gregory Haskins @ 2009-09-15 12:35 ` Avi Kivity 2009-09-15 13:03 ` Gregory Haskins 0 siblings, 1 reply; 83+ messages in thread From: Avi Kivity @ 2009-09-15 12:35 UTC (permalink / raw) To: Gregory Haskins Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/14/2009 10:14 PM, Gregory Haskins wrote: > To reiterate, as long as the model is such that the ppc boards are > considered the "owner" (direct access, no translation needed) I believe > it will work. If the pointers are expected to be owned by the host, > then my model doesn't work well either. > In this case the x86 is the owner and the ppc boards use translated access. Just switch drivers and device and it falls into place. -- error compiling committee.c: too many arguments to function -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 12:35 ` Avi Kivity @ 2009-09-15 13:03 ` Gregory Haskins 2009-09-15 13:25 ` Avi Kivity 0 siblings, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-15 13:03 UTC (permalink / raw) To: Avi Kivity Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 1754 bytes --] Avi Kivity wrote: > On 09/14/2009 10:14 PM, Gregory Haskins wrote: >> To reiterate, as long as the model is such that the ppc boards are >> considered the "owner" (direct access, no translation needed) I believe >> it will work. If the pointers are expected to be owned by the host, >> then my model doesn't work well either. >> > > In this case the x86 is the owner and the ppc boards use translated > access. Just switch drivers and device and it falls into place. > You could switch vbus roles as well, I suppose. Another potential option is that he can stop mapping host memory on the guest so that it follows the more traditional model. As a bus-master device, the ppc boards should have access to any host memory at least in the GFP_DMA range, which would include all relevant pointers here. I digress: I was primarily addressing the concern that Ira would need to manage the "host" side of the link using hvas mapped from userspace (even if host side is the ppc boards). vbus abstracts that access so as to allow something other than userspace/hva mappings. OTOH, having each ppc board run a userspace app to do the mapping on its behalf and feed it to vhost is probably not a huge deal either. Where vhost might really fall apart is when any assumptions about pageable memory occur, if any. As an aside: a bigger issue is that, iiuc, Ira wants more than a single ethernet channel in his design (multiple ethernets, consoles, etc). A vhost solution in this environment is incomplete. 
Note that Ira's architecture highlights that vbus's explicit management interface is more valuable here than it is in KVM, since KVM already has its own management interface via QEMU. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 13:03 ` Gregory Haskins @ 2009-09-15 13:25 ` Avi Kivity 2009-09-15 13:50 ` Gregory Haskins 0 siblings, 1 reply; 83+ messages in thread From: Avi Kivity @ 2009-09-15 13:25 UTC (permalink / raw) To: Gregory Haskins Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/15/2009 04:03 PM, Gregory Haskins wrote: > >> In this case the x86 is the owner and the ppc boards use translated >> access. Just switch drivers and device and it falls into place. >> >> > You could switch vbus roles as well, I suppose. Right, there's not real difference in this regard. > Another potential > option is that he can stop mapping host memory on the guest so that it > follows the more traditional model. As a bus-master device, the ppc > boards should have access to any host memory at least in the GFP_DMA > range, which would include all relevant pointers here. > > I digress: I was primarily addressing the concern that Ira would need > to manage the "host" side of the link using hvas mapped from userspace > (even if host side is the ppc boards). vbus abstracts that access so as > to allow something other than userspace/hva mappings. OTOH, having each > ppc board run a userspace app to do the mapping on its behalf and feed > it to vhost is probably not a huge deal either. Where vhost might > really fall apart is when any assumptions about pageable memory occur, > if any. > Why? vhost will call get_user_pages() or copy_*_user() which ought to do the right thing. > As an aside: a bigger issue is that, iiuc, Ira wants more than a single > ethernet channel in his design (multiple ethernets, consoles, etc). A > vhost solution in this environment is incomplete. > Why? Instantiate as many vhost-nets as needed. 
> Note that Ira's architecture highlights that vbus's explicit management > interface is more valuable here than it is in KVM, since KVM already has > its own management interface via QEMU. > vhost-net and vbus both need management, vhost-net via ioctls and vbus via configfs. The only difference is the implementation. vhost-net leaves much more to userspace, that's the main difference. -- error compiling committee.c: too many arguments to function -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 13:25 ` Avi Kivity @ 2009-09-15 13:50 ` Gregory Haskins 2009-09-15 14:28 ` Michael S. Tsirkin 2009-09-15 15:03 ` Avi Kivity 0 siblings, 2 replies; 83+ messages in thread From: Gregory Haskins @ 2009-09-15 13:50 UTC (permalink / raw) To: Avi Kivity Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 3263 bytes --] Avi Kivity wrote: > On 09/15/2009 04:03 PM, Gregory Haskins wrote: >> >>> In this case the x86 is the owner and the ppc boards use translated >>> access. Just switch drivers and device and it falls into place. >>> >>> >> You could switch vbus roles as well, I suppose. > > Right, there's not real difference in this regard. > >> Another potential >> option is that he can stop mapping host memory on the guest so that it >> follows the more traditional model. As a bus-master device, the ppc >> boards should have access to any host memory at least in the GFP_DMA >> range, which would include all relevant pointers here. >> >> I digress: I was primarily addressing the concern that Ira would need >> to manage the "host" side of the link using hvas mapped from userspace >> (even if host side is the ppc boards). vbus abstracts that access so as >> to allow something other than userspace/hva mappings. OTOH, having each >> ppc board run a userspace app to do the mapping on its behalf and feed >> it to vhost is probably not a huge deal either. Where vhost might >> really fall apart is when any assumptions about pageable memory occur, >> if any. >> > > Why? vhost will call get_user_pages() or copy_*_user() which ought to > do the right thing. I was speaking generally, not specifically to Ira's architecture. What I mean is that vbus was designed to work without assuming that the memory is pageable. 
There are environments in which the host is not capable of mapping hvas/*page, but the memctx->copy_to/copy_from paradigm could still work (think rdma, for instance). > >> As an aside: a bigger issue is that, iiuc, Ira wants more than a single >> ethernet channel in his design (multiple ethernets, consoles, etc). A >> vhost solution in this environment is incomplete. >> > > Why? Instantiate as many vhost-nets as needed. a) what about non-ethernets? b) what do you suppose this protocol to aggregate the connections would look like? (hint: this is what a vbus-connector does). c) how do you manage the configuration, especially on a per-board basis? > >> Note that Ira's architecture highlights that vbus's explicit management >> interface is more valuable here than it is in KVM, since KVM already has >> its own management interface via QEMU. >> > > vhost-net and vbus both need management, vhost-net via ioctls and vbus > via configfs. Actually I have patches queued to allow vbus to be managed via ioctls as well, per your feedback (and it solves the permissions/lifetime criticisms in alacrityvm-v0.1). > The only difference is the implementation. vhost-net > leaves much more to userspace, that's the main difference. Also, *) vhost is virtio-net specific, whereas vbus is a more generic device model where things like virtio-net or venet ride on top. *) vhost is only designed to work with environments that look very similar to a KVM guest (slot/hva translatable). vbus can bridge various environments by abstracting the key components (such as memory access). *) vhost requires an active userspace management daemon, whereas vbus can be driven by transient components, like scripts (ala udev) Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 13:50 ` Gregory Haskins @ 2009-09-15 14:28 ` Michael S. Tsirkin 2009-09-15 15:03 ` Avi Kivity 1 sibling, 0 replies; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-15 14:28 UTC (permalink / raw) To: Gregory Haskins Cc: Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Tue, Sep 15, 2009 at 09:50:39AM -0400, Gregory Haskins wrote: > Avi Kivity wrote: > > On 09/15/2009 04:03 PM, Gregory Haskins wrote: > >> > >>> In this case the x86 is the owner and the ppc boards use translated > >>> access. Just switch drivers and device and it falls into place. > >>> > >>> > >> You could switch vbus roles as well, I suppose. > > > > Right, there's not real difference in this regard. > > > >> Another potential > >> option is that he can stop mapping host memory on the guest so that it > >> follows the more traditional model. As a bus-master device, the ppc > >> boards should have access to any host memory at least in the GFP_DMA > >> range, which would include all relevant pointers here. > >> > >> I digress: I was primarily addressing the concern that Ira would need > >> to manage the "host" side of the link using hvas mapped from userspace > >> (even if host side is the ppc boards). vbus abstracts that access so as > >> to allow something other than userspace/hva mappings. OTOH, having each > >> ppc board run a userspace app to do the mapping on its behalf and feed > >> it to vhost is probably not a huge deal either. Where vhost might > >> really fall apart is when any assumptions about pageable memory occur, > >> if any. > >> > > > > Why? vhost will call get_user_pages() or copy_*_user() which ought to > > do the right thing. > > I was speaking generally, not specifically to Ira's architecture. What > I mean is that vbus was designed to work without assuming that the > memory is pageable. 
There are environments in which the host is not > capable of mapping hvas/*page, but the memctx->copy_to/copy_from > paradigm could still work (think rdma, for instance). rdma interfaces are typically asynchronous, so blocking copy_from/copy_to can be made to work, but likely won't work that well. DMA might work better if it is asynchronous as well. Assuming a synchronous copy is what we need - maybe the issue is that there aren't good APIs for x86/ppc communication? If so, sticking them in vhost might not be the best place. Maybe the specific platform can redefine copy_to/from_user to do the right thing? Or, maybe add another API for that ... > > > >> As an aside: a bigger issue is that, iiuc, Ira wants more than a single > >> ethernet channel in his design (multiple ethernets, consoles, etc). A > >> vhost solution in this environment is incomplete. > >> > > > > Why? Instantiate as many vhost-nets as needed. > > a) what about non-ethernets? vhost-net actually does not care. the packet is passed on to a socket, we are done. > b) what do you suppose this protocol to aggregate the connections would > look like? (hint: this is what a vbus-connector does). You are talking about management protocol between ppc and x86, right? One wonders why does it have to be in kernel at all. > c) how do you manage the configuration, especially on a per-board basis? not sure what a board is, but configuration is done in userspace. -- MST -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 13:50 ` Gregory Haskins 2009-09-15 14:28 ` Michael S. Tsirkin @ 2009-09-15 15:03 ` Avi Kivity 2009-09-15 20:08 ` Gregory Haskins 1 sibling, 1 reply; 83+ messages in thread From: Avi Kivity @ 2009-09-15 15:03 UTC (permalink / raw) To: Gregory Haskins Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/15/2009 04:50 PM, Gregory Haskins wrote: >> Why? vhost will call get_user_pages() or copy_*_user() which ought to >> do the right thing. >> > I was speaking generally, not specifically to Ira's architecture. What > I mean is that vbus was designed to work without assuming that the > memory is pageable. There are environments in which the host is not > capable of mapping hvas/*page, but the memctx->copy_to/copy_from > paradigm could still work (think rdma, for instance). > Sure, vbus is more flexible here. >>> As an aside: a bigger issue is that, iiuc, Ira wants more than a single >>> ethernet channel in his design (multiple ethernets, consoles, etc). A >>> vhost solution in this environment is incomplete. >>> >>> >> Why? Instantiate as many vhost-nets as needed. >> > a) what about non-ethernets? > There's virtio-console, virtio-blk etc. None of these have kernel-mode servers, but these could be implemented if/when needed. > b) what do you suppose this protocol to aggregate the connections would > look like? (hint: this is what a vbus-connector does). > You mean multilink? You expose the device as a multiqueue. > c) how do you manage the configuration, especially on a per-board basis? > pci (for kvm/x86). > Actually I have patches queued to allow vbus to be managed via ioctls as > well, per your feedback (and it solves the permissions/lifetime > critisims in alacrityvm-v0.1). > That will make qemu integration easier. >> The only difference is the implementation. 
vhost-net >> leaves much more to userspace, that's the main difference. >> > Also, > > *) vhost is virtio-net specific, whereas vbus is a more generic device > model where thing like virtio-net or venet ride on top. > I think vhost-net is separated into vhost and vhost-net. > *) vhost is only designed to work with environments that look very > similar to a KVM guest (slot/hva translatable). vbus can bridge various > environments by abstracting the key components (such as memory access). > Yes. virtio is really virtualization oriented. > *) vhost requires an active userspace management daemon, whereas vbus > can be driven by transient components, like scripts (ala udev) > vhost by design leaves configuration and handshaking to userspace. I see it as an advantage. -- error compiling committee.c: too many arguments to function -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 15:03 ` Avi Kivity @ 2009-09-15 20:08 ` Gregory Haskins 2009-09-15 20:40 ` Michael S. Tsirkin 2009-09-16 8:23 ` Avi Kivity 0 siblings, 2 replies; 83+ messages in thread From: Gregory Haskins @ 2009-09-15 20:08 UTC (permalink / raw) To: Avi Kivity Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 6099 bytes --] Avi Kivity wrote: > On 09/15/2009 04:50 PM, Gregory Haskins wrote: >>> Why? vhost will call get_user_pages() or copy_*_user() which ought to >>> do the right thing. >>> >> I was speaking generally, not specifically to Ira's architecture. What >> I mean is that vbus was designed to work without assuming that the >> memory is pageable. There are environments in which the host is not >> capable of mapping hvas/*page, but the memctx->copy_to/copy_from >> paradigm could still work (think rdma, for instance). >> > > Sure, vbus is more flexible here. > >>>> As an aside: a bigger issue is that, iiuc, Ira wants more than a single >>>> ethernet channel in his design (multiple ethernets, consoles, etc). A >>>> vhost solution in this environment is incomplete. >>>> >>>> >>> Why? Instantiate as many vhost-nets as needed. >>> >> a) what about non-ethernets? >> > > There's virtio-console, virtio-blk etc. None of these have kernel-mode > servers, but these could be implemented if/when needed. IIUC, Ira already needs at least ethernet and console capability. > >> b) what do you suppose this protocol to aggregate the connections would >> look like? (hint: this is what a vbus-connector does). >> > > You mean multilink? You expose the device as a multiqueue. No, what I mean is how do you surface multiple ethernet and consoles to the guests? 
For Ira's case, I think he needs at minimum at least one of each, and he mentioned possibly having two unique ethernets at one point. His slave boards surface themselves as PCI devices to the x86 host. So how do you use that to make multiple vhost-based devices (say two virtio-nets, and a virtio-console) communicate across the transport? There are multiple ways to do this, but what I am saying is that whatever is conceived will start to look eerily like a vbus-connector, since this is one of its primary purposes ;) > >> c) how do you manage the configuration, especially on a per-board basis? >> > > pci (for kvm/x86). Ok, for kvm understood (and I would also add "qemu" to that mix). But we are talking about vhost's application in a non-kvm environment here, right?. So if the vhost-X devices are in the "guest", and the x86 board is just a slave...How do you tell each ppc board how many devices and what config (e.g. MACs, etc) to instantiate? Do you assume that they should all be symmetric and based on positional (e.g. slot) data? What if you want asymmetric configurations (if not here, perhaps in a different environment)? > >> Actually I have patches queued to allow vbus to be managed via ioctls as >> well, per your feedback (and it solves the permissions/lifetime >> critisims in alacrityvm-v0.1). >> > > That will make qemu integration easier. > >>> The only difference is the implementation. vhost-net >>> leaves much more to userspace, that's the main difference. >>> >> Also, >> >> *) vhost is virtio-net specific, whereas vbus is a more generic device >> model where thing like virtio-net or venet ride on top. >> > > I think vhost-net is separated into vhost and vhost-net. Thats good. > >> *) vhost is only designed to work with environments that look very >> similar to a KVM guest (slot/hva translatable). vbus can bridge various >> environments by abstracting the key components (such as memory access). >> > > Yes. virtio is really virtualization oriented. 
I would say that it's vhost in particular that is virtualization oriented. virtio, as a concept, generally should work in physical systems, if perhaps with some minor modifications. The biggest "limit" is having "virt" in its name ;) > >> *) vhost requires an active userspace management daemon, whereas vbus >> can be driven by transient components, like scripts (ala udev) >> > > vhost by design leaves configuration and handshaking to userspace. I > see it as an advantage. The misconception here is that vbus by design _doesn't define_ where configuration/handshaking happens. It is primarily implemented by a modular component called a "vbus-connector", and _I_ see this flexibility as an advantage. vhost on the other hand depends on an active userspace component and a slots/hva memory design, which is more limiting in where it can be used and forces you to split the logic. However, I think we both more or less agree on this point already. For the record, vbus itself is simply a resource container for virtual-devices, which provides abstractions for the various points of interest to generalizing PV (memory, signals, etc) and the proper isolation and protection guarantees. What you do with it is defined by the modular virtual-devices (e.g. virtio-net, venet, sched, hrt, scsi, rdma, etc) and vbus-connectors (vbus-kvm, etc) you plug into it. As an example, you could emulate the vhost design in vbus by writing a "vbus-vhost" connector. This connector would be very thin and terminate locally in QEMU. It would provide an ioctl-based verb namespace similar to the existing vhost verbs we have today. QEMU would then similarly reflect the vbus-based virtio device as a PCI device to the guest, so that virtio-pci works unmodified. You would then have most of the advantages of the work I have done for commoditizing/abstracting the key points for in-kernel PV, like the memctx.
In addition, much of the work could be reused in multiple environments since any vbus-compliant device model that is plugged into the framework would work with any connector that is plugged in (e.g. vbus-kvm (alacrityvm), vbus-vhost (KVM), and "vbus-ira"). The only tradeoff is in features offered by the connector (e.g. vbus-vhost has the advantage that existing PV guests can continue to work unmodified, vbus-kvm has the advantage that it supports new features like generic shared memory, non-virtio based devices, prioritizable interrupts, no dependencies on PCI for non PCI guests, etc). Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 20:08 ` Gregory Haskins @ 2009-09-15 20:40 ` Michael S. Tsirkin 2009-09-15 20:43 ` Gregory Haskins 2009-09-16 8:23 ` Avi Kivity 1 sibling, 1 reply; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-15 20:40 UTC (permalink / raw) To: Gregory Haskins Cc: Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Tue, Sep 15, 2009 at 04:08:23PM -0400, Gregory Haskins wrote: > No, what I mean is how do you surface multiple ethernet and consoles to > the guests? For Ira's case, I think he needs at minimum at least one of > each, and he mentioned possibly having two unique ethernets at one point. > > His slave boards surface themselves as PCI devices to the x86 > host. So how do you use that to make multiple vhost-based devices (say > two virtio-nets, and a virtio-console) communicate across the transport? > > There are multiple ways to do this, but what I am saying is that > whatever is conceived will start to look eerily like a vbus-connector, > since this is one of its primary purposes ;) Can't all this be in userspace? -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 20:40 ` Michael S. Tsirkin @ 2009-09-15 20:43 ` Gregory Haskins 2009-09-15 21:25 ` Michael S. Tsirkin 0 siblings, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-15 20:43 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 830 bytes --] Michael S. Tsirkin wrote: > On Tue, Sep 15, 2009 at 04:08:23PM -0400, Gregory Haskins wrote: >> No, what I mean is how do you surface multiple ethernet and consoles to >> the guests? For Ira's case, I think he needs at minimum at least one of >> each, and he mentioned possibly having two unique ethernets at one point. >> >> His slave boards surface themselves as PCI devices to the x86 >> host. So how do you use that to make multiple vhost-based devices (say >> two virtio-nets, and a virtio-console) communicate across the transport? >> >> There are multiple ways to do this, but what I am saying is that >> whatever is conceived will start to look eerily like a vbus-connector, >> since this is one of its primary purposes ;) > > Can't all this be in userspace? Can you outline your proposal? -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 20:43 ` Gregory Haskins @ 2009-09-15 21:25 ` Michael S. Tsirkin 2009-09-15 21:39 ` Gregory Haskins 2009-09-16 14:57 ` Arnd Bergmann 0 siblings, 2 replies; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-15 21:25 UTC (permalink / raw) To: Gregory Haskins Cc: Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Tue, Sep 15, 2009 at 04:43:58PM -0400, Gregory Haskins wrote: > Michael S. Tsirkin wrote: > > On Tue, Sep 15, 2009 at 04:08:23PM -0400, Gregory Haskins wrote: > >> No, what I mean is how do you surface multiple ethernet and consoles to > >> the guests? For Ira's case, I think he needs at minimum at least one of > >> each, and he mentioned possibly having two unique ethernets at one point. > >> > >> His slave boards surface themselves as PCI devices to the x86 > >> host. So how do you use that to make multiple vhost-based devices (say > >> two virtio-nets, and a virtio-console) communicate across the transport? > >> > >> There are multiple ways to do this, but what I am saying is that > >> whatever is conceived will start to look eerily like a vbus-connector, > >> since this is one of its primary purposes ;) > > > > Can't all this be in userspace? > > Can you outline your proposal? > > -Greg > Userspace in x86 maps a PCI region, uses it for communication with ppc? -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 21:25 ` Michael S. Tsirkin @ 2009-09-15 21:39 ` Gregory Haskins 2009-09-15 21:38 ` Michael S. Tsirkin 2009-09-16 14:57 ` Arnd Bergmann 1 sibling, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-15 21:39 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 1112 bytes --] Michael S. Tsirkin wrote: > On Tue, Sep 15, 2009 at 04:43:58PM -0400, Gregory Haskins wrote: >> Michael S. Tsirkin wrote: >>> On Tue, Sep 15, 2009 at 04:08:23PM -0400, Gregory Haskins wrote: >>>> No, what I mean is how do you surface multiple ethernet and consoles to >>>> the guests? For Ira's case, I think he needs at minimum at least one of >>>> each, and he mentioned possibly having two unique ethernets at one point. >>>> >>>> His slave boards surface themselves as PCI devices to the x86 >>>> host. So how do you use that to make multiple vhost-based devices (say >>>> two virtio-nets, and a virtio-console) communicate across the transport? >>>> >>>> There are multiple ways to do this, but what I am saying is that >>>> whatever is conceived will start to look eerily like a vbus-connector, >>>> since this is one of its primary purposes ;) >>> Can't all this be in userspace? >> Can you outline your proposal? >> >> -Greg >> > > Userspace in x86 maps a PCI region, uses it for communication with ppc? > And what do you propose this communication to look like? -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 21:39 ` Gregory Haskins @ 2009-09-15 21:38 ` Michael S. Tsirkin 2009-09-15 21:55 ` Gregory Haskins 0 siblings, 1 reply; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-15 21:38 UTC (permalink / raw) To: Gregory Haskins Cc: Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Tue, Sep 15, 2009 at 05:39:27PM -0400, Gregory Haskins wrote: > Michael S. Tsirkin wrote: > > On Tue, Sep 15, 2009 at 04:43:58PM -0400, Gregory Haskins wrote: > >> Michael S. Tsirkin wrote: > >>> On Tue, Sep 15, 2009 at 04:08:23PM -0400, Gregory Haskins wrote: > >>>> No, what I mean is how do you surface multiple ethernet and consoles to > >>>> the guests? For Ira's case, I think he needs at minimum at least one of > >>>> each, and he mentioned possibly having two unique ethernets at one point. > >>>> > >>>> His slave boards surface themselves as PCI devices to the x86 > >>>> host. So how do you use that to make multiple vhost-based devices (say > >>>> two virtio-nets, and a virtio-console) communicate across the transport? > >>>> > >>>> There are multiple ways to do this, but what I am saying is that > >>>> whatever is conceived will start to look eerily like a vbus-connector, > >>>> since this is one of its primary purposes ;) > >>> Can't all this be in userspace? > >> Can you outline your proposal? > >> > >> -Greg > >> > > > > Userspace in x86 maps a PCI region, uses it for communication with ppc? > > > > And what do you propose this communication to look like? Who cares? Implement vbus protocol there if you like. > -Greg > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 21:38 ` Michael S. Tsirkin @ 2009-09-15 21:55 ` Gregory Haskins 0 siblings, 0 replies; 83+ messages in thread From: Gregory Haskins @ 2009-09-15 21:55 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 2304 bytes --] Michael S. Tsirkin wrote: > On Tue, Sep 15, 2009 at 05:39:27PM -0400, Gregory Haskins wrote: >> Michael S. Tsirkin wrote: >>> On Tue, Sep 15, 2009 at 04:43:58PM -0400, Gregory Haskins wrote: >>>> Michael S. Tsirkin wrote: >>>>> On Tue, Sep 15, 2009 at 04:08:23PM -0400, Gregory Haskins wrote: >>>>>> No, what I mean is how do you surface multiple ethernet and consoles to >>>>>> the guests? For Ira's case, I think he needs at minimum at least one of >>>>>> each, and he mentioned possibly having two unique ethernets at one point. >>>>>> >>>>>> His slave boards surface themselves as PCI devices to the x86 >>>>>> host. So how do you use that to make multiple vhost-based devices (say >>>>>> two virtio-nets, and a virtio-console) communicate across the transport? >>>>>> >>>>>> There are multiple ways to do this, but what I am saying is that >>>>>> whatever is conceived will start to look eerily like a vbus-connector, >>>>>> since this is one of its primary purposes ;) >>>>> Can't all this be in userspace? >>>> Can you outline your proposal? >>>> >>>> -Greg >>>> >>> Userspace in x86 maps a PCI region, uses it for communication with ppc? >>> >> And what do you propose this communication to look like? > > Who cares? Implement vbus protocol there if you like. > Exactly. My point is that you need something like a vbus protocol there. 
;) Here is the protocol I run over PCI in AlacrityVM: http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=include/linux/vbus_pci.h;h=fe337590e644017392e4c9d9236150adb2333729;hb=ded8ce2005a85c174ba93ee26f8d67049ef11025 And I guess to your point, yes the protocol can technically be in userspace (outside of whatever you need for the in-kernel portion of the communication transport, if any). The vbus-connector design does not specify where the protocol needs to take place, per se. Note, however, for performance reasons some parts of the protocol may want to be in the kernel (such as DEVCALL and SHMSIGNAL). It is for this reason that I just run all of it there, because IMO it's simpler than splitting it up. The slow path stuff just rides on infrastructure that I need for fast-path anyway, so it doesn't really cost me anything additional. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 21:25 ` Michael S. Tsirkin 2009-09-15 21:39 ` Gregory Haskins @ 2009-09-16 14:57 ` Arnd Bergmann 2009-09-16 15:13 ` Michael S. Tsirkin 1 sibling, 1 reply; 83+ messages in thread From: Arnd Bergmann @ 2009-09-16 14:57 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Gregory Haskins, Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Tuesday 15 September 2009, Michael S. Tsirkin wrote: > Userspace in x86 maps a PCI region, uses it for communication with ppc? This might have portability issues. On x86 it should work, but if the host is powerpc or similar, you cannot reliably access PCI I/O memory through copy_tofrom_user but have to use memcpy_toio/fromio or readl/writel calls, which don't work on user pointers. Specifically on powerpc, copy_from_user cannot access unaligned buffers if they are on an I/O mapping. Arnd <>< -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-16 14:57 ` Arnd Bergmann @ 2009-09-16 15:13 ` Michael S. Tsirkin 2009-09-16 15:22 ` Arnd Bergmann 0 siblings, 1 reply; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-16 15:13 UTC (permalink / raw) To: Arnd Bergmann Cc: Gregory Haskins, Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Wed, Sep 16, 2009 at 04:57:42PM +0200, Arnd Bergmann wrote: > On Tuesday 15 September 2009, Michael S. Tsirkin wrote: > > Userspace in x86 maps a PCI region, uses it for communication with ppc? > > This might have portability issues. On x86 it should work, but if the > host is powerpc or similar, you cannot reliably access PCI I/O memory > through copy_tofrom_user but have to use memcpy_toio/fromio or readl/writel > calls, which don't work on user pointers. > > Specifically on powerpc, copy_from_user cannot access unaligned buffers > if they are on an I/O mapping. > > Arnd <>< We are talking about doing this in userspace, not in kernel. -- MST -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-16 15:13 ` Michael S. Tsirkin @ 2009-09-16 15:22 ` Arnd Bergmann 2009-09-16 16:08 ` Michael S. Tsirkin 0 siblings, 1 reply; 83+ messages in thread From: Arnd Bergmann @ 2009-09-16 15:22 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Gregory Haskins, Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Wednesday 16 September 2009, Michael S. Tsirkin wrote: > On Wed, Sep 16, 2009 at 04:57:42PM +0200, Arnd Bergmann wrote: > > On Tuesday 15 September 2009, Michael S. Tsirkin wrote: > > > Userspace in x86 maps a PCI region, uses it for communication with ppc? > > > > This might have portability issues. On x86 it should work, but if the > > host is powerpc or similar, you cannot reliably access PCI I/O memory > > through copy_tofrom_user but have to use memcpy_toio/fromio or readl/writel > > calls, which don't work on user pointers. > > > > Specifically on powerpc, copy_from_user cannot access unaligned buffers > > if they are on an I/O mapping. > > > We are talking about doing this in userspace, not in kernel. Ok, that's fine then. I thought the idea was to use the vhost_net driver to access the user memory, which would be a really cute hack otherwise, as you'd only need to provide the eventfds from a hardware specific driver and could use the regular virtio_net on the other side. Arnd <>< -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-16 15:22 ` Arnd Bergmann @ 2009-09-16 16:08 ` Michael S. Tsirkin 0 siblings, 0 replies; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-16 16:08 UTC (permalink / raw) To: Arnd Bergmann Cc: Gregory Haskins, Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Wed, Sep 16, 2009 at 05:22:37PM +0200, Arnd Bergmann wrote: > On Wednesday 16 September 2009, Michael S. Tsirkin wrote: > > On Wed, Sep 16, 2009 at 04:57:42PM +0200, Arnd Bergmann wrote: > > > On Tuesday 15 September 2009, Michael S. Tsirkin wrote: > > > > Userspace in x86 maps a PCI region, uses it for communication with ppc? > > > > > > This might have portability issues. On x86 it should work, but if the > > > host is powerpc or similar, you cannot reliably access PCI I/O memory > > > through copy_tofrom_user but have to use memcpy_toio/fromio or readl/writel > > > calls, which don't work on user pointers. > > > > > > Specifically on powerpc, copy_from_user cannot access unaligned buffers > > > if they are on an I/O mapping. > > > > > We are talking about doing this in userspace, not in kernel. > > Ok, that's fine then. I thought the idea was to use the vhost_net driver It's a separate issue. We were talking generally about configuration and setup. Gregory implemented it in kernel, Avi wants it moved to userspace, with only fastpath in kernel. > to access the user memory, which would be a really cute hack otherwise, > as you'd only need to provide the eventfds from a hardware specific > driver and could use the regular virtio_net on the other side. > > Arnd <>< To do that, maybe copy to user on ppc can be fixed, or wrapped around in a arch specific macro, so that everyone else does not have to go through abstraction layers. -- MST -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. 
For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-15 20:08 ` Gregory Haskins 2009-09-15 20:40 ` Michael S. Tsirkin @ 2009-09-16 8:23 ` Avi Kivity 2009-09-16 11:44 ` Gregory Haskins 1 sibling, 1 reply; 83+ messages in thread From: Avi Kivity @ 2009-09-16 8:23 UTC (permalink / raw) To: Gregory Haskins Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/15/2009 11:08 PM, Gregory Haskins wrote: > >> There's virtio-console, virtio-blk etc. None of these have kernel-mode >> servers, but these could be implemented if/when needed. >> > IIUC, Ira already needs at least ethernet and console capability. > > He's welcome to pick up the necessary code from qemu. >>> b) what do you suppose this protocol to aggregate the connections would >>> look like? (hint: this is what a vbus-connector does). >>> >>> >> You mean multilink? You expose the device as a multiqueue. >> > No, what I mean is how do you surface multiple ethernet and consoles to > the guests? For Ira's case, I think he needs at minimum at least one of > each, and he mentioned possibly having two unique ethernets at one point. > You instantiate multiple vhost-nets. Multiple ethernet NICs is a supported configuration for kvm. > His slave boards surface themselves as PCI devices to the x86 > host. So how do you use that to make multiple vhost-based devices (say > two virtio-nets, and a virtio-console) communicate across the transport? > I don't really see the difference between 1 and N here. > There are multiple ways to do this, but what I am saying is that > whatever is conceived will start to look eerily like a vbus-connector, > since this is one of its primary purposes ;) > I'm not sure if you're talking about the configuration interface or data path here. >>> c) how do you manage the configuration, especially on a per-board basis? >>> >>> >> pci (for kvm/x86). 
>> > Ok, for kvm understood (and I would also add "qemu" to that mix). But > we are talking about vhost's application in a non-kvm environment here, > right?. > > So if the vhost-X devices are in the "guest", They aren't in the "guest". The best way to look at it is - a device side, with a dma engine: vhost-net - a driver side, only accessing its own memory: virtio-net Given that Ira's config has the dma engine in the ppc boards, that's where vhost-net would live (the ppc boards acting as NICs to the x86 board, essentially). > and the x86 board is just > a slave...How do you tell each ppc board how many devices and what > config (e.g. MACs, etc) to instantiate? Do you assume that they should > all be symmetric and based on positional (e.g. slot) data? What if you > want asymmetric configurations (if not here, perhaps in a different > environment)? > I have no idea, that's for Ira to solve. If he could fake the PCI config space as seen by the x86 board, he would just show the normal pci config and use virtio-pci (multiple channels would show up as a multifunction device). Given he can't, he needs to tunnel the virtio config space some other way. >> Yes. virtio is really virtualization oriented. >> > I would say that its vhost in particular that is virtualization > oriented. virtio, as a concept, generally should work in physical > systems, if perhaps with some minor modifications. The biggest "limit" > is having "virt" in its name ;) > Let me rephrase. The virtio developers are virtualization oriented. If it works for non-virt applications, that's good, but not a design goal. -- error compiling committee.c: too many arguments to function -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-16 8:23 ` Avi Kivity @ 2009-09-16 11:44 ` Gregory Haskins 2009-09-16 13:05 ` Avi Kivity 0 siblings, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-16 11:44 UTC (permalink / raw) To: Avi Kivity Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 5873 bytes --] Avi Kivity wrote: > On 09/15/2009 11:08 PM, Gregory Haskins wrote: >> >>> There's virtio-console, virtio-blk etc. None of these have kernel-mode >>> servers, but these could be implemented if/when needed. >>> >> IIUC, Ira already needs at least ethernet and console capability. >> >> > > He's welcome to pick up the necessary code from qemu. The problem isn't where to find the models...the problem is how to aggregate multiple models to the guest. > >>>> b) what do you suppose this protocol to aggregate the connections would >>>> look like? (hint: this is what a vbus-connector does). >>>> >>>> >>> You mean multilink? You expose the device as a multiqueue. >>> >> No, what I mean is how do you surface multiple ethernet and consoles to >> the guests? For Ira's case, I think he needs at minimum at least one of >> each, and he mentioned possibly having two unique ethernets at one point. >> > > You instantiate multiple vhost-nets. Multiple ethernet NICs is a > supported configuration for kvm. But this is not KVM. > >> His slave boards surface themselves as PCI devices to the x86 >> host. So how do you use that to make multiple vhost-based devices (say >> two virtio-nets, and a virtio-console) communicate across the transport? >> > > I don't really see the difference between 1 and N here. A KVM surfaces N virtio-devices as N pci-devices to the guest. What do we do in Ira's case where the entire guest represents itself as a PCI device to the host, and nothing the other way around? 
> >> There are multiple ways to do this, but what I am saying is that >> whatever is conceived will start to look eerily like a vbus-connector, >> since this is one of its primary purposes ;) >> > > I'm not sure if you're talking about the configuration interface or data > path here. I am talking about how we would tunnel the config space for N devices across his transport. As an aside, the vbus-kvm connector makes them one and the same, but they do not have to be. Its all in the connector design. > >>>> c) how do you manage the configuration, especially on a per-board >>>> basis? >>>> >>>> >>> pci (for kvm/x86). >>> >> Ok, for kvm understood (and I would also add "qemu" to that mix). But >> we are talking about vhost's application in a non-kvm environment here, >> right?. >> >> So if the vhost-X devices are in the "guest", > > They aren't in the "guest". The best way to look at it is > > - a device side, with a dma engine: vhost-net > - a driver side, only accessing its own memory: virtio-net > > Given that Ira's config has the dma engine in the ppc boards, that's > where vhost-net would live (the ppc boards acting as NICs to the x86 > board, essentially). That sounds convenient given his hardware, but it has its own set of problems. For one, the configuration/inventory of these boards is now driven by the wrong side and has to be addressed. Second, the role reversal will likely not work for many models other than ethernet (e.g. virtio-console or virtio-blk drivers running on the x86 board would be naturally consuming services from the slave boards...virtio-net is an exception because 802.x is generally symmetrical). IIUC, vbus would support having the device models live properly on the x86 side, solving both of these problems. It would be impossible to reverse vhost given its current design. > >> and the x86 board is just >> a slave...How do you tell each ppc board how many devices and what >> config (e.g. MACs, etc) to instantiate? 
Do you assume that they should >> all be symmetric and based on positional (e.g. slot) data? What if you >> want asymmetric configurations (if not here, perhaps in a different >> environment)? >> > > I have no idea, that's for Ira to solve. Bingo. Thus my statement that the vhost proposal is incomplete. You have the virtio-net and vhost-net pieces covering the fast-path end-points, but nothing in the middle (transport, aggregation, config-space), and nothing on the management-side. vbus provides most of the other pieces, and can even support the same virtio-net protocol on top. The remaining part would be something like a udev script to populate the vbus with devices on board-insert events. > If he could fake the PCI > config space as seen by the x86 board, he would just show the normal pci > config and use virtio-pci (multiple channels would show up as a > multifunction device). Given he can't, he needs to tunnel the virtio > config space some other way. Right, and note that vbus was designed to solve this. This tunneling can, of course, be done without vbus using some other design. However, whatever solution is created will look incredibly close to what I've already done, so my point is "why reinvent it"? > >>> Yes. virtio is really virtualization oriented. >>> >> I would say that its vhost in particular that is virtualization >> oriented. virtio, as a concept, generally should work in physical >> systems, if perhaps with some minor modifications. The biggest "limit" >> is having "virt" in its name ;) >> > > Let me rephrase. The virtio developers are virtualization oriented. If > it works for non-virt applications, that's good, but not a design goal. > Fair enough. Vbus was designed to support both HW and virt (as well as other models, like containers), including tunneling virtio within those environments. That is probably why IMO vbus is a better fit than vhost here. (FWIW: I would love to see vhost use the vbus framework, then we all win. 
You can do this and still retain virtio-pci compatibility (at least theoretically). I am still open to working with the team on this). Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-16 11:44 ` Gregory Haskins @ 2009-09-16 13:05 ` Avi Kivity 2009-09-16 14:10 ` Gregory Haskins 0 siblings, 1 reply; 83+ messages in thread From: Avi Kivity @ 2009-09-16 13:05 UTC (permalink / raw) To: Gregory Haskins Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/16/2009 02:44 PM, Gregory Haskins wrote: > The problem isn't where to find the models...the problem is how to > aggregate multiple models to the guest. > You mean configuration? >> You instantiate multiple vhost-nets. Multiple ethernet NICs is a >> supported configuration for kvm. >> > But this is not KVM. > > If kvm can do it, others can. >>> His slave boards surface themselves as PCI devices to the x86 >>> host. So how do you use that to make multiple vhost-based devices (say >>> two virtio-nets, and a virtio-console) communicate across the transport? >>> >>> >> I don't really see the difference between 1 and N here. >> > A KVM surfaces N virtio-devices as N pci-devices to the guest. What do > we do in Ira's case where the entire guest represents itself as a PCI > device to the host, and nothing the other way around? > There is no guest and host in this scenario. There's a device side (ppc) and a driver side (x86). The driver side can access configuration information on the device side. How to multiplex multiple devices is an interesting exercise for whoever writes the virtio binding for that setup. >>> There are multiple ways to do this, but what I am saying is that >>> whatever is conceived will start to look eerily like a vbus-connector, >>> since this is one of its primary purposes ;) >>> >>> >> I'm not sure if you're talking about the configuration interface or data >> path here. >> > I am talking about how we would tunnel the config space for N devices > across his transport. > Sounds trivial. 
Write an address containing the device number and register number to one location, read or write data from another. Just like the PCI cf8/cfc interface. >> They aren't in the "guest". The best way to look at it is >> >> - a device side, with a dma engine: vhost-net >> - a driver side, only accessing its own memory: virtio-net >> >> Given that Ira's config has the dma engine in the ppc boards, that's >> where vhost-net would live (the ppc boards acting as NICs to the x86 >> board, essentially). >> > That sounds convenient given his hardware, but it has its own set of > problems. For one, the configuration/inventory of these boards is now > driven by the wrong side and has to be addressed. Why is it the wrong side? > Second, the role > reversal will likely not work for many models other than ethernet (e.g. > virtio-console or virtio-blk drivers running on the x86 board would be > naturally consuming services from the slave boards...virtio-net is an > exception because 802.x is generally symmetrical). > There is no role reversal. The side doing dma is the device, the side accessing its own memory is the driver. Just like the other 1e12 driver/device pairs out there. >> I have no idea, that's for Ira to solve. >> > Bingo. Thus my statement that the vhost proposal is incomplete. You > have the virtio-net and vhost-net pieces covering the fast-path > end-points, but nothing in the middle (transport, aggregation, > config-space), and nothing on the management-side. vbus provides most > of the other pieces, and can even support the same virtio-net protocol > on top. The remaining part would be something like a udev script to > populate the vbus with devices on board-insert events. > Of course vhost is incomplete, in the same sense that Linux is incomplete. Both require userspace. >> If he could fake the PCI >> config space as seen by the x86 board, he would just show the normal pci >> config and use virtio-pci (multiple channels would show up as a >> multifunction device).
Given he can't, he needs to tunnel the virtio >> config space some other way. >> > Right, and note that vbus was designed to solve this. This tunneling > can, of course, be done without vbus using some other design. However, > whatever solution is created will look incredibly close to what I've > already done, so my point is "why reinvent it"? > virtio requires binding for this tunnelling, so does vbus. Its the same problem with the same solution. -- error compiling committee.c: too many arguments to function -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-16 13:05 ` Avi Kivity @ 2009-09-16 14:10 ` Gregory Haskins 2009-09-16 15:59 ` Avi Kivity 2009-09-17 3:57 ` Michael S. Tsirkin 0 siblings, 2 replies; 83+ messages in thread From: Gregory Haskins @ 2009-09-16 14:10 UTC (permalink / raw) To: Avi Kivity Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 6777 bytes --] Avi Kivity wrote: > On 09/16/2009 02:44 PM, Gregory Haskins wrote: >> The problem isn't where to find the models...the problem is how to >> aggregate multiple models to the guest. >> > > You mean configuration? > >>> You instantiate multiple vhost-nets. Multiple ethernet NICs is a >>> supported configuration for kvm. >>> >> But this is not KVM. >> >> > > If kvm can do it, others can. The problem is that you seem to either hand-wave over details like this, or you give details that are pretty much exactly what vbus does already. My point is that I've already sat down and thought about these issues and solved them in a freely available GPL'ed software package. So the question is: is your position that vbus is all wrong and you wish to create a new bus-like thing to solve the problem? If so, how is it different from what Ive already done? More importantly, what specific objections do you have to what Ive done, as perhaps they can be fixed instead of starting over? > >>>> His slave boards surface themselves as PCI devices to the x86 >>>> host. So how do you use that to make multiple vhost-based devices (say >>>> two virtio-nets, and a virtio-console) communicate across the >>>> transport? >>>> >>>> >>> I don't really see the difference between 1 and N here. >>> >> A KVM surfaces N virtio-devices as N pci-devices to the guest. 
What do >> we do in Ira's case where the entire guest represents itself as a PCI >> device to the host, and nothing the other way around? >> > > There is no guest and host in this scenario. There's a device side > (ppc) and a driver side (x86). The driver side can access configuration > information on the device side. How to multiplex multiple devices is an > interesting exercise for whoever writes the virtio binding for that setup. Bingo. So now its a question of do you want to write this layer from scratch, or re-use my framework. > >>>> There are multiple ways to do this, but what I am saying is that >>>> whatever is conceived will start to look eerily like a vbus-connector, >>>> since this is one of its primary purposes ;) >>>> >>>> >>> I'm not sure if you're talking about the configuration interface or data >>> path here. >>> >> I am talking about how we would tunnel the config space for N devices >> across his transport. >> > > Sounds trivial. No one said it was rocket science. But it does need to be designed and implemented end-to-end, much of which Ive already done in what I hope is an extensible way. > Write an address containing the device number and > register number to on location, read or write data from another. You mean like the "u64 devh", and "u32 func" fields I have here for the vbus-kvm connector? http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=include/linux/vbus_pci.h;h=fe337590e644017392e4c9d9236150adb2333729;hb=ded8ce2005a85c174ba93ee26f8d67049ef11025#l64 > Just > like the PCI cf8/cfc interface. > >>> They aren't in the "guest". The best way to look at it is >>> >>> - a device side, with a dma engine: vhost-net >>> - a driver side, only accessing its own memory: virtio-net >>> >>> Given that Ira's config has the dma engine in the ppc boards, that's >>> where vhost-net would live (the ppc boards acting as NICs to the x86 >>> board, essentially). 
>>> >> That sounds convenient given his hardware, but it has its own set of >> problems. For one, the configuration/inventory of these boards is now >> driven by the wrong side and has to be addressed. > > Why is it the wrong side? "Wrong" is probably too harsh a word when looking at ethernet. Its certainly "odd", and possibly inconvenient. It would be like having vhost in a KVM guest, and virtio-net running on the host. You could do it, but its weird and awkward. Where it really falls apart and enters the "wrong" category is for non-symmetric devices, like disk-io. > >> Second, the role >> reversal will likely not work for many models other than ethernet (e.g. >> virtio-console or virtio-blk drivers running on the x86 board would be >> naturally consuming services from the slave boards...virtio-net is an >> exception because 802.x is generally symmetrical). >> > > There is no role reversal. So if I have virtio-blk driver running on the x86 and vhost-blk device running on the ppc board, I can use the ppc board as a block-device. What if I really wanted to go the other way? > The side doing dma is the device, the side > accessing its own memory is the driver. Just like that other 1e12 > driver/device pairs out there. IIUC, his ppc boards really can be seen as "guests" (they are linux instances that are utilizing services from the x86, not the other way around). vhost forces the model to have the ppc boards act as IO-hosts, whereas vbus would likely work in either direction due to its more refined abstraction layer. > >>> I have no idea, that's for Ira to solve. >>> >> Bingo. Thus my statement that the vhost proposal is incomplete. You >> have the virtio-net and vhost-net pieces covering the fast-path >> end-points, but nothing in the middle (transport, aggregation, >> config-space), and nothing on the management-side. vbus provides most >> of the other pieces, and can even support the same virtio-net protocol >> on top. 
The remaining part would be something like a udev script to >> populate the vbus with devices on board-insert events. >> > > Of course vhost is incomplete, in the same sense that Linux is > incomplete. Both require userspace. A vhost based solution to Iras design is missing more than userspace. Many of those gaps are addressed by a vbus based solution. > >>> If he could fake the PCI >>> config space as seen by the x86 board, he would just show the normal pci >>> config and use virtio-pci (multiple channels would show up as a >>> multifunction device). Given he can't, he needs to tunnel the virtio >>> config space some other way. >>> >> Right, and note that vbus was designed to solve this. This tunneling >> can, of course, be done without vbus using some other design. However, >> whatever solution is created will look incredibly close to what I've >> already done, so my point is "why reinvent it"? >> > > virtio requires binding for this tunnelling, so does vbus. We aren't talking about virtio. Virtio would work with either vbus or vhost. This is purely a question of what the layers below virtio and the device backend looks like. > Its the same problem with the same solution. I disagree. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-16 14:10 ` Gregory Haskins @ 2009-09-16 15:59 ` Avi Kivity 2009-09-16 19:22 ` Gregory Haskins 2009-09-17 3:57 ` Michael S. Tsirkin 1 sibling, 1 reply; 83+ messages in thread From: Avi Kivity @ 2009-09-16 15:59 UTC (permalink / raw) To: Gregory Haskins Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/16/2009 05:10 PM, Gregory Haskins wrote: > >> If kvm can do it, others can. >> > The problem is that you seem to either hand-wave over details like this, > or you give details that are pretty much exactly what vbus does already. > My point is that I've already sat down and thought about these issues > and solved them in a freely available GPL'ed software package. > In the kernel. IMO that's the wrong place for it. Further, if we adopt vbus, if drop compatibility with existing guests or have to support both vbus and virtio-pci. > So the question is: is your position that vbus is all wrong and you wish > to create a new bus-like thing to solve the problem? I don't intend to create anything new, I am satisfied with virtio. If it works for Ira, excellent. If not, too bad. I believe it will work without too much trouble. > If so, how is it > different from what Ive already done? More importantly, what specific > objections do you have to what Ive done, as perhaps they can be fixed > instead of starting over? > The two biggest objections are: - the host side is in the kernel - the guest side is a new bus instead of reusing pci (on x86/kvm), making Windows support more difficult I guess these two are exactly what you think are vbus' greatest advantages, so we'll probably have to extend our agree-to-disagree on this one. I also had issues with using just one interrupt vector to service all events, but that's easily fixed. >> There is no guest and host in this scenario. 
There's a device side >> (ppc) and a driver side (x86). The driver side can access configuration >> information on the device side. How to multiplex multiple devices is an >> interesting exercise for whoever writes the virtio binding for that setup. >> > Bingo. So now its a question of do you want to write this layer from > scratch, or re-use my framework. > You will have to implement a connector or whatever for vbus as well. vbus has more layers so it's probably smaller for vbus. >>>> >>>> >>> I am talking about how we would tunnel the config space for N devices >>> across his transport. >>> >>> >> Sounds trivial. >> > No one said it was rocket science. But it does need to be designed and > implemented end-to-end, much of which Ive already done in what I hope is > an extensible way. > It was already implemented three times for virtio, so apparently that's extensible too. >> Write an address containing the device number and >> register number to on location, read or write data from another. >> > You mean like the "u64 devh", and "u32 func" fields I have here for the > vbus-kvm connector? > > http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=include/linux/vbus_pci.h;h=fe337590e644017392e4c9d9236150adb2333729;hb=ded8ce2005a85c174ba93ee26f8d67049ef11025#l64 > > Probably. >>> That sounds convenient given his hardware, but it has its own set of >>> problems. For one, the configuration/inventory of these boards is now >>> driven by the wrong side and has to be addressed. >>> >> Why is it the wrong side? >> > "Wrong" is probably too harsh a word when looking at ethernet. Its > certainly "odd", and possibly inconvenient. It would be like having > vhost in a KVM guest, and virtio-net running on the host. You could do > it, but its weird and awkward. Where it really falls apart and enters > the "wrong" category is for non-symmetric devices, like disk-io. > > It's not odd or wrong or wierd or awkward. 
An ethernet NIC is not symmetric, one side does DMA and issues interrupts, the other uses its own memory. That's exactly the case with Ira's setup. If the ppc boards were to emulate a disk controller, you'd run virtio-blk on x86 and vhost-blk on the ppc boards. >>> Second, the role >>> reversal will likely not work for many models other than ethernet (e.g. >>> virtio-console or virtio-blk drivers running on the x86 board would be >>> naturally consuming services from the slave boards...virtio-net is an >>> exception because 802.x is generally symmetrical). >>> >>> >> There is no role reversal. >> > So if I have virtio-blk driver running on the x86 and vhost-blk device > running on the ppc board, I can use the ppc board as a block-device. > What if I really wanted to go the other way? > You mean, if the x86 board was able to access the disks and dma into the ppc boards' memory? You'd run vhost-blk on x86 and virtio-net on ppc. As long as you don't use the words "guest" and "host" but keep to "driver" and "device", it all works out. >> The side doing dma is the device, the side >> accessing its own memory is the driver. Just like that other 1e12 >> driver/device pairs out there. >> > IIUC, his ppc boards really can be seen as "guests" (they are linux > instances that are utilizing services from the x86, not the other way > around). They aren't guests. Guests don't dma into their host's memory. > vhost forces the model to have the ppc boards act as IO-hosts, > whereas vbus would likely work in either direction due to its more > refined abstraction layer. > vhost=device=dma, virtio=driver=own-memory. >> Of course vhost is incomplete, in the same sense that Linux is >> incomplete. Both require userspace. >> > A vhost based solution to Iras design is missing more than userspace. > Many of those gaps are addressed by a vbus based solution. > Maybe. Ira can fill the gaps or use vbus.
-- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-16 15:59 ` Avi Kivity @ 2009-09-16 19:22 ` Gregory Haskins 2009-09-16 21:00 ` Avi Kivity 0 siblings, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-16 19:22 UTC (permalink / raw) To: Avi Kivity Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 12570 bytes --] Avi Kivity wrote: > On 09/16/2009 05:10 PM, Gregory Haskins wrote: >> >>> If kvm can do it, others can. >>> >> The problem is that you seem to either hand-wave over details like this, >> or you give details that are pretty much exactly what vbus does already. >> My point is that I've already sat down and thought about these issues >> and solved them in a freely available GPL'ed software package. >> > > In the kernel. IMO that's the wrong place for it. In conversations with Ira, he indicated he needs kernel-to-kernel ethernet for performance, and needs at least an ethernet and console connectivity. You could conceivably build a solution for this system 3 basic ways: 1) "completely" in userspace: use things like tuntap on the ppc boards, and tunnel packets across a custom point-to-point connection formed over the pci link to a userspace app on the x86 board. This app then reinjects the packets into the x86 kernel as a raw socket or tuntap, etc. Pretty much vanilla tuntap/vpn kind of stuff. Advantage: very little kernel code. Problem: performance (citation: hopefully obvious). 2) "partially" in userspace: have an in-kernel virtio-net driver talk to a userspace based virtio-net backend. This is the (current, non-vhost oriented) KVM/qemu model. Advantage, re-uses existing kernel-code. Problem: performance (citation: see alacrityvm numbers). 3) "in-kernel": You can do something like virtio-net to vhost to potentially meet some of the requirements, but not all. 
In order to fully meet (3), you would need to do some of that stuff you mentioned in the last reply with muxing device-nr/reg-nr. In addition, we need to have a facility for mapping eventfds and establishing a signaling mechanism (like PIO+qid), etc. KVM does this with IRQFD/IOEVENTFD, but we dont have KVM in this case so it needs to be invented. To meet performance, this stuff has to be in kernel and there has to be a way to manage it. Since vbus was designed to do exactly that, this is what I would advocate. You could also reinvent these concepts and put your own mux and mapping code in place, in addition to all the other stuff that vbus does. But I am not clear why anyone would want to. So no, the kernel is not the wrong place for it. Its the _only_ place for it. Otherwise, just use (1) and be done with it. > Further, if we adopt > vbus, if drop compatibility with existing guests or have to support both > vbus and virtio-pci. We already need to support both (at least to support Ira). virtio-pci doesn't work here. Something else (vbus, or vbus-like) is needed. > >> So the question is: is your position that vbus is all wrong and you wish >> to create a new bus-like thing to solve the problem? > > I don't intend to create anything new, I am satisfied with virtio. If > it works for Ira, excellent. If not, too bad. I think that about sums it up, then. > I believe it will work without too much trouble. Afaict it wont for the reasons I mentioned. > >> If so, how is it >> different from what Ive already done? More importantly, what specific >> objections do you have to what Ive done, as perhaps they can be fixed >> instead of starting over? >> > > The two biggest objections are: > - the host side is in the kernel As it needs to be. > - the guest side is a new bus instead of reusing pci (on x86/kvm), > making Windows support more difficult Thats a function of the vbus-connector, which is different from vbus-core. 
If you don't like it (and I know you don't), we can write one that interfaces to qemu's pci system. I just don't like the limitations that imposes, nor do I think we need that complexity of dealing with a split PCI model, so I chose to not implement vbus-kvm this way. With all due respect, based on all of your comments in aggregate I really do not think you are truly grasping what I am actually building here. > > I guess these two are exactly what you think are vbus' greatest > advantages, so we'll probably have to extend our agree-to-disagree on > this one. > > I also had issues with using just one interrupt vector to service all > events, but that's easily fixed. Again, function of the connector. > >>> There is no guest and host in this scenario. There's a device side >>> (ppc) and a driver side (x86). The driver side can access configuration >>> information on the device side. How to multiplex multiple devices is an >>> interesting exercise for whoever writes the virtio binding for that >>> setup. >>> >> Bingo. So now its a question of do you want to write this layer from >> scratch, or re-use my framework. >> > > You will have to implement a connector or whatever for vbus as well. > vbus has more layers so it's probably smaller for vbus. Bingo! That is precisely the point. All the stuff for how to map eventfds, handle signal mitigation, demux device/function pointers, isolation, etc, are built in. All the connector has to do is transport the 4-6 verbs and provide a memory mapping/copy function, and the rest is reusable. The device models would then work in all environments unmodified, and likewise the connectors could use all device-models unmodified. > >>>>> >>>>> >>>> I am talking about how we would tunnel the config space for N devices >>>> across his transport. >>>> >>>> >>> Sounds trivial. >>> >> No one said it was rocket science. 
But it does need to be designed and >> implemented end-to-end, much of which Ive already done in what I hope is >> an extensible way. >> > > It was already implemented three times for virtio, so apparently that's > extensible too. And to my point, I'm trying to commoditize as much of that process as possible on both the front and backends (at least for cases where performance matters) so that you don't need to reinvent the wheel for each one. > >>> Write an address containing the device number and >>> register number to on location, read or write data from another. >>> >> You mean like the "u64 devh", and "u32 func" fields I have here for the >> vbus-kvm connector? >> >> http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=include/linux/vbus_pci.h;h=fe337590e644017392e4c9d9236150adb2333729;hb=ded8ce2005a85c174ba93ee26f8d67049ef11025#l64 >> >> >> > > Probably. > > > >>>> That sounds convenient given his hardware, but it has its own set of >>>> problems. For one, the configuration/inventory of these boards is now >>>> driven by the wrong side and has to be addressed. >>>> >>> Why is it the wrong side? >>> >> "Wrong" is probably too harsh a word when looking at ethernet. Its >> certainly "odd", and possibly inconvenient. It would be like having >> vhost in a KVM guest, and virtio-net running on the host. You could do >> it, but its weird and awkward. Where it really falls apart and enters >> the "wrong" category is for non-symmetric devices, like disk-io. >> >> > > > It's not odd or wrong or wierd or awkward. Its weird IMO because IIUC the ppc boards are not really "NICs". Yes, their arrangement as bus-master PCI devices makes them look and smell like "devices", but that is an implementation detail of its transport (like hypercalls/PIO in KVM) and not relevant to its broader role in the system. They are more or less like "guests" from the KVM world. The x86 is providing connectivity resources to these guests, not the other way around. 
It is not a goal to make the x86 look like it has a multihomed array of ppc based NIC adapters. The only reason we would treat these ppc boards like NICs is because (iiuc) that is the only way vhost can be hacked to work with the system, not because its the optimal design. FWIW: There are a ton of chassis-based systems that look similar to Ira's out there (PCI inter-connected nodes), and I would like to support them, too. So its not like this is a one-off. > An ethernet NIC is not > symmetric, one side does DMA and issues interrupts, the other uses its > own memory. I never said a NIC was. I meant the ethernet _protocol_ is symmetric. I meant it in the sense that you can ingress/egress packets in either direction and as long as "TX" on one side is "RX" on the other and vice versa, it all kind of works. You can even loop it back and it still works. Contrast this to something like a disk-block protocol where a "read" message is expected to actually do a read, etc. In this case, you cannot arbitrarily assign the location of the "driver" and "device" like you can with ethernet. The device should presumably be where the storage is, and the driver should be where the consumer is. > That's exactly the case with Ira's setup. See "implementation detail" comment above. > > If the ppc boards were to emulate a disk controller, you'd run > virtio-blk on x86 and vhost-blk on the ppc boards. Agreed. > >>>> Second, the role >>>> reversal will likely not work for many models other than ethernet (e.g. >>>> virtio-console or virtio-blk drivers running on the x86 board would be >>>> naturally consuming services from the slave boards...virtio-net is an >>>> exception because 802.x is generally symmetrical). >>>> >>>> >>> There is no role reversal. >>> >> So if I have virtio-blk driver running on the x86 and vhost-blk device >> running on the ppc board, I can use the ppc board as a block-device. >> What if I really wanted to go the other way? 
>> > > You mean, if the x86 board was able to access the disks and dma into the > ppb boards memory? You'd run vhost-blk on x86 and virtio-net on ppc. But as we discussed, vhost doesn't work well if you try to run it on the x86 side due to its assumptions about pagable "guest" memory, right? So is that even an option? And even still, you would still need to solve the aggregation problem so that multiple devices can coexist. > > As long as you don't use the words "guest" and "host" but keep to > "driver" and "device", it all works out. > >>> The side doing dma is the device, the side >>> accessing its own memory is the driver. Just like that other 1e12 >>> driver/device pairs out there. >>> >> IIUC, his ppc boards really can be seen as "guests" (they are linux >> instances that are utilizing services from the x86, not the other way >> around). > > They aren't guests. Guests don't dma into their host's memory. Thats not relevant. They are not guests in the sense of isolated virtualized guests like KVM. They are guests in the sense that they are subordinate linux instances which utilize IO resources on the x86 (host). The way this would work is that the x86 would be driving the dma controller on the ppc board, not the other way around. The fact that the controller lives on the ppc board is an implementation detail. The way I envision this to work would be that the ppc board exports two functions in its device: 1) a vbus-bridge like device 2) a dma-controller that accepts "gpas" as one parameter so function (1) does the 4-6 verbs I mentioned for device addressing, etc. function (2) is utilized by the x86 memctx whenever a ->copy_from() or ->copy_to() operation is invoked. The ppc board's would be doing their normal virtio kind of things, like ->add_buf(_pa(skb->data))). > >> vhost forces the model to have the ppc boards act as IO-hosts, >> whereas vbus would likely work in either direction due to its more >> refined abstraction layer. 
>> > > vhost=device=dma, virtio=driver=own-memory. I agree that virtio=driver=own-memory. The problem is vhost != dma. vhost = hva*, and it just so happens that Ira's ppc boards support host mapping/dma so it kind of works. What I have been trying to say is that the extra abstraction to the memctx gets the "vhost" side away from hva*, such that it can support hva if that makes sense, or something else (like a custom dma engine if it doesn't) > >>> Of course vhost is incomplete, in the same sense that Linux is >>> incomplete. Both require userspace. >>> >> A vhost based solution to Iras design is missing more than userspace. >> Many of those gaps are addressed by a vbus based solution. >> > > Maybe. Ira can fill the gaps or use vbus. > > Agreed. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-16 19:22 ` Gregory Haskins @ 2009-09-16 21:00 ` Avi Kivity 2009-09-17 3:11 ` Gregory Haskins 0 siblings, 1 reply; 83+ messages in thread From: Avi Kivity @ 2009-09-16 21:00 UTC (permalink / raw) To: Gregory Haskins Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/16/2009 10:22 PM, Gregory Haskins wrote: > Avi Kivity wrote: > >> On 09/16/2009 05:10 PM, Gregory Haskins wrote: >> >>>> If kvm can do it, others can. >>>> >>>> >>> The problem is that you seem to either hand-wave over details like this, >>> or you give details that are pretty much exactly what vbus does already. >>> My point is that I've already sat down and thought about these issues >>> and solved them in a freely available GPL'ed software package. >>> >>> >> In the kernel. IMO that's the wrong place for it. >> > 3) "in-kernel": You can do something like virtio-net to vhost to > potentially meet some of the requirements, but not all. > > In order to fully meet (3), you would need to do some of that stuff you > mentioned in the last reply with muxing device-nr/reg-nr. In addition, > we need to have a facility for mapping eventfds and establishing a > signaling mechanism (like PIO+qid), etc. KVM does this with > IRQFD/IOEVENTFD, but we dont have KVM in this case so it needs to be > invented. > irqfd/eventfd is the abstraction layer, it doesn't need to be reabstracted. > To meet performance, this stuff has to be in kernel and there has to be > a way to manage it. and management belongs in userspace. > Since vbus was designed to do exactly that, this is > what I would advocate. You could also reinvent these concepts and put > your own mux and mapping code in place, in addition to all the other > stuff that vbus does. But I am not clear why anyone would want to. > Maybe they like their backward compatibility and Windows support. 
> So no, the kernel is not the wrong place for it. Its the _only_ place > for it. Otherwise, just use (1) and be done with it. > > I'm talking about the config stuff, not the data path. >> Further, if we adopt >> vbus, if drop compatibility with existing guests or have to support both >> vbus and virtio-pci. >> > We already need to support both (at least to support Ira). virtio-pci > doesn't work here. Something else (vbus, or vbus-like) is needed. > virtio-ira. >>> So the question is: is your position that vbus is all wrong and you wish >>> to create a new bus-like thing to solve the problem? >>> >> I don't intend to create anything new, I am satisfied with virtio. If >> it works for Ira, excellent. If not, too bad. >> > I think that about sums it up, then. > Yes. I'm all for reusing virtio, but I'm not going switch to vbus or support both for this esoteric use case. >>> If so, how is it >>> different from what Ive already done? More importantly, what specific >>> objections do you have to what Ive done, as perhaps they can be fixed >>> instead of starting over? >>> >>> >> The two biggest objections are: >> - the host side is in the kernel >> > As it needs to be. > vhost-net somehow manages to work without the config stuff in the kernel. > With all due respect, based on all of your comments in aggregate I > really do not think you are truly grasping what I am actually building here. > Thanks. >>> Bingo. So now its a question of do you want to write this layer from >>> scratch, or re-use my framework. >>> >>> >> You will have to implement a connector or whatever for vbus as well. >> vbus has more layers so it's probably smaller for vbus. >> > Bingo! (addictive, isn't it) > That is precisely the point. > > All the stuff for how to map eventfds, handle signal mitigation, demux > device/function pointers, isolation, etc, are built in. All the > connector has to do is transport the 4-6 verbs and provide a memory > mapping/copy function, and the rest is reusable. 
The device models > would then work in all environments unmodified, and likewise the > connectors could use all device-models unmodified. > Well, virtio has a similar abstraction on the guest side. The host side abstraction is limited to signalling since all configuration is in userspace. vhost-net ought to work for lguest and s390 without change. >> It was already implemented three times for virtio, so apparently that's >> extensible too. >> > And to my point, I'm trying to commoditize as much of that process as > possible on both the front and backends (at least for cases where > performance matters) so that you don't need to reinvent the wheel for > each one. > Since you're interested in any-to-any connectors it makes sense to you. I'm only interested in kvm-host-to-kvm-guest, so reducing the already minor effort to implement a new virtio binding has little appeal to me. >> You mean, if the x86 board was able to access the disks and dma into the >> ppb boards memory? You'd run vhost-blk on x86 and virtio-net on ppc. >> > But as we discussed, vhost doesn't work well if you try to run it on the > x86 side due to its assumptions about pagable "guest" memory, right? So > is that even an option? And even still, you would still need to solve > the aggregation problem so that multiple devices can coexist. > I don't know. Maybe it can be made to work and maybe it cannot. It probably can with some determined hacking. -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-16 21:00 ` Avi Kivity @ 2009-09-17 3:11 ` Gregory Haskins 2009-09-17 7:49 ` Avi Kivity ` (2 more replies) 0 siblings, 3 replies; 83+ messages in thread From: Gregory Haskins @ 2009-09-17 3:11 UTC (permalink / raw) To: Avi Kivity Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 12081 bytes --] Avi Kivity wrote: > On 09/16/2009 10:22 PM, Gregory Haskins wrote: >> Avi Kivity wrote: >> >>> On 09/16/2009 05:10 PM, Gregory Haskins wrote: >>> >>>>> If kvm can do it, others can. >>>>> >>>>> >>>> The problem is that you seem to either hand-wave over details like >>>> this, >>>> or you give details that are pretty much exactly what vbus does >>>> already. >>>> My point is that I've already sat down and thought about these >>>> issues >>>> and solved them in a freely available GPL'ed software package. >>>> >>>> >>> In the kernel. IMO that's the wrong place for it. >>> >> 3) "in-kernel": You can do something like virtio-net to vhost to >> potentially meet some of the requirements, but not all. >> >> In order to fully meet (3), you would need to do some of that stuff you >> mentioned in the last reply with muxing device-nr/reg-nr. In addition, >> we need to have a facility for mapping eventfds and establishing a >> signaling mechanism (like PIO+qid), etc. KVM does this with >> IRQFD/IOEVENTFD, but we dont have KVM in this case so it needs to be >> invented. >> > > irqfd/eventfd is the abstraction layer, it doesn't need to be reabstracted. Not per se, but it needs to be interfaced. How do I register that eventfd with the fastpath in Ira's rig? How do I signal the eventfd (x86->ppc, and ppc->x86)? To take it to the next level, how do I organize that mechanism so that it works for more than one IO-stream (e.g. 
address the various queues within ethernet or a different device like the console)? KVM has IOEVENTFD and IRQFD managed with MSI and PIO. This new rig does not have the luxury of an established IO paradigm. Is vbus the only way to implement a solution? No. But it is _a_ way, and its one that was specifically designed to solve this very problem (as well as others). (As an aside, note that you generally will want an abstraction on top of irqfd/eventfd like shm-signal or virtqueues to do shared-memory based event mitigation, but I digress. That is a separate topic). > >> To meet performance, this stuff has to be in kernel and there has to be >> a way to manage it. > > and management belongs in userspace. vbus does not dictate where the management must be. Its an extensible framework, governed by what you plug into it (ala connectors and devices). For instance, the vbus-kvm connector in alacrityvm chooses to put DEVADD and DEVDROP hotswap events into the interrupt stream, because they are simple and we already needed the interrupt stream anyway for fast-path. As another example: venet chose to put ->call(MACQUERY) "config-space" into its call namespace because its simple, and we already need ->calls() for fastpath. It therefore exports an attribute to sysfs that allows the management app to set it. I could likewise have designed the connector or device-model differently as to keep the mac-address and hotswap-events somewhere else (QEMU/PCI userspace) but this seems silly to me when they are so trivial, so I didn't. > >> Since vbus was designed to do exactly that, this is >> what I would advocate. You could also reinvent these concepts and put >> your own mux and mapping code in place, in addition to all the other >> stuff that vbus does. But I am not clear why anyone would want to. >> > > Maybe they like their backward compatibility and Windows support. This is really not relevant to this thread, since we are talking about Ira's hardware. 
But if you must bring this up, then I will reiterate that you just design the connector to interface with QEMU+PCI and you have that too if that was important to you. But on that topic: Since you could consider KVM a "motherboard manufacturer" of sorts (it just happens to be virtual hardware), I don't know why KVM seems to consider itself the only motherboard manufacturer in the world that has to make everything look legacy. If a company like ASUS wants to add some cutting edge IO controller/bus, they simply do it. Pretty much every product release may contain a different array of devices, many of which are not backwards compatible with any prior silicon. The guy/gal installing Windows on that system may see a "?" in device-manager until they load a driver that supports the new chip, and subsequently it works. It is certainly not a requirement to make said chip somehow work with existing drivers/facilities on bare metal, per se. Why should virtual systems be different? So, yeah, the current design of the vbus-kvm connector means I have to provide a driver. This is understood, and I have no problem with that. The only thing that I would agree has to be backwards compatible is the BIOS/boot function. If you can't support running an image like the Windows installer, you are hosed. If you can't use your ethernet until you get a chance to install a driver after the install completes, its just like most other systems in existence. IOW: It's not a big deal. For cases where the IO system is needed as part of the boot/install, you provide BIOS and/or an install-disk support for it. > >> So no, the kernel is not the wrong place for it. Its the _only_ place >> for it. Otherwise, just use (1) and be done with it. >> >> > > I'm talking about the config stuff, not the data path. As stated above, where config stuff lives is a function of what you interface to vbus. Data-path stuff must be in the kernel for performance reasons, and this is what I was referring to. 
I think we are generally both in agreement, here. What I was getting at is that you can't just hand-wave the datapath stuff. We do fast path in KVM with IRQFD/IOEVENTFD+PIO, and we do device discovery/addressing with PCI. Neither of those are available here in Ira's case yet the general concepts are needed. Therefore, we have to come up with something else. > >>> Further, if we adopt >>> vbus, if drop compatibility with existing guests or have to support both >>> vbus and virtio-pci. >>> >> We already need to support both (at least to support Ira). virtio-pci >> doesn't work here. Something else (vbus, or vbus-like) is needed. >> > > virtio-ira. Sure, virtio-ira and he is on his own to make a bus-model under that, or virtio-vbus + vbus-ira-connector to use the vbus framework. Either model can work, I agree. > >>>> So the question is: is your position that vbus is all wrong and you >>>> wish >>>> to create a new bus-like thing to solve the problem? >>>> >>> I don't intend to create anything new, I am satisfied with virtio. If >>> it works for Ira, excellent. If not, too bad. >>> >> I think that about sums it up, then. >> > > Yes. I'm all for reusing virtio, but I'm not going switch to vbus or > support both for this esoteric use case. With all due respect, no one asked you to. This sub-thread was originally about using vhost in Ira's rig. When problems surfaced in that proposed model, I highlighted that I had already addressed that problem in vbus, and here we are. > >>>> If so, how is it >>>> different from what Ive already done? More importantly, what specific >>>> objections do you have to what Ive done, as perhaps they can be fixed >>>> instead of starting over? >>>> >>>> >>> The two biggest objections are: >>> - the host side is in the kernel >>> >> As it needs to be. >> > > vhost-net somehow manages to work without the config stuff in the kernel. I was referring to data-path stuff, like signal and memory configuration/routing. 
As an aside, it should be noted that vhost under KVM has IRQFD/IOEVENTFD, PCI-emulation, QEMU, etc to complement it and fill in some of the pieces one needs for a complete solution. Not all environments have all of those pieces (nor should they), and those pieces need to come from somewhere. It should also be noted that what remains (config/management) after the data-path stuff is laid out is actually quite simple. It consists of pretty much an enumerated list of device-ids within a container, DEVADD(id), DEVDROP(id) events, and some sysfs attributes as defined on a per-device basis (many of which are often needed regardless of whether the "config-space" operation is handled in-kernel or not) Therefore, the configuration aspect of the system does not necessitate a complicated (e.g. full PCI emulation) or external (e.g. userspace) component per se. The parts of vbus that could be construed as "management" are (afaict) built using accepted/best-practices for managing arbitrary kernel subsystems (sysfs, configfs, ioctls, etc) so there is nothing new or reasonably controversial there. It is for this reason that I think the objection to "in-kernel config" is unfounded. Disagreements on this point may be settled by the connector design, while still utilizing vbus, and thus retaining most of the other benefits of using the vbus framework. The connector ultimately dictates how and what is exposed to the "guest". > >> With all due respect, based on all of your comments in aggregate I >> really do not think you are truly grasping what I am actually building >> here. >> > > Thanks. > > > >>>> Bingo. So now its a question of do you want to write this layer from >>>> scratch, or re-use my framework. >>>> >>>> >>> You will have to implement a connector or whatever for vbus as well. >>> vbus has more layers so it's probably smaller for vbus. >>> >> Bingo! > > (addictive, isn't it) Apparently. > >> That is precisely the point. 
>> >> All the stuff for how to map eventfds, handle signal mitigation, demux >> device/function pointers, isolation, etc, are built in. All the >> connector has to do is transport the 4-6 verbs and provide a memory >> mapping/copy function, and the rest is reusable. The device models >> would then work in all environments unmodified, and likewise the >> connectors could use all device-models unmodified. >> > > Well, virtio has a similar abstraction on the guest side. The host side > abstraction is limited to signalling since all configuration is in > userspace. vhost-net ought to work for lguest and s390 without change. But IIUC that is primarily because the revectoring work is already in QEMU for virtio-u and it rides on that, right? Not knocking that, thats nice and a distinct advantage. It should just be noted that its based on sunk-cost, and not truly free. Its just already paid for, which is different. It also means it only works in environments based on QEMU, which not all are (as evident by this sub-thread). > >>> It was already implemented three times for virtio, so apparently that's >>> extensible too. >>> >> And to my point, I'm trying to commoditize as much of that process as >> possible on both the front and backends (at least for cases where >> performance matters) so that you don't need to reinvent the wheel for >> each one. >> > > Since you're interested in any-to-any connectors it makes sense to you. > I'm only interested in kvm-host-to-kvm-guest, so reducing the already > minor effort to implement a new virtio binding has little appeal to me. > Fair enough. >>> You mean, if the x86 board was able to access the disks and dma into the >>> ppb boards memory? You'd run vhost-blk on x86 and virtio-net on ppc. >>> >> But as we discussed, vhost doesn't work well if you try to run it on the >> x86 side due to its assumptions about pagable "guest" memory, right? So >> is that even an option? 
And even still, you would still need to solve >> the aggregation problem so that multiple devices can coexist. >> > > I don't know. Maybe it can be made to work and maybe it cannot. It > probably can with some determined hacking. > I guess you can say the same for any of the solutions. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-17 3:11 ` Gregory Haskins @ 2009-09-17 7:49 ` Avi Kivity 2009-09-17 14:16 ` Javier Guerra 2009-09-21 21:43 ` Ira W. Snyder 2 siblings, 0 replies; 83+ messages in thread From: Avi Kivity @ 2009-09-17 7:49 UTC (permalink / raw) To: Gregory Haskins Cc: Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/17/2009 06:11 AM, Gregory Haskins wrote: > >> irqfd/eventfd is the abstraction layer, it doesn't need to be reabstracted. >> > Not per se, but it needs to be interfaced. How do I register that > eventfd with the fastpath in Ira's rig? How do I signal the eventfd > (x86->ppc, and ppc->x86)? > You write a userspace or kernel module to do it. It's a few dozen lines of code. > To take it to the next level, how do I organize that mechanism so that > it works for more than one IO-stream (e.g. address the various queues > within ethernet or a different device like the console)? KVM has > IOEVENTFD and IRQFD managed with MSI and PIO. This new rig does not > have the luxury of an established IO paradigm. > > Is vbus the only way to implement a solution? No. But it is _a_ way, > and its one that was specifically designed to solve this very problem > (as well as others). > virtio assumes that the number of transports will be limited and interesting growth is in the number of device classes and drivers. So we have support for just three transports, but 6 device classes (9p, rng, balloon, console, blk, net) and 8 drivers (the preceding 6 for linux, plus blk/net for Windows). It would have nice to be able to write a new binding in Visual Basic but it's hardly a killer feature. >>> Since vbus was designed to do exactly that, this is >>> what I would advocate. You could also reinvent these concepts and put >>> your own mux and mapping code in place, in addition to all the other >>> stuff that vbus does. 
But I am not clear why anyone would want to. >>> >>> >> Maybe they like their backward compatibility and Windows support. >> > This is really not relevant to this thread, since we are talking about > Ira's hardware. But if you must bring this up, then I will reiterate > that you just design the connector to interface with QEMU+PCI and you > have that too if that was important to you. > Well, for Ira the major issue is probably inclusion in the upstream kernel. > But on that topic: Since you could consider KVM a "motherboard > manufacturer" of sorts (it just happens to be virtual hardware), I don't > know why KVM seems to consider itself the only motherboard manufacturer > in the world that has to make everything look legacy. If a company like > ASUS wants to add some cutting edge IO controller/bus, they simply do > it. No, they don't. New buses are added through industry consortiums these days. No one adds a bus that is only available with their machine, not even Apple. > Pretty much every product release may contain a different array of > devices, many of which are not backwards compatible with any prior > silicon. The guy/gal installing Windows on that system may see a "?" in > device-manager until they load a driver that supports the new chip, and > subsequently it works. It is certainly not a requirement to make said > chip somehow work with existing drivers/facilities on bare metal, per > se. Why should virtual systems be different? > Devices/drivers are a different matter, and if you have a virtio-net device you'll get the same "?" until you load the driver. That's how people and the OS vendors expect things to work. > What I was getting at is that you can't just hand-wave the datapath > stuff. We do fast path in KVM with IRQFD/IOEVENTFD+PIO, and we do > device discovery/addressing with PCI. That's not datapath stuff. > Neither of those are available > here in Ira's case yet the general concepts are needed. Therefore, we > have to come up with something else. 
> Ira has to implement virtio's ->kick() function and come up with something for discovery. It's a lot less lines of code than there are messages in this thread. >> Yes. I'm all for reusing virtio, but I'm not going switch to vbus or >> support both for this esoteric use case. >> > With all due respect, no one asked you to. This sub-thread was > originally about using vhost in Ira's rig. When problems surfaced in > that proposed model, I highlighted that I had already addressed that > problem in vbus, and here we are. > Ah, okay. I have no interest in Ira choosing either virtio or vbus. >> vhost-net somehow manages to work without the config stuff in the kernel. >> > I was referring to data-path stuff, like signal and memory > configuration/routing. > signal and memory configuration/routing are not data-path stuff. >> Well, virtio has a similar abstraction on the guest side. The host side >> abstraction is limited to signalling since all configuration is in >> userspace. vhost-net ought to work for lguest and s390 without change. >> > But IIUC that is primarily because the revectoring work is already in > QEMU for virtio-u and it rides on that, right? Not knocking that, thats > nice and a distinct advantage. It should just be noted that its based > on sunk-cost, and not truly free. Its just already paid for, which is > different. It also means it only works in environments based on QEMU, > which not all are (as evident by this sub-thread). > No. We expose a mix of emulated-in-userspace and emulated-in-the-kernel devices on one bus. Devices emulated in userspace only lose by having the bus emulated in the kernel. Devices in the kernel gain nothing from having the bus emulated in the kernel. It's a complete slow path so it belongs in userspace where state is easy to get at, development is faster, and bugs are cheaper to fix. 
-- error compiling committee.c: too many arguments to function -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-17 3:11 ` Gregory Haskins 2009-09-17 7:49 ` Avi Kivity @ 2009-09-17 14:16 ` Javier Guerra 2009-09-21 21:43 ` Ira W. Snyder 2 siblings, 0 replies; 83+ messages in thread From: Javier Guerra @ 2009-09-17 14:16 UTC (permalink / raw) To: Gregory Haskins Cc: Avi Kivity, Michael S. Tsirkin, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Wed, Sep 16, 2009 at 10:11 PM, Gregory Haskins <gregory.haskins@gmail.com> wrote: > It is certainly not a requirement to make said > chip somehow work with existing drivers/facilities on bare metal, per > se. Why should virtual systems be different? i'd guess it's an issue of support resources. a hardware developer creates a chip and immediately sells it, getting small but assured revenue, with it they write (or pay to write) drivers for a couple of releases, and stop manufacturing it as soon as it's not profitable. software has a much longer lifetime, especially at the platform-level (and KVM is a platform for a lot of us). also, being GPL, it's cheaper to produce but has (much!) more limited resources. creating a new support issue is a scary thought. -- Javier ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-17 3:11 ` Gregory Haskins 2009-09-17 7:49 ` Avi Kivity 2009-09-17 14:16 ` Javier Guerra @ 2009-09-21 21:43 ` Ira W. Snyder 2009-09-22 9:43 ` Avi Kivity 2 siblings, 1 reply; 83+ messages in thread From: Ira W. Snyder @ 2009-09-21 21:43 UTC (permalink / raw) To: Gregory Haskins Cc: Avi Kivity, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Wed, Sep 16, 2009 at 11:11:57PM -0400, Gregory Haskins wrote: > Avi Kivity wrote: > > On 09/16/2009 10:22 PM, Gregory Haskins wrote: > >> Avi Kivity wrote: > >> > >>> On 09/16/2009 05:10 PM, Gregory Haskins wrote: > >>> > >>>>> If kvm can do it, others can. > >>>>> > >>>>> > >>>> The problem is that you seem to either hand-wave over details like > >>>> this, > >>>> or you give details that are pretty much exactly what vbus does > >>>> already. > >>>> My point is that I've already sat down and thought about these > >>>> issues > >>>> and solved them in a freely available GPL'ed software package. > >>>> > >>>> > >>> In the kernel. IMO that's the wrong place for it. > >>> > >> 3) "in-kernel": You can do something like virtio-net to vhost to > >> potentially meet some of the requirements, but not all. > >> > >> In order to fully meet (3), you would need to do some of that stuff you > >> mentioned in the last reply with muxing device-nr/reg-nr. In addition, > >> we need to have a facility for mapping eventfds and establishing a > >> signaling mechanism (like PIO+qid), etc. KVM does this with > >> IRQFD/IOEVENTFD, but we dont have KVM in this case so it needs to be > >> invented. > >> > > > > irqfd/eventfd is the abstraction layer, it doesn't need to be reabstracted. > > Not per se, but it needs to be interfaced. How do I register that > eventfd with the fastpath in Ira's rig? How do I signal the eventfd > (x86->ppc, and ppc->x86)? 
> Sorry to reply so late to this thread, I've been on vacation for the past week. If you'd like to continue in another thread, please start it and CC me. On the PPC, I've got a hardware "doorbell" register which generates 30 distiguishable interrupts over the PCI bus. I have outbound and inbound registers, which can be used to signal the "other side". I assume it isn't too much code to signal an eventfd in an interrupt handler. I haven't gotten to this point in the code yet. > To take it to the next level, how do I organize that mechanism so that > it works for more than one IO-stream (e.g. address the various queues > within ethernet or a different device like the console)? KVM has > IOEVENTFD and IRQFD managed with MSI and PIO. This new rig does not > have the luxury of an established IO paradigm. > > Is vbus the only way to implement a solution? No. But it is _a_ way, > and its one that was specifically designed to solve this very problem > (as well as others). > > (As an aside, note that you generally will want an abstraction on top of > irqfd/eventfd like shm-signal or virtqueues to do shared-memory based > event mitigation, but I digress. That is a separate topic). > > > > >> To meet performance, this stuff has to be in kernel and there has to be > >> a way to manage it. > > > > and management belongs in userspace. > > vbus does not dictate where the management must be. Its an extensible > framework, governed by what you plug into it (ala connectors and devices). > > For instance, the vbus-kvm connector in alacrityvm chooses to put DEVADD > and DEVDROP hotswap events into the interrupt stream, because they are > simple and we already needed the interrupt stream anyway for fast-path. > > As another example: venet chose to put ->call(MACQUERY) "config-space" > into its call namespace because its simple, and we already need > ->calls() for fastpath. It therefore exports an attribute to sysfs that > allows the management app to set it. 
> > I could likewise have designed the connector or device-model differently > as to keep the mac-address and hotswap-events somewhere else (QEMU/PCI > userspace) but this seems silly to me when they are so trivial, so I didn't. > > > > >> Since vbus was designed to do exactly that, this is > >> what I would advocate. You could also reinvent these concepts and put > >> your own mux and mapping code in place, in addition to all the other > >> stuff that vbus does. But I am not clear why anyone would want to. > >> > > > > Maybe they like their backward compatibility and Windows support. > > This is really not relevant to this thread, since we are talking about > Ira's hardware. But if you must bring this up, then I will reiterate > that you just design the connector to interface with QEMU+PCI and you > have that too if that was important to you. > > But on that topic: Since you could consider KVM a "motherboard > manufacturer" of sorts (it just happens to be virtual hardware), I don't > know why KVM seems to consider itself the only motherboard manufacturer > in the world that has to make everything look legacy. If a company like > ASUS wants to add some cutting edge IO controller/bus, they simply do > it. Pretty much every product release may contain a different array of > devices, many of which are not backwards compatible with any prior > silicon. The guy/gal installing Windows on that system may see a "?" in > device-manager until they load a driver that supports the new chip, and > subsequently it works. It is certainly not a requirement to make said > chip somehow work with existing drivers/facilities on bare metal, per > se. Why should virtual systems be different? > > So, yeah, the current design of the vbus-kvm connector means I have to > provide a driver. This is understood, and I have no problem with that. > > The only thing that I would agree has to be backwards compatible is the > BIOS/boot function. 
If you can't support running an image like the > Windows installer, you are hosed. If you can't use your ethernet until > you get a chance to install a driver after the install completes, its > just like most other systems in existence. IOW: It's not a big deal. > > For cases where the IO system is needed as part of the boot/install, you > provide BIOS and/or an install-disk support for it. > > > > >> So no, the kernel is not the wrong place for it. Its the _only_ place > >> for it. Otherwise, just use (1) and be done with it. > >> > >> > > > > I'm talking about the config stuff, not the data path. > > As stated above, where config stuff lives is a function of what you > interface to vbus. Data-path stuff must be in the kernel for > performance reasons, and this is what I was referring to. I think we > are generally both in agreement, here. > > What I was getting at is that you can't just hand-wave the datapath > stuff. We do fast path in KVM with IRQFD/IOEVENTFD+PIO, and we do > device discovery/addressing with PCI. Neither of those are available > here in Ira's case yet the general concepts are needed. Therefore, we > have to come up with something else. > > > > >>> Further, if we adopt > >>> vbus, if drop compatibility with existing guests or have to support both > >>> vbus and virtio-pci. > >>> > >> We already need to support both (at least to support Ira). virtio-pci > >> doesn't work here. Something else (vbus, or vbus-like) is needed. > >> > > > > virtio-ira. > > Sure, virtio-ira and he is on his own to make a bus-model under that, or > virtio-vbus + vbus-ira-connector to use the vbus framework. Either > model can work, I agree. > Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, and virtio-s390. It isn't especially easy. I can steal lots of code from the lguest bus model, but sometimes it is good to generalize, especially after the fourth implemention or so. I think this is what GHaskins tried to do. 
Here is what I've implemented so far: * a generic virtio-phys-guest layer (my bus model, like lguest) - this runs on the crate server (x86) in my system * a generic virtio-phys-host layer (my /dev/lguest implementation) - this runs on the ppc boards in my system - this assumes that the kernel will allocate some memory and expose it over PCI in a device-specific way, so the guest can see it as a PCI BAR * a virtio-phys-mpc83xx driver - this runs on the crate server (x86) in my system - this interfaces virtio-phys-guest to my mpc83xx board - it is a Linux PCI driver, which detects mpc83xx boards, runs ioremap_pci_bar() on the correct PCI BAR, and then gives that to the virtio-phys-guest layer I think that the idea of device/driver (instead of host/guest) is a good one. It makes my problem easier to think about. I've given it some thought, and I think that running vhost-net (or similar) on the ppc boards, with virtio-net on the x86 crate server will work. The virtio-ring abstraction is almost good enough to work for this situation, but I had to re-invent it to work with my boards. I've exposed a 16K region of memory as PCI BAR1 from my ppc board. Remember that this is the "host" system. I used each 4K block as a "device descriptor" which contains: 1) the type of device, config space, etc. for virtio 2) the "desc" table (virtio memory descriptors, see virtio-ring) 3) the "avail" table (available entries in the desc table) Parts 2 and 3 are repeated three times, to allow for a maximum of three virtqueues per device. This is good enough for all current drivers. The guest side (x86 in my system) allocates some device-accessible memory, and writes the PCI address to the device descriptor. This memory contains: 1) the "used" table (consumed entries in the desc/avail tables) This exists three times as well, once for each virtqueue. The rest is basically a copy of virtio-ring, with a few changes to allow for caching, etc.
It may not even be worth doing this from a performance standpoint, I haven't benchmarked it yet. For now, I'd be happy with a non-DMA memcpy only solution. I can add DMA once things are working. I've got the current code (subject to change at any time) available at the address listed below. If you think another format would be better for you, please ask, and I'll provide it. http://www.mmarray.org/~iws/virtio-phys/ I've gotten plenty of email about this from lots of interested developers. There are people who would like this kind of system to just work, while having to write just some glue for their device, just like a network driver. My hunch is that most people have created some proprietary mess that basically works, and left it at that. So, here is a desperate cry for help. I'd like to make this work, and I'd really like to see it in mainline. I'm trying to give back to the community from which I've taken plenty. Ira -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-21 21:43 ` Ira W. Snyder @ 2009-09-22 9:43 ` Avi Kivity 2009-09-22 15:25 ` Ira W. Snyder 2009-09-23 14:26 ` Gregory Haskins 0 siblings, 2 replies; 83+ messages in thread From: Avi Kivity @ 2009-09-22 9:43 UTC (permalink / raw) To: Ira W. Snyder Cc: Gregory Haskins, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/22/2009 12:43 AM, Ira W. Snyder wrote: > >> Sure, virtio-ira and he is on his own to make a bus-model under that, or >> virtio-vbus + vbus-ira-connector to use the vbus framework. Either >> model can work, I agree. >> >> > Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, and > virtio-s390. It isn't especially easy. I can steal lots of code from the > lguest bus model, but sometimes it is good to generalize, especially > after the fourth implemention or so. I think this is what GHaskins tried > to do. > Yes. vbus is more finely layered so there is less code duplication. The virtio layering was more or less dictated by Xen which doesn't have shared memory (it uses grant references instead). As a matter of fact lguest, kvm/pci, and kvm/s390 all have shared memory, as you do, so that part is duplicated. It's probably possible to add a virtio-shmem.ko library that people who do have shared memory can reuse. > I've given it some thought, and I think that running vhost-net (or > similar) on the ppc boards, with virtio-net on the x86 crate server will > work. The virtio-ring abstraction is almost good enough to work for this > situation, but I had to re-invent it to work with my boards. > > I've exposed a 16K region of memory as PCI BAR1 from my ppc board. > Remember that this is the "host" system. I used each 4K block as a > "device descriptor" which contains: > > 1) the type of device, config space, etc. 
for virtio > 2) the "desc" table (virtio memory descriptors, see virtio-ring) > 3) the "avail" table (available entries in the desc table) > Won't access from x86 to this memory be slow? (On the other hand, if you change it to main memory, access from ppc will be slow... it really depends on how your system is tuned.) > Parts 2 and 3 are repeated three times, to allow for a maximum of three > virtqueues per device. This is good enough for all current drivers. > The plan is to switch to multiqueue soon. Will not affect you if your boards are uniprocessor or small smp. > I've gotten plenty of email about this from lots of interested > developers. There are people who would like this kind of system to just > work, while having to write just some glue for their device, just like a > network driver. I hunch most people have created some proprietary mess > that basically works, and left it at that. > So long as you keep the system-dependent features hookable or configurable, it should work. > So, here is a desperate cry for help. I'd like to make this work, and > I'd really like to see it in mainline. I'm trying to give back to the > community from which I've taken plenty. > Not sure who you're crying for help to. Once you get this working, post patches. If the patches are reasonably clean and don't impact performance for the main use case, and if you can show the need, I expect they'll be merged. -- error compiling committee.c: too many arguments to function -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-22 9:43 ` Avi Kivity @ 2009-09-22 15:25 ` Ira W. Snyder 2009-09-22 15:56 ` Avi Kivity 2009-09-23 14:26 ` Gregory Haskins 1 sibling, 1 reply; 83+ messages in thread From: Ira W. Snyder @ 2009-09-22 15:25 UTC (permalink / raw) To: Avi Kivity Cc: Gregory Haskins, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Tue, Sep 22, 2009 at 12:43:36PM +0300, Avi Kivity wrote: > On 09/22/2009 12:43 AM, Ira W. Snyder wrote: > > > >> Sure, virtio-ira and he is on his own to make a bus-model under that, or > >> virtio-vbus + vbus-ira-connector to use the vbus framework. Either > >> model can work, I agree. > >> > >> > > Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, and > > virtio-s390. It isn't especially easy. I can steal lots of code from the > > lguest bus model, but sometimes it is good to generalize, especially > > after the fourth implemention or so. I think this is what GHaskins tried > > to do. > > > > Yes. vbus is more finely layered so there is less code duplication. > > The virtio layering was more or less dictated by Xen which doesn't have > shared memory (it uses grant references instead). As a matter of fact > lguest, kvm/pci, and kvm/s390 all have shared memory, as you do, so that > part is duplicated. It's probably possible to add a virtio-shmem.ko > library that people who do have shared memory can reuse. > Seems like a nice benefit of vbus. > > I've given it some thought, and I think that running vhost-net (or > > similar) on the ppc boards, with virtio-net on the x86 crate server will > > work. The virtio-ring abstraction is almost good enough to work for this > > situation, but I had to re-invent it to work with my boards. > > > > I've exposed a 16K region of memory as PCI BAR1 from my ppc board. > > Remember that this is the "host" system. 
I used each 4K block as a > > "device descriptor" which contains: > > > > 1) the type of device, config space, etc. for virtio > > 2) the "desc" table (virtio memory descriptors, see virtio-ring) > > 3) the "avail" table (available entries in the desc table) > > > > Won't access from x86 be slow to this memory (on the other hand, if you > change it to main memory access from ppc will be slow... really depends > on how your system is tuned. > Writes across the bus are fast, reads across the bus are slow. These are just the descriptor tables for memory buffers, not the physical memory buffers themselves. These only need to be written by the guest (x86), and read by the host (ppc). The host never changes the tables, so we can cache a copy in the guest, for a fast detach_buf() implementation (see virtio-ring, which I'm copying the design from). The only accesses are writes across the PCI bus. There is never a need to do a read (except for slow-path configuration). > > Parts 2 and 3 are repeated three times, to allow for a maximum of three > > virtqueues per device. This is good enough for all current drivers. > > > > The plan is to switch to multiqueue soon. Will not affect you if your > boards are uniprocessor or small smp. > Everything I have is UP. I don't need extreme performance, either. 40MB/sec is the minimum I need to reach, though I'd like to have some headroom. For reference, using the CPU to handle data transfers, I get ~2MB/sec transfers. Using the DMA engine, I've hit about 60MB/sec with my "crossed-wires" virtio-net. > > I've gotten plenty of email about this from lots of interested > > developers. There are people who would like this kind of system to just > > work, while having to write just some glue for their device, just like a > > network driver. I hunch most people have created some proprietary mess > > that basically works, and left it at that. > > > > So long as you keep the system-dependent features hookable or > configurable, it should work. 
> > > So, here is a desperate cry for help. I'd like to make this work, and > > I'd really like to see it in mainline. I'm trying to give back to the > > community from which I've taken plenty. > > > > Not sure who you're crying for help to. Once you get this working, post > patches. If the patches are reasonably clean and don't impact > performance for the main use case, and if you can show the need, I > expect they'll be merged. > In the spirit of "post early and often", I'm making my code available, that's all. I'm asking anyone interested for some review, before I have to re-code this for about the fifth time now. I'm trying to avoid Haskins' situation, where he's invented and debugged a lot of new code, and then been told to do it completely differently. Yes, the code I posted is only compile-tested, because quite a lot of code (kernel and userspace) must be working before anything works at all. I hate to design the whole thing, then be told that something fundamental about it is wrong, and have to completely re-write it. Thanks for the comments, Ira > -- > error compiling committee.c: too many arguments to function > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-22 15:25 ` Ira W. Snyder @ 2009-09-22 15:56 ` Avi Kivity 0 siblings, 0 replies; 83+ messages in thread From: Avi Kivity @ 2009-09-22 15:56 UTC (permalink / raw) To: Ira W. Snyder Cc: Gregory Haskins, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/22/2009 06:25 PM, Ira W. Snyder wrote: > >> Yes. vbus is more finely layered so there is less code duplication. >> >> The virtio layering was more or less dictated by Xen which doesn't have >> shared memory (it uses grant references instead). As a matter of fact >> lguest, kvm/pci, and kvm/s390 all have shared memory, as you do, so that >> part is duplicated. It's probably possible to add a virtio-shmem.ko >> library that people who do have shared memory can reuse. >> >> > Seems like a nice benefit of vbus. > Yes, it is. With some work virtio can gain that too (virtio-shmem.ko). >>> I've given it some thought, and I think that running vhost-net (or >>> similar) on the ppc boards, with virtio-net on the x86 crate server will >>> work. The virtio-ring abstraction is almost good enough to work for this >>> situation, but I had to re-invent it to work with my boards. >>> >>> I've exposed a 16K region of memory as PCI BAR1 from my ppc board. >>> Remember that this is the "host" system. I used each 4K block as a >>> "device descriptor" which contains: >>> >>> 1) the type of device, config space, etc. for virtio >>> 2) the "desc" table (virtio memory descriptors, see virtio-ring) >>> 3) the "avail" table (available entries in the desc table) >>> >>> >> Won't access from x86 be slow to this memory (on the other hand, if you >> change it to main memory access from ppc will be slow... really depends >> on how your system is tuned. >> >> > Writes across the bus are fast, reads across the bus are slow. 
These are > just the descriptor tables for memory buffers, not the physical memory > buffers themselves. > > These only need to be written by the guest (x86), and read by the host > (ppc). The host never changes the tables, so we can cache a copy in the > guest, for a fast detach_buf() implementation (see virtio-ring, which > I'm copying the design from). > > The only accesses are writes across the PCI bus. There is never a need > to do a read (except for slow-path configuration). > Okay, sounds like what you're doing is optimal then. > In the spirit of "post early and often", I'm making my code available, > that's all. I'm asking anyone interested for some review, before I have > to re-code this for about the fifth time now. I'm trying to avoid > Haskins' situation, where he's invented and debugged a lot of new code, > and then been told to do it completely differently. > > Yes, the code I posted is only compile-tested, because quite a lot of > code (kernel and userspace) must be working before anything works at > all. I hate to design the whole thing, then be told that something > fundamental about it is wrong, and have to completely re-write it. > Understood. Best to get a review from Rusty then. -- error compiling committee.c: too many arguments to function -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-22 9:43 ` Avi Kivity 2009-09-22 15:25 ` Ira W. Snyder @ 2009-09-23 14:26 ` Gregory Haskins 2009-09-23 14:37 ` Avi Kivity 1 sibling, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-23 14:26 UTC (permalink / raw) To: Avi Kivity Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 4551 bytes --] Avi Kivity wrote: > On 09/22/2009 12:43 AM, Ira W. Snyder wrote: >> >>> Sure, virtio-ira and he is on his own to make a bus-model under that, or >>> virtio-vbus + vbus-ira-connector to use the vbus framework. Either >>> model can work, I agree. >>> >>> >> Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, and >> virtio-s390. It isn't especially easy. I can steal lots of code from the >> lguest bus model, but sometimes it is good to generalize, especially >> after the fourth implemention or so. I think this is what GHaskins tried >> to do. >> > > Yes. vbus is more finely layered so there is less code duplication. To clarify, Ira was correct in stating this generalizing some of these components was one of the goals for the vbus project: IOW vbus finely layers and defines what's below virtio, not replaces it. You can think of a virtio-stack like this: -------------------------- | virtio-net -------------------------- | virtio-ring -------------------------- | virtio-bus -------------------------- | ? undefined ? -------------------------- IOW: The way I see it, virtio is a device interface model only. The rest of it is filled in by the virtio-transport and some kind of back-end. So today, we can complete the "? undefined ?" 
block like this for KVM: -------------------------- | virtio-pci -------------------------- | -------------------------- | kvm.ko -------------------------- | qemu -------------------------- | tuntap -------------------------- In this case, kvm.ko and tuntap are providing plumbing, and qemu is providing a backend device model (pci-based, etc). You can, of course, plug a different stack in (such as virtio-lguest, virtio-ira, etc) but you are more or less on your own to recreate many of the various facilities contained in that stack (such as things provided by QEMU, like discovery/hotswap/addressing), as Ira is discovering. Vbus tries to commoditize more components in the stack (like the bus model and backend-device model) so they don't need to be redesigned each time we solve this "virtio-transport" problem. IOW: stop the proliferation of the need for pci-bus, lguest-bus, foo-bus underneath virtio. Instead, we can then focus on the value add on top, like the models themselves or the simple glue between them. So now you might have something like -------------------------- | virtio-vbus -------------------------- | vbus-proxy -------------------------- | kvm-guest-connector -------------------------- | -------------------------- | kvm.ko -------------------------- | kvm-host-connector.ko -------------------------- | vbus.ko -------------------------- | virtio-net-backend.ko -------------------------- so now we don't need to worry about the bus-model or the device-model framework. We only need to implement the connector, etc. This is handy when you find yourself in an environment that doesn't support PCI (such as Ira's rig, or userspace containers), or when you want to add features that PCI doesn't have (such as fluid event channels for things like IPC services, or priortizable interrupts, etc). > > The virtio layering was more or less dictated by Xen which doesn't have > shared memory (it uses grant references instead). 
As a matter of fact > lguest, kvm/pci, and kvm/s390 all have shared memory, as you do, so that > part is duplicated. It's probably possible to add a virtio-shmem.ko > library that people who do have shared memory can reuse. Note that I do not believe the Xen folk use virtio, so while I can appreciate the foresight that went into that particular aspect of the design of the virtio model, I am not sure if its a realistic constraint. The reason why I decided to not worry about that particular model is twofold: 1) Trying to support non shared-memory designs is prohibitively high for my performance goals (for instance, requiring an exit on each ->add_buf() in addition to the ->kick()). 2) The Xen guys are unlikely to diverge from something like xenbus/xennet anyway, so it would be for naught. Therefore, I just went with a device model optimized for shared-memory outright. That said, I believe we can refactor what is called the "vbus-proxy-device" into this virtio-shmem interface that you and Anthony have described. We could make the feature optional and only support on architectures where this makes sense. <snip> Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-23 14:26 ` Gregory Haskins @ 2009-09-23 14:37 ` Avi Kivity 2009-09-23 15:10 ` Gregory Haskins 0 siblings, 1 reply; 83+ messages in thread From: Avi Kivity @ 2009-09-23 14:37 UTC (permalink / raw) To: Gregory Haskins Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/23/2009 05:26 PM, Gregory Haskins wrote: > > >>> Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, and >>> virtio-s390. It isn't especially easy. I can steal lots of code from the >>> lguest bus model, but sometimes it is good to generalize, especially >>> after the fourth implemention or so. I think this is what GHaskins tried >>> to do. >>> >>> >> Yes. vbus is more finely layered so there is less code duplication. >> > To clarify, Ira was correct in stating this generalizing some of these > components was one of the goals for the vbus project: IOW vbus finely > layers and defines what's below virtio, not replaces it. > > You can think of a virtio-stack like this: > > -------------------------- > | virtio-net > -------------------------- > | virtio-ring > -------------------------- > | virtio-bus > -------------------------- > | ? undefined ? > -------------------------- > > IOW: The way I see it, virtio is a device interface model only. The > rest of it is filled in by the virtio-transport and some kind of back-end. > > So today, we can complete the "? undefined ?" block like this for KVM: > > -------------------------- > | virtio-pci > -------------------------- > | > -------------------------- > | kvm.ko > -------------------------- > | qemu > -------------------------- > | tuntap > -------------------------- > > In this case, kvm.ko and tuntap are providing plumbing, and qemu is > providing a backend device model (pci-based, etc). 
> > You can, of course, plug a different stack in (such as virtio-lguest, > virtio-ira, etc) but you are more or less on your own to recreate many > of the various facilities contained in that stack (such as things > provided by QEMU, like discovery/hotswap/addressing), as Ira is discovering. > > Vbus tries to commoditize more components in the stack (like the bus > model and backend-device model) so they don't need to be redesigned each > time we solve this "virtio-transport" problem. IOW: stop the > proliferation of the need for pci-bus, lguest-bus, foo-bus underneath > virtio. Instead, we can then focus on the value add on top, like the > models themselves or the simple glue between them. > > So now you might have something like > > -------------------------- > | virtio-vbus > -------------------------- > | vbus-proxy > -------------------------- > | kvm-guest-connector > -------------------------- > | > -------------------------- > | kvm.ko > -------------------------- > | kvm-host-connector.ko > -------------------------- > | vbus.ko > -------------------------- > | virtio-net-backend.ko > -------------------------- > > so now we don't need to worry about the bus-model or the device-model > framework. We only need to implement the connector, etc. This is handy > when you find yourself in an environment that doesn't support PCI (such > as Ira's rig, or userspace containers), or when you want to add features > that PCI doesn't have (such as fluid event channels for things like IPC > services, or priortizable interrupts, etc). > Well, vbus does more, for example it tunnels interrupts instead of exposing them 1:1 on the native interface if it exists. It also pulls parts of the device model into the host kernel. >> The virtio layering was more or less dictated by Xen which doesn't have >> shared memory (it uses grant references instead). As a matter of fact >> lguest, kvm/pci, and kvm/s390 all have shared memory, as you do, so that >> part is duplicated. 
It's probably possible to add a virtio-shmem.ko >> library that people who do have shared memory can reuse. >> > Note that I do not believe the Xen folk use virtio, so while I can > appreciate the foresight that went into that particular aspect of the > design of the virtio model, I am not sure if its a realistic constraint. > Since a virtio goal was to reduce virtual device driver proliferation, it was necessary to accommodate Xen. -- error compiling committee.c: too many arguments to function -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-23 14:37 ` Avi Kivity @ 2009-09-23 15:10 ` Gregory Haskins 2009-09-23 17:58 ` Gregory Haskins 0 siblings, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-23 15:10 UTC (permalink / raw) To: Avi Kivity Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 7035 bytes --] Avi Kivity wrote: > On 09/23/2009 05:26 PM, Gregory Haskins wrote: >> >> >>>> Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, >>>> and >>>> virtio-s390. It isn't especially easy. I can steal lots of code from >>>> the >>>> lguest bus model, but sometimes it is good to generalize, especially >>>> after the fourth implemention or so. I think this is what GHaskins >>>> tried >>>> to do. >>>> >>>> >>> Yes. vbus is more finely layered so there is less code duplication. >>> >> To clarify, Ira was correct in stating this generalizing some of these >> components was one of the goals for the vbus project: IOW vbus finely >> layers and defines what's below virtio, not replaces it. >> >> You can think of a virtio-stack like this: >> >> -------------------------- >> | virtio-net >> -------------------------- >> | virtio-ring >> -------------------------- >> | virtio-bus >> -------------------------- >> | ? undefined ? >> -------------------------- >> >> IOW: The way I see it, virtio is a device interface model only. The >> rest of it is filled in by the virtio-transport and some kind of >> back-end. >> >> So today, we can complete the "? undefined ?" 
block like this for KVM: >> >> -------------------------- >> | virtio-pci >> -------------------------- >> | >> -------------------------- >> | kvm.ko >> -------------------------- >> | qemu >> -------------------------- >> | tuntap >> -------------------------- >> >> In this case, kvm.ko and tuntap are providing plumbing, and qemu is >> providing a backend device model (pci-based, etc). >> >> You can, of course, plug a different stack in (such as virtio-lguest, >> virtio-ira, etc) but you are more or less on your own to recreate many >> of the various facilities contained in that stack (such as things >> provided by QEMU, like discovery/hotswap/addressing), as Ira is >> discovering. >> >> Vbus tries to commoditize more components in the stack (like the bus >> model and backend-device model) so they don't need to be redesigned each >> time we solve this "virtio-transport" problem. IOW: stop the >> proliferation of the need for pci-bus, lguest-bus, foo-bus underneath >> virtio. Instead, we can then focus on the value add on top, like the >> models themselves or the simple glue between them. >> >> So now you might have something like >> >> -------------------------- >> | virtio-vbus >> -------------------------- >> | vbus-proxy >> -------------------------- >> | kvm-guest-connector >> -------------------------- >> | >> -------------------------- >> | kvm.ko >> -------------------------- >> | kvm-host-connector.ko >> -------------------------- >> | vbus.ko >> -------------------------- >> | virtio-net-backend.ko >> -------------------------- >> >> so now we don't need to worry about the bus-model or the device-model >> framework. We only need to implement the connector, etc. This is handy >> when you find yourself in an environment that doesn't support PCI (such >> as Ira's rig, or userspace containers), or when you want to add features >> that PCI doesn't have (such as fluid event channels for things like IPC >> services, or priortizable interrupts, etc). 
>> > > Well, vbus does more, for example it tunnels interrupts instead of > exposing them 1:1 on the native interface if it exists. As I've previously explained, that trait is a function of the kvm-connector I've chosen to implement, not of the overall design of vbus. The reason why my kvm-connector is designed that way is because my early testing/benchmarking shows one of the issues in KVM performance is the ratio of exits per IO operation are fairly high, especially as your scale io-load. Therefore, the connector achieves a substantial reduction in that ratio by treating "interrupts" to the same kind of benefits that NAPI brought to general networking: That is, we enqueue "interrupt" messages into a lockless ring and only hit the IDT for the first occurrence. Subsequent interrupts are injected in a parallel/lockless manner, without hitting the IDT nor incurring an extra EOI. This pays dividends as the IO rate increases, which is when the guest needs the most help. OTOH, it is entirely possible to design the connector such that we maintain a 1:1 ratio of signals to traditional IDT interrupts. It is also possible to design a connector which surfaces as something else, such as PCI devices (by terminating the connector in QEMU and utilizing its PCI emulation facilities), which would naturally employ 1:1 mapping. So if 1:1 mapping is a critical feature (I would argue to the contrary), vbus can support it. > It also pulls parts of the device model into the host kernel. That is the point. Most of it needs to be there for performance. And what doesn't need to be there for performance can either be: a) skipped at the discretion of the connector/device-model designer OR b) included because its trivially small subset of the model (e.g. a mac-addr attribute) and its nice to have a cohesive solution instead of requiring a separate binary blob that can get out of sync, etc. The example Ive provided to date (venet on kvm) utilizes (b), but it certainly doesn't have to. 
Therefore, I don't think vbus as a whole can be judged on this one point. > >>> The virtio layering was more or less dictated by Xen which doesn't have >>> shared memory (it uses grant references instead). As a matter of fact >>> lguest, kvm/pci, and kvm/s390 all have shared memory, as you do, so that >>> part is duplicated. It's probably possible to add a virtio-shmem.ko >>> library that people who do have shared memory can reuse. >>> >> Note that I do not believe the Xen folk use virtio, so while I can >> appreciate the foresight that went into that particular aspect of the >> design of the virtio model, I am not sure if its a realistic constraint. >> > > Since a virtio goal was to reduce virtual device driver proliferation, > it was necessary to accommodate Xen. Fair enough, but I don't think the Xen community will ever use it. To your point, a vbus goal was to reduce the bus-model and backend-device-model proliferation for environments served by Linux as the host. This naturally complements virtio's driver non-proliferation goal, but probably excludes Xen for reasons beyond the lack of shmem (since it has its own non-linux hypervisor kernel). In any case, I've already stated that we simply make the virtio-shmem (vbus-proxy-device) facility optionally defined, and unavailable on non-shmem based architectures to work around that issue. The alternative is that we abstract the shmem concept further (ala ->add_buf() from the virtqueue world) but it is probably pointless to try to accommodate shared-memory if you don't really have it, and no-one will likely use it. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-23 15:10 ` Gregory Haskins @ 2009-09-23 17:58 ` Gregory Haskins 2009-09-23 19:37 ` Avi Kivity 0 siblings, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-23 17:58 UTC (permalink / raw) To: Avi Kivity Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 11641 bytes --] Gregory Haskins wrote: > Avi Kivity wrote: >> On 09/23/2009 05:26 PM, Gregory Haskins wrote: >>> >>>>> Yes, I'm having to create my own bus model, a-la lguest, virtio-pci, >>>>> and >>>>> virtio-s390. It isn't especially easy. I can steal lots of code from >>>>> the >>>>> lguest bus model, but sometimes it is good to generalize, especially >>>>> after the fourth implemention or so. I think this is what GHaskins >>>>> tried >>>>> to do. >>>>> >>>>> >>>> Yes. vbus is more finely layered so there is less code duplication. >>>> >>> To clarify, Ira was correct in stating this generalizing some of these >>> components was one of the goals for the vbus project: IOW vbus finely >>> layers and defines what's below virtio, not replaces it. >>> >>> You can think of a virtio-stack like this: >>> >>> -------------------------- >>> | virtio-net >>> -------------------------- >>> | virtio-ring >>> -------------------------- >>> | virtio-bus >>> -------------------------- >>> | ? undefined ? >>> -------------------------- >>> >>> IOW: The way I see it, virtio is a device interface model only. The >>> rest of it is filled in by the virtio-transport and some kind of >>> back-end. >>> >>> So today, we can complete the "? undefined ?" 
block like this for KVM: >>> >>> -------------------------- >>> | virtio-pci >>> -------------------------- >>> | >>> -------------------------- >>> | kvm.ko >>> -------------------------- >>> | qemu >>> -------------------------- >>> | tuntap >>> -------------------------- >>> >>> In this case, kvm.ko and tuntap are providing plumbing, and qemu is >>> providing a backend device model (pci-based, etc). >>> >>> You can, of course, plug a different stack in (such as virtio-lguest, >>> virtio-ira, etc) but you are more or less on your own to recreate many >>> of the various facilities contained in that stack (such as things >>> provided by QEMU, like discovery/hotswap/addressing), as Ira is >>> discovering. >>> >>> Vbus tries to commoditize more components in the stack (like the bus >>> model and backend-device model) so they don't need to be redesigned each >>> time we solve this "virtio-transport" problem. IOW: stop the >>> proliferation of the need for pci-bus, lguest-bus, foo-bus underneath >>> virtio. Instead, we can then focus on the value add on top, like the >>> models themselves or the simple glue between them. >>> >>> So now you might have something like >>> >>> -------------------------- >>> | virtio-vbus >>> -------------------------- >>> | vbus-proxy >>> -------------------------- >>> | kvm-guest-connector >>> -------------------------- >>> | >>> -------------------------- >>> | kvm.ko >>> -------------------------- >>> | kvm-host-connector.ko >>> -------------------------- >>> | vbus.ko >>> -------------------------- >>> | virtio-net-backend.ko >>> -------------------------- >>> >>> so now we don't need to worry about the bus-model or the device-model >>> framework. We only need to implement the connector, etc. 
This is handy >>> when you find yourself in an environment that doesn't support PCI (such >>> as Ira's rig, or userspace containers), or when you want to add features >>> that PCI doesn't have (such as fluid event channels for things like IPC >>> services, or priortizable interrupts, etc). >>> >> Well, vbus does more, for example it tunnels interrupts instead of >> exposing them 1:1 on the native interface if it exists. > > As I've previously explained, that trait is a function of the > kvm-connector I've chosen to implement, not of the overall design of vbus. > > The reason why my kvm-connector is designed that way is because my early > testing/benchmarking shows one of the issues in KVM performance is the > ratio of exits per IO operation are fairly high, especially as your > scale io-load. Therefore, the connector achieves a substantial > reduction in that ratio by treating "interrupts" to the same kind of > benefits that NAPI brought to general networking: That is, we enqueue > "interrupt" messages into a lockless ring and only hit the IDT for the > first occurrence. Subsequent interrupts are injected in a > parallel/lockless manner, without hitting the IDT nor incurring an extra > EOI. This pays dividends as the IO rate increases, which is when the > guest needs the most help. > > OTOH, it is entirely possible to design the connector such that we > maintain a 1:1 ratio of signals to traditional IDT interrupts. It is > also possible to design a connector which surfaces as something else, > such as PCI devices (by terminating the connector in QEMU and utilizing > its PCI emulation facilities), which would naturally employ 1:1 mapping. > > So if 1:1 mapping is a critical feature (I would argue to the contrary), > vbus can support it. > >> It also pulls parts of the device model into the host kernel. > > That is the point. Most of it needs to be there for performance. 
To clarify this point: There are various aspects about designing high-performance virtual devices such as providing the shortest paths possible between the physical resources and the consumers. Conversely, we also need to ensure that we meet proper isolation/protection guarantees at the same time. What this means is there are various aspects to any high-performance PV design that require to be placed in-kernel to maximize the performance yet properly isolate the guest. For instance, you are required to have your signal-path (interrupts and hypercalls), your memory-path (gpa translation), and addressing/isolation model in-kernel to maximize performance. Vbus accomplishes its in-kernel isolation model by providing a "container" concept, where objects are placed into this container by userspace. The host kernel enforces isolation/protection by using a namespace to identify objects that is only relevant within a specific container's context (namely, a "u32 dev-id"). The guest addresses the objects by its dev-id, and the kernel ensures that the guest can't access objects outside of its dev-id namespace. All that is required is a way to transport a message with a "devid" attribute as an address (such as DEVCALL(devid)) and the framework provides the rest of the decode+execute function. Contrast this to vhost+virtio-pci (called simply "vhost" from here). It is not immune to requiring in-kernel addressing support either, but rather it just does it differently (and its not as you might expect via qemu). Vhost relies on QEMU to render PCI objects to the guest, which the guest assigns resources (such as BARs, interrupts, etc). A PCI-BAR in this example may represent a PIO address for triggering some operation in the device-model's fast-path. For it to have meaning in the fast-path, KVM has to have in-kernel knowledge of what a PIO-exit is, and what to do with it (this is where pio-bus and ioeventfd come in). 
The programming of the PIO-exit and the ioeventfd are likewise controlled by some userspace management entity (i.e. qemu). The PIO address and value tuple form the address, and the ioeventfd framework within KVM provide the decode+execute function. This idea seemingly works fine, mind you, but it rides on top of a *lot* of stuff including but not limited to: the guests pci stack, the qemu pci emulation, kvm pio support, and ioeventfd. When you get into situations where you don't have PCI or even KVM underneath you (e.g. a userspace container, Ira's rig, etc) trying to recreate all of that PCI infrastructure for the sake of using PCI is, IMO, a lot of overhead for little gain. All you really need is a simple decode+execute mechanism, and a way to program it from userspace control. vbus tries to do just that: commoditize it so all you need is the transport of the control messages (like DEVCALL()), but the decode+execute itself is reuseable, even across various environments (like KVM or Iras rig). And we face similar situations with the signal-path and memory-path components...but lets take a look at the slow-path side. > And what doesn't need to be there for performance can either be: > > a) skipped at the discretion of the connector/device-model designer > > OR > > b) included because its trivially small subset of the model (e.g. a > mac-addr attribute) and its nice to have a cohesive solution instead of > requiring a separate binary blob that can get out of sync, etc. > > The example Ive provided to date (venet on kvm) utilizes (b), but it > certainly doesn't have to. Therefore, I don't think vbus as a whole can > be judged on this one point. For a given model, we have a grouping of operations for fast path and slow path. Fast path would be things like we just talked about (signal-path, memory-path, addressing model). Slow path would be things like device discovery (and hotswap), config-space, etc. 
And your argument, I believe, is that vbus allows both to be implemented in the kernel (though to reiterate, its optional) and is therefore a bad design, so lets discuss that. I believe the assertion is that things like config-space are best left to userspace, and we should only relegate fast-path duties to the kernel. The problem is that, in my experience, a good deal of config-space actually influences the fast-path and thus needs to interact with the fast-path mechanism eventually anyway. Whats left over that doesn't fall into this category may cheaply ride on existing plumbing, so its not like we created something new or unnatural just to support this subclass of config-space. For example: take an attribute like the mac-address assigned to a NIC. This clearly doesn't need to be in-kernel and could go either way (such as a PCI config-space register). As another example: consider an option bit that enables a new feature that affects the fast-path, like RXBUF merging. If we use the split model where config space is handled by userspace and fast-path is in-kernel, the userspace component is only going to act as a proxy. I.e. it will pass the option down to the kernel eventually. Therefore, there is little gain in trying to split this type of slow-path out to userspace. In fact, its more work. vbus addresses this observation by providing a very simple (yet hopefully powerful) model of providing two basic verbs to a device: dev->call() dev->shm() It makes no distinction of slow or fast-path type operations, per se. Just a mechanism for synchronous or asynchronous communication. It is expected that a given component will build "config-space" primarily from the synchronous ->call() interface if it requires one. However, it gets this for free since we need ->call() for fast-path too (like the rt-scheduler device, etc). 
So I can then use ->call to perform a fast-path scheduler update (has to go in-kernel for performance), an "enable rxbuf-merge" function (has to end up in-kernel eventually), or a "macquery" (doesn't need to be in-kernel). My choice was to support that third operation in-kernel as well, because it's way more complicated to do it another way than it is to simply export a sysfs attribute to set it. Userspace is still completely in control..it sets the value. It just doesn't have to write plumbing to make it accessible. The basic vbus model inherently provides this. That's enough for now. We can talk about discovery/hotswap at a later time. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-23 17:58 ` Gregory Haskins @ 2009-09-23 19:37 ` Avi Kivity 2009-09-23 21:15 ` Gregory Haskins 2009-09-24 8:03 ` Avi Kivity 0 siblings, 2 replies; 83+ messages in thread From: Avi Kivity @ 2009-09-23 19:37 UTC (permalink / raw) To: Gregory Haskins Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/23/2009 08:58 PM, Gregory Haskins wrote: >> >>> It also pulls parts of the device model into the host kernel. >>> >> That is the point. Most of it needs to be there for performance. >> > To clarify this point: > > There are various aspects about designing high-performance virtual > devices such as providing the shortest paths possible between the > physical resources and the consumers. Conversely, we also need to > ensure that we meet proper isolation/protection guarantees at the same > time. What this means is there are various aspects to any > high-performance PV design that require to be placed in-kernel to > maximize the performance yet properly isolate the guest. > > For instance, you are required to have your signal-path (interrupts and > hypercalls), your memory-path (gpa translation), and > addressing/isolation model in-kernel to maximize performance. > Exactly. That's what vhost puts into the kernel and nothing more. > Vbus accomplishes its in-kernel isolation model by providing a > "container" concept, where objects are placed into this container by > userspace. The host kernel enforces isolation/protection by using a > namespace to identify objects that is only relevant within a specific > container's context (namely, a "u32 dev-id"). The guest addresses the > objects by its dev-id, and the kernel ensures that the guest can't > access objects outside of its dev-id namespace. > vhost manages to accomplish this without any kernel support. 
The guest simply has not access to any vhost resources other than the guest->host doorbell, which is handed to the guest outside vhost (so it's somebody else's problem, in userspace). > All that is required is a way to transport a message with a "devid" > attribute as an address (such as DEVCALL(devid)) and the framework > provides the rest of the decode+execute function. > vhost avoids that. > Contrast this to vhost+virtio-pci (called simply "vhost" from here). > It's the wrong name. vhost implements only the data path. > It is not immune to requiring in-kernel addressing support either, but > rather it just does it differently (and its not as you might expect via > qemu). > > Vhost relies on QEMU to render PCI objects to the guest, which the guest > assigns resources (such as BARs, interrupts, etc). vhost does not rely on qemu. It relies on its user to handle configuration. In one important case it's qemu+pci. It could just as well be the lguest launcher. > A PCI-BAR in this > example may represent a PIO address for triggering some operation in the > device-model's fast-path. For it to have meaning in the fast-path, KVM > has to have in-kernel knowledge of what a PIO-exit is, and what to do > with it (this is where pio-bus and ioeventfd come in). The programming > of the PIO-exit and the ioeventfd are likewise controlled by some > userspace management entity (i.e. qemu). The PIO address and value > tuple form the address, and the ioeventfd framework within KVM provide > the decode+execute function. > Right. > This idea seemingly works fine, mind you, but it rides on top of a *lot* > of stuff including but not limited to: the guests pci stack, the qemu > pci emulation, kvm pio support, and ioeventfd. When you get into > situations where you don't have PCI or even KVM underneath you (e.g. a > userspace container, Ira's rig, etc) trying to recreate all of that PCI > infrastructure for the sake of using PCI is, IMO, a lot of overhead for > little gain. 
> For the N+1th time, no. vhost is perfectly usable without pci. Can we stop raising and debunking this point? > All you really need is a simple decode+execute mechanism, and a way to > program it from userspace control. vbus tries to do just that: > commoditize it so all you need is the transport of the control messages > (like DEVCALL()), but the decode+execute itself is reuseable, even > across various environments (like KVM or Iras rig). > If you think it should be "commoditized", write libvhostconfig.so. > And your argument, I believe, is that vbus allows both to be implemented > in the kernel (though to reiterate, its optional) and is therefore a bad > design, so lets discuss that. > > I believe the assertion is that things like config-space are best left > to userspace, and we should only relegate fast-path duties to the > kernel. The problem is that, in my experience, a good deal of > config-space actually influences the fast-path and thus needs to > interact with the fast-path mechanism eventually anyway. > Whats left > over that doesn't fall into this category may cheaply ride on existing > plumbing, so its not like we created something new or unnatural just to > support this subclass of config-space. > Flexibility is reduced, because changing code in the kernel is more expensive than in userspace, and kernel/user interfaces aren't typically as wide as pure userspace interfaces. Security is reduced, since a bug in the kernel affects the host, while a bug in userspace affects just one guest. Example: feature negotiation. If it happens in userspace, it's easy to limit what features we expose to the guest. If it happens in the kernel, we need to add an interface to let the kernel know which features it should expose to the guest. We also need to add an interface to let userspace know which features were negotiated, if we want to implement live migration. Something fairly trivial bloats rapidly.
> For example: take an attribute like the mac-address assigned to a NIC. > This clearly doesn't need to be in-kernel and could go either way (such > as a PCI config-space register). > > As another example: consider an option bit that enables a new feature > that affects the fast-path, like RXBUF merging. If we use the split > model where config space is handled by userspace and fast-path is > in-kernel, the userspace component is only going to act as a proxy. > I.e. it will pass the option down to the kernel eventually. Therefore, > there is little gain in trying to split this type of slow-path out to > userspace. In fact, its more work. > As you can see above, userspace needs to be involved in this, and the number of interfaces required is smaller if it's in userspace: you only need to know which features the kernel supports (they can be enabled unconditionally, just not exposed). Further, some devices are perfectly happy to be implemented in userspace, so we need userspace configuration support anyway. Why reimplement it in the kernel? -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-23 19:37 ` Avi Kivity @ 2009-09-23 21:15 ` Gregory Haskins 2009-09-24 7:18 ` Avi Kivity 2009-09-24 8:03 ` Avi Kivity 1 sibling, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-23 21:15 UTC (permalink / raw) To: Avi Kivity Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 13736 bytes --] Avi Kivity wrote: > On 09/23/2009 08:58 PM, Gregory Haskins wrote: >>> >>>> It also pulls parts of the device model into the host kernel. >>>> >>> That is the point. Most of it needs to be there for performance. >>> >> To clarify this point: >> >> There are various aspects about designing high-performance virtual >> devices such as providing the shortest paths possible between the >> physical resources and the consumers. Conversely, we also need to >> ensure that we meet proper isolation/protection guarantees at the same >> time. What this means is there are various aspects to any >> high-performance PV design that require to be placed in-kernel to >> maximize the performance yet properly isolate the guest. >> >> For instance, you are required to have your signal-path (interrupts and >> hypercalls), your memory-path (gpa translation), and >> addressing/isolation model in-kernel to maximize performance. >> > > Exactly. That's what vhost puts into the kernel and nothing more. Actually, no. Generally, _KVM_ puts those things into the kernel, and vhost consumes them. Without KVM (or something equivalent), vhost is incomplete. One of my goals with vbus is to generalize the "something equivalent" part here. I know you may not care about non-kvm use cases, and thats fine. No one says you have to. However, note that some of use do care about these non-kvm cases, and thus its a distinction I am making here as a benefit of the vbus framework. 
> >> Vbus accomplishes its in-kernel isolation model by providing a >> "container" concept, where objects are placed into this container by >> userspace. The host kernel enforces isolation/protection by using a >> namespace to identify objects that is only relevant within a specific >> container's context (namely, a "u32 dev-id"). The guest addresses the >> objects by its dev-id, and the kernel ensures that the guest can't >> access objects outside of its dev-id namespace. >> > > vhost manages to accomplish this without any kernel support. No, vhost manages to accomplish this because of KVMs kernel support (ioeventfd, etc). Without a KVM-like in-kernel support, vhost is a merely a kind of "tuntap"-like clone signalled by eventfds. vbus on the other hand, generalizes one more piece of the puzzle (namely, the function of pio+ioeventfd and userspace's programming of it) by presenting the devid namespace and container concept. This goes directly to my rebuttal of your claim that vbus places too much in the kernel. I state that, one way or the other, address decode and isolation _must_ be in the kernel for performance. Vbus does this with a devid/container scheme. vhost+virtio-pci+kvm does it with pci+pio+ioeventfd. > The guest > simply has not access to any vhost resources other than the guest->host > doorbell, which is handed to the guest outside vhost (so it's somebody > else's problem, in userspace). You mean _controlled_ by userspace, right? Obviously, the other side of the kernel still needs to be programmed (ioeventfd, etc). Otherwise, vhost would be pointless: e.g. just use vanilla tuntap if you don't need fast in-kernel decoding. > >> All that is required is a way to transport a message with a "devid" >> attribute as an address (such as DEVCALL(devid)) and the framework >> provides the rest of the decode+execute function. >> > > vhost avoids that. No, it doesn't avoid it. It just doesn't specify how its done, and relies on something else to do it on its behalf. 
Conversely, vbus specifies how its done, but not how to transport the verb "across the wire". That is the role of the vbus-connector abstraction. > >> Contrast this to vhost+virtio-pci (called simply "vhost" from here). >> > > It's the wrong name. vhost implements only the data path. Understood, but vhost+virtio-pci is what I am contrasting, and I use "vhost" for short from that point on because I am too lazy to type the whole name over and over ;) > >> It is not immune to requiring in-kernel addressing support either, but >> rather it just does it differently (and its not as you might expect via >> qemu). >> >> Vhost relies on QEMU to render PCI objects to the guest, which the guest >> assigns resources (such as BARs, interrupts, etc). > > vhost does not rely on qemu. It relies on its user to handle > configuration. In one important case it's qemu+pci. It could just as > well be the lguest launcher. I meant vhost=vhost+virtio-pci here. Sorry for the confusion. The point I am making specifically is that vhost in general relies on other in-kernel components to function. I.e. It cannot function without having something like the PCI model to build an IO namespace. That namespace (in this case, pio addresses+data tuples) are used for the in-kernel addressing function under KVM + virtio-pci. The case of the lguest launcher is a good one to highlight. Yes, you can presumably also use lguest with vhost, if the requisite facilities are exposed to lguest-bus, and some eventfd based thing like ioeventfd is written for the host (if it doesnt exist already). And when the next virt design "foo" comes out, it can make a "foo-bus" model, and implement foo-eventfd on the backend, etc, etc. Ira can make ira-bus, and ira-eventfd, etc, etc. Each iteration will invariably introduce duplicated parts of the stack. Vbus tries to generalize some of those pieces so we can reuse them. 
I chose the very non-specific name "virtual-bus" for the design intentionally to decouple it from any one particular "hypervisor" (e.g. xenbus, lguest-bus, etc) and promote it as a general purpose bus for hopefully any hypervisor (or physical systems too, e.g. Iras). I assume "virtio" was chosen to reflect a similar positioning at the device-model layer. Had vbus come out before lguest, I would have proposed that lguest should use it natively instead of creating lguest-bus. While its probably too late in that specific case, perhaps going forward this is the direction we can take, just like perhaps virtio is the device model direction we can take. Likewise, the backend is generalized so one model can be written that works in all environments that support vbus. The connector takes care of the "wire" details, and the other stuff functions to serve the bus portion of the stack (signal-routing, memory-routing, isolation/addressing, etc). > >> A PCI-BAR in this >> example may represent a PIO address for triggering some operation in the >> device-model's fast-path. For it to have meaning in the fast-path, KVM >> has to have in-kernel knowledge of what a PIO-exit is, and what to do >> with it (this is where pio-bus and ioeventfd come in). The programming >> of the PIO-exit and the ioeventfd are likewise controlled by some >> userspace management entity (i.e. qemu). The PIO address and value >> tuple form the address, and the ioeventfd framework within KVM provide >> the decode+execute function. >> > > Right. > >> This idea seemingly works fine, mind you, but it rides on top of a *lot* >> of stuff including but not limited to: the guests pci stack, the qemu >> pci emulation, kvm pio support, and ioeventfd. When you get into >> situations where you don't have PCI or even KVM underneath you (e.g. a >> userspace container, Ira's rig, etc) trying to recreate all of that PCI >> infrastructure for the sake of using PCI is, IMO, a lot of overhead for >> little gain. 
>> > > For the N+1th time, no. vhost is perfectly usable without pci. Can we > stop raising and debunking this point? Again, I understand vhost is decoupled from PCI, and I don't mean to imply anything different. I use PCI as an example here because a) its the only working example of vhost today (to my knowledge), and b) you have stated in the past that PCI is the only "right" way here, to paraphrase. Perhaps you no longer feel that way, so I apologize if you feel you already recanted your position on PCI and I missed it. I digress. My point here isn't PCI. The point here is the missing component for when PCI is not present. The component that is partially satisfied by vbus's devid addressing scheme. If you are going to use vhost, and you don't have PCI, you've gotta build something to replace it. > >> All you really need is a simple decode+execute mechanism, and a way to >> program it from userspace control. vbus tries to do just that: >> commoditize it so all you need is the transport of the control messages >> (like DEVCALL()), but the decode+execute itself is reuseable, even >> across various environments (like KVM or Iras rig). >> > > If you think it should be "commodotized", write libvhostconfig.so. I know you are probably being facetious here, but what do you propose for the parts that must be in-kernel? > >> And your argument, I believe, is that vbus allows both to be implemented >> in the kernel (though to reiterate, its optional) and is therefore a bad >> design, so lets discuss that. >> >> I believe the assertion is that things like config-space are best left >> to userspace, and we should only relegate fast-path duties to the >> kernel. The problem is that, in my experience, a good deal of >> config-space actually influences the fast-path and thus needs to >> interact with the fast-path mechanism eventually anyway. 
>> Whats left >> over that doesn't fall into this category may cheaply ride on existing >> plumbing, so its not like we created something new or unnatural just to >> support this subclass of config-space. >> > > Flexibility is reduced, because changing code in the kernel is more > expensive than in userspace, and kernel/user interfaces aren't typically > as wide as pure userspace interfaces. Security is reduced, since a bug > in the kernel affects the host, while a bug in userspace affects just on > guest. For a mac-address attribute? Thats all we are really talking about here. These points you raise, while true of any kernel code I suppose, are a bit of a stretch in this context. > > Example: feature negotiation. If it happens in userspace, it's easy to > limit what features we expose to the guest. Its not any harder in the kernel. I do this today. And when you are done negotiating said features, you will generally have to turn around and program the feature into the backend anyway (e.g. ioctl() to vhost module). Now you have to maintain some knowledge of that particular feature and how to program it in two places. Conversely, I am eliminating the (unnecessary) middleman by letting the feature negotiating take place directly between the two entities that will consume it. > If it happens in the > kernel, we need to add an interface to let the kernel know which > features it should expose to the guest. You need this already either way for both models anyway. As an added bonus, vbus has generalized that interface using sysfs attributes, so all models are handled in a similar and community accepted way. > We also need to add an > interface to let userspace know which features were negotiated, if we > want to implement live migration. Something fairly trivial bloats rapidly. Can you elaborate on the requirements for live-migration? Wouldnt an opaque save/restore model work here? (e.g. 
why does userspace need to be able to interpret the in-kernel state, just pass it along as a blob to the new instance). > >> For example: take an attribute like the mac-address assigned to a NIC. >> This clearly doesn't need to be in-kernel and could go either way (such >> as a PCI config-space register). >> >> As another example: consider an option bit that enables a new feature >> that affects the fast-path, like RXBUF merging. If we use the split >> model where config space is handled by userspace and fast-path is >> in-kernel, the userspace component is only going to act as a proxy. >> I.e. it will pass the option down to the kernel eventually. Therefore, >> there is little gain in trying to split this type of slow-path out to >> userspace. In fact, its more work. >> > > As you can see above, userspace needs to be involved in this, and the > number of interfaces required is smaller if it's in userspace: Actually, no. My experience has been the opposite. Anytime I sat down and tried to satisfy your request to move things to the userspace, things got ugly and duplicative really quick. I suspect part of the reason you may think its easier because you already have part of virtio-net in userspace and its surrounding support, but that is not the case moving forward for new device types. > you only > need to know which features the kernel supports (they can be enabled > unconditionally, just not exposed). > > Further, some devices are perfectly happy to be implemented in > userspace, so we need userspace configuration support anyway. Why > reimplement it in the kernel? Thats fine. vbus is targetted for high-performance IO. So if you have a robust userspace (like KVM+QEMU) and low-performance constraints (say, for a console or something), put it in userspace and vbus is not involved. I don't care. 
However, if you are coming from somewhere else (like Ira's rig) where you don't necessarily have a robust userspace module, vbus provides a model that allows you to choose whether you want to do a vhost-like model, or a full resource container with the isolation guarantees, etc, built in. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-23 21:15 ` Gregory Haskins @ 2009-09-24 7:18 ` Avi Kivity 2009-09-24 18:03 ` Gregory Haskins 2009-09-24 19:27 ` Ira W. Snyder 0 siblings, 2 replies; 83+ messages in thread From: Avi Kivity @ 2009-09-24 7:18 UTC (permalink / raw) To: Gregory Haskins Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/24/2009 12:15 AM, Gregory Haskins wrote: > >>> There are various aspects about designing high-performance virtual >>> devices such as providing the shortest paths possible between the >>> physical resources and the consumers. Conversely, we also need to >>> ensure that we meet proper isolation/protection guarantees at the same >>> time. What this means is there are various aspects to any >>> high-performance PV design that require to be placed in-kernel to >>> maximize the performance yet properly isolate the guest. >>> >>> For instance, you are required to have your signal-path (interrupts and >>> hypercalls), your memory-path (gpa translation), and >>> addressing/isolation model in-kernel to maximize performance. >>> >>> >> Exactly. That's what vhost puts into the kernel and nothing more. >> > Actually, no. Generally, _KVM_ puts those things into the kernel, and > vhost consumes them. Without KVM (or something equivalent), vhost is > incomplete. One of my goals with vbus is to generalize the "something > equivalent" part here. > I don't really see how vhost and vbus are different here. vhost expects signalling to happen through a couple of eventfds and requires someone to supply them and implement kernel support (if needed). vbus requires someone to write a connector to provide the signalling implementation. Neither will work out-of-the-box when implementing virtio-net over falling dominos, for example. 
>>> Vbus accomplishes its in-kernel isolation model by providing a >>> "container" concept, where objects are placed into this container by >>> userspace. The host kernel enforces isolation/protection by using a >>> namespace to identify objects that is only relevant within a specific >>> container's context (namely, a "u32 dev-id"). The guest addresses the >>> objects by its dev-id, and the kernel ensures that the guest can't >>> access objects outside of its dev-id namespace. >>> >>> >> vhost manages to accomplish this without any kernel support. >> > No, vhost manages to accomplish this because of KVMs kernel support > (ioeventfd, etc). Without a KVM-like in-kernel support, vhost is a > merely a kind of "tuntap"-like clone signalled by eventfds. > Without a vbus-connector-falling-dominos, vbus-venet can't do anything either. Both vhost and vbus need an interface, vhost's is just narrower since it doesn't do configuration or enumeration. > This goes directly to my rebuttal of your claim that vbus places too > much in the kernel. I state that, one way or the other, address decode > and isolation _must_ be in the kernel for performance. Vbus does this > with a devid/container scheme. vhost+virtio-pci+kvm does it with > pci+pio+ioeventfd. > vbus doesn't do kvm guest address decoding for the fast path. It's still done by ioeventfd. >> The guest >> simply has not access to any vhost resources other than the guest->host >> doorbell, which is handed to the guest outside vhost (so it's somebody >> else's problem, in userspace). >> > You mean _controlled_ by userspace, right? Obviously, the other side of > the kernel still needs to be programmed (ioeventfd, etc). Otherwise, > vhost would be pointless: e.g. just use vanilla tuntap if you don't need > fast in-kernel decoding. > Yes (though for something like level-triggered interrupts we're probably keeping it in userspace, enjoying the benefits of vhost data path while paying more for signalling). 
>>> All that is required is a way to transport a message with a "devid" >>> attribute as an address (such as DEVCALL(devid)) and the framework >>> provides the rest of the decode+execute function. >>> >>> >> vhost avoids that. >> > No, it doesn't avoid it. It just doesn't specify how its done, and > relies on something else to do it on its behalf. > That someone else can be in userspace, apart from the actual fast path. > Conversely, vbus specifies how its done, but not how to transport the > verb "across the wire". That is the role of the vbus-connector abstraction. > So again, vbus does everything in the kernel (since it's so easy and cheap) but expects a vbus-connector. vhost does configuration in userspace (since it's so clunky and fragile) but expects a couple of eventfds. >>> Contrast this to vhost+virtio-pci (called simply "vhost" from here). >>> >>> >> It's the wrong name. vhost implements only the data path. >> > Understood, but vhost+virtio-pci is what I am contrasting, and I use > "vhost" for short from that point on because I am too lazy to type the > whole name over and over ;) > If you #define A A+B+C don't expect intelligent conversation afterwards. >>> It is not immune to requiring in-kernel addressing support either, but >>> rather it just does it differently (and its not as you might expect via >>> qemu). >>> >>> Vhost relies on QEMU to render PCI objects to the guest, which the guest >>> assigns resources (such as BARs, interrupts, etc). >>> >> vhost does not rely on qemu. It relies on its user to handle >> configuration. In one important case it's qemu+pci. It could just as >> well be the lguest launcher. >> > I meant vhost=vhost+virtio-pci here. Sorry for the confusion. > > The point I am making specifically is that vhost in general relies on > other in-kernel components to function. I.e. It cannot function without > having something like the PCI model to build an IO namespace. 
That > namespace (in this case, pio addresses+data tuples) are used for the > in-kernel addressing function under KVM + virtio-pci. > > The case of the lguest launcher is a good one to highlight. Yes, you > can presumably also use lguest with vhost, if the requisite facilities > are exposed to lguest-bus, and some eventfd based thing like ioeventfd > is written for the host (if it doesnt exist already). > > And when the next virt design "foo" comes out, it can make a "foo-bus" > model, and implement foo-eventfd on the backend, etc, etc. > It's exactly the same with vbus needing additional connectors for additional transports. > Ira can make ira-bus, and ira-eventfd, etc, etc. > > Each iteration will invariably introduce duplicated parts of the stack. > Invariably? Use libraries (virtio-shmem.ko, libvhost.so). >> For the N+1th time, no. vhost is perfectly usable without pci. Can we >> stop raising and debunking this point? >> > Again, I understand vhost is decoupled from PCI, and I don't mean to > imply anything different. I use PCI as an example here because a) its > the only working example of vhost today (to my knowledge), and b) you > have stated in the past that PCI is the only "right" way here, to > paraphrase. Perhaps you no longer feel that way, so I apologize if you > feel you already recanted your position on PCI and I missed it. > For kvm/x86 pci definitely remains king. I was talking about the two lguest users and Ira. > I digress. My point here isn't PCI. The point here is the missing > component for when PCI is not present. The component that is partially > satisfied by vbus's devid addressing scheme. If you are going to use > vhost, and you don't have PCI, you've gotta build something to replace it. > Yes, that's why people have keyboards. They'll write that glue code if they need it. If it turns out to be a hit an people start having virtio transport module writing parties, they'll figure out a way to share code. 
>>> All you really need is a simple decode+execute mechanism, and a way to >>> program it from userspace control. vbus tries to do just that: >>> commoditize it so all you need is the transport of the control messages >>> (like DEVCALL()), but the decode+execute itself is reuseable, even >>> across various environments (like KVM or Iras rig). >>> >>> >> If you think it should be "commodotized", write libvhostconfig.so. >> > I know you are probably being facetious here, but what do you propose > for the parts that must be in-kernel? > On the guest side, virtio-shmem.ko can unify the ring access. It probably makes sense even today. On the host side I eventfd is the kernel interface and libvhostconfig.so can provide the configuration when an existing ABI is not imposed. >>> And your argument, I believe, is that vbus allows both to be implemented >>> in the kernel (though to reiterate, its optional) and is therefore a bad >>> design, so lets discuss that. >>> >>> I believe the assertion is that things like config-space are best left >>> to userspace, and we should only relegate fast-path duties to the >>> kernel. The problem is that, in my experience, a good deal of >>> config-space actually influences the fast-path and thus needs to >>> interact with the fast-path mechanism eventually anyway. >>> Whats left >>> over that doesn't fall into this category may cheaply ride on existing >>> plumbing, so its not like we created something new or unnatural just to >>> support this subclass of config-space. >>> >>> >> Flexibility is reduced, because changing code in the kernel is more >> expensive than in userspace, and kernel/user interfaces aren't typically >> as wide as pure userspace interfaces. Security is reduced, since a bug >> in the kernel affects the host, while a bug in userspace affects just on >> guest. >> > For a mac-address attribute? Thats all we are really talking about > here. 
These points you raise, while true of any kernel code I suppose, > are a bit of a stretch in this context. > Look at the virtio-net feature negotiation. There's a lot more there than the MAC address, and it's going to grow. >> Example: feature negotiation. If it happens in userspace, it's easy to >> limit what features we expose to the guest. >> > Its not any harder in the kernel. I do this today. > > And when you are done negotiating said features, you will generally have > to turn around and program the feature into the backend anyway (e.g. > ioctl() to vhost module). Now you have to maintain some knowledge of > that particular feature and how to program it in two places. > No, you can leave it enabled unconditionally in vhost (the guest won't use what it doesn't know about). > Conversely, I am eliminating the (unnecessary) middleman by letting the > feature negotiating take place directly between the two entities that > will consume it. > The middleman is necessary, if you want to support live migration, or to restrict a guest to a subset of your features. >> If it happens in the >> kernel, we need to add an interface to let the kernel know which >> features it should expose to the guest. >> > You need this already either way for both models anyway. As an added > bonus, vbus has generalized that interface using sysfs attributes, so > all models are handled in a similar and community accepted way. > vhost doesn't need it since userspace takes care of it. >> We also need to add an >> interface to let userspace know which features were negotiated, if we >> want to implement live migration. Something fairly trivial bloats rapidly. >> > Can you elaborate on the requirements for live-migration? Wouldnt an > opaque save/restore model work here? (e.g. why does userspace need to be > able to interpret the in-kernel state, just pass it along as a blob to > the new instance). > A blob would work, if you commit to forward and backward compatibility in the kernel side (i.e. 
an older kernel must be able to accept a blob from a newer one). I don't like blobs though, they tie you to the implemenetation. >> As you can see above, userspace needs to be involved in this, and the >> number of interfaces required is smaller if it's in userspace: >> > Actually, no. My experience has been the opposite. Anytime I sat down > and tried to satisfy your request to move things to the userspace, > things got ugly and duplicative really quick. I suspect part of the > reason you may think its easier because you already have part of > virtio-net in userspace and its surrounding support, but that is not the > case moving forward for new device types. > I can't comment on your experience, but we'll definitely build on existing code for new device types. >> you only >> need to know which features the kernel supports (they can be enabled >> unconditionally, just not exposed). >> >> Further, some devices are perfectly happy to be implemented in >> userspace, so we need userspace configuration support anyway. Why >> reimplement it in the kernel? >> > Thats fine. vbus is targetted for high-performance IO. So if you have > a robust userspace (like KVM+QEMU) and low-performance constraints (say, > for a console or something), put it in userspace and vbus is not > involved. I don't care. > So now the hypothetical non-pci hypervisor needs to support two busses. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-24 7:18 ` Avi Kivity @ 2009-09-24 18:03 ` Gregory Haskins 2009-09-25 8:22 ` Avi Kivity 2009-09-24 19:27 ` Ira W. Snyder 1 sibling, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-24 18:03 UTC (permalink / raw) To: Avi Kivity Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 24877 bytes --] Avi Kivity wrote: > On 09/24/2009 12:15 AM, Gregory Haskins wrote: >> >>>> There are various aspects about designing high-performance virtual >>>> devices such as providing the shortest paths possible between the >>>> physical resources and the consumers. Conversely, we also need to >>>> ensure that we meet proper isolation/protection guarantees at the same >>>> time. What this means is there are various aspects to any >>>> high-performance PV design that require to be placed in-kernel to >>>> maximize the performance yet properly isolate the guest. >>>> >>>> For instance, you are required to have your signal-path (interrupts and >>>> hypercalls), your memory-path (gpa translation), and >>>> addressing/isolation model in-kernel to maximize performance. >>>> >>>> >>> Exactly. That's what vhost puts into the kernel and nothing more. >>> >> Actually, no. Generally, _KVM_ puts those things into the kernel, and >> vhost consumes them. Without KVM (or something equivalent), vhost is >> incomplete. One of my goals with vbus is to generalize the "something >> equivalent" part here. >> > > I don't really see how vhost and vbus are different here. vhost expects > signalling to happen through a couple of eventfds and requires someone > to supply them and implement kernel support (if needed). vbus requires > someone to write a connector to provide the signalling implementation. 
> Neither will work out-of-the-box when implementing virtio-net over > falling dominos, for example. I realize in retrospect that my choice of words above implies vbus _is_ complete, but this is not what I was saying. What I was trying to convey is that vbus is _more_ complete. Yes, in either case some kind of glue needs to be written. The difference is that vbus implements more of the glue generally, and leaves less required to be customized for each iteration. Going back to our stack diagrams, you could think of a vhost solution like this: -------------------------- | virtio-net -------------------------- | virtio-ring -------------------------- | virtio-bus -------------------------- | ? undefined-1 ? -------------------------- | vhost -------------------------- and you could think of a vbus solution like this -------------------------- | virtio-net -------------------------- | virtio-ring -------------------------- | virtio-bus -------------------------- | bus-interface -------------------------- | ? undefined-2 ? -------------------------- | bus-model -------------------------- | virtio-net-device (vhost ported to vbus model? :) -------------------------- So the difference between vhost and vbus in this particular context is that you need to have "undefined-1" do device discovery/hotswap, config-space, address-decode/isolation, signal-path routing, memory-path routing, etc. Today this function is filled by things like virtio-pci, pci-bus, KVM/ioeventfd, and QEMU for x86. I am not as familiar with lguest, but presumably it is filled there by components like virtio-lguest, lguest-bus, lguest.ko, and lguest-launcher. And to use more contemporary examples, we might have virtio-domino, domino-bus, domino.ko, and domino-launcher as well as virtio-ira, ira-bus, ira.ko, and ira-launcher. 
Contrast this to the vbus stack: The bus-X components (when optionally employed by the connector designer) do device-discovery, hotswap, config-space, address-decode/isolation, signal-path and memory-path routing, etc in a general (and pv-centric) way. The "undefined-2" portion is the "connector", and just needs to convey messages like "DEVCALL" and "SHMSIGNAL". The rest is handled in other parts of the stack. So to answer your question, the difference is that the part that has to be customized in vbus should be a fraction of what needs to be customized with vhost because it defines more of the stack. And, as alluded to in my diagram, both virtio-net and vhost (with some modifications to fit into the vbus framework) are potentially complementary, not competitors. > >>>> Vbus accomplishes its in-kernel isolation model by providing a >>>> "container" concept, where objects are placed into this container by >>>> userspace. The host kernel enforces isolation/protection by using a >>>> namespace to identify objects that is only relevant within a specific >>>> container's context (namely, a "u32 dev-id"). The guest addresses the >>>> objects by its dev-id, and the kernel ensures that the guest can't >>>> access objects outside of its dev-id namespace. >>>> >>>> >>> vhost manages to accomplish this without any kernel support. >>> >> No, vhost manages to accomplish this because of KVMs kernel support >> (ioeventfd, etc). Without a KVM-like in-kernel support, vhost is a >> merely a kind of "tuntap"-like clone signalled by eventfds. >> > > Without a vbus-connector-falling-dominos, vbus-venet can't do anything > either. Mostly covered above... However, I was addressing your assertion that vhost somehow magically accomplishes this "container/addressing" function without any specific kernel support. This is incorrect. I contend that this kernel support is required and present. The difference is that it is defined elsewhere (and typically in a transport/arch specific way).
IOW: You can basically think of the programmed PIO addresses as forming its "container". Only addresses explicitly added are visible, and everything else is inaccessible. This whole discussion is merely a question of what's been generalized versus what needs to be re-implemented each time. > Both vhost and vbus need an interface, Agreed > vhost's is just narrower since it doesn't do configuration or enumeration. I would say that makes the vhost solution's interface wider, not narrower. With the vbus kvm-connector, simple vbus device instantiation implicitly registers it in the address/enumeration namespace, and transmits a devadd event. It does all that with no more interface complexity than instantiating a vhost device. However, vhost has to then also separately configure its address/enumeration space with other subsystems (e.g. pci, ioeventfd, msi, etc), and define its config-space twice. This means something in userspace has to proxy and/or refactor requests. This also means that the userspace component has to have some knowledge of _how_ to proxy/refactor said requests (i.e. splitting the design), which is another example of where the current vhost model really falls apart IMO. > >> This goes directly to my rebuttal of your claim that vbus places too >> much in the kernel. I state that, one way or the other, address decode >> and isolation _must_ be in the kernel for performance. Vbus does this >> with a devid/container scheme. vhost+virtio-pci+kvm does it with >> pci+pio+ioeventfd. >> > > vbus doesn't do kvm guest address decoding for the fast path. It's > still done by ioeventfd. That is not correct.
vbus does its own native address decoding in the fast path, such as here: http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=kernel/vbus/client.c;h=e85b2d92d629734866496b67455dd307486e394a;hb=e6cbd4d1decca8e829db3b2b9b6ec65330b379e9#l331 The connector delivers a SHMSIGNAL(id) message, and its decoded generically by an rcu protected radix tree. I think what you are thinking of is that my KVM-connector in AlacrityVM uses PIO/ioeventfd (*) as part of its transport to deliver that SHMSIGNAL message. In this sense, I am doing two address-decodes (one for the initial pio, one for the subsequent shmsignal), but this is an implementation detail of the KVM connector. (Also note that its an implementation detail that the KVM maintainer forced me into ;) The original vbus design utilized a global hypercall in place of the PIO, and thus the shmsignal was the only real decode occurring) (*) actually I dropped ioeventfd in my latest tree, but this is a separate topic. I still use KVM's pio-bus, however. > >>> The guest >>> simply has not access to any vhost resources other than the guest->host >>> doorbell, which is handed to the guest outside vhost (so it's somebody >>> else's problem, in userspace). >>> >> You mean _controlled_ by userspace, right? Obviously, the other side of >> the kernel still needs to be programmed (ioeventfd, etc). Otherwise, >> vhost would be pointless: e.g. just use vanilla tuntap if you don't need >> fast in-kernel decoding. >> > > Yes (though for something like level-triggered interrupts we're probably > keeping it in userspace, enjoying the benefits of vhost data path while > paying more for signalling). Thats fine. I am primarily interested in the high-performance IO, so low-perf/legacy components can fall back to something else like userspace if that best serves them. 
> >>>> All that is required is a way to transport a message with a "devid" >>>> attribute as an address (such as DEVCALL(devid)) and the framework >>>> provides the rest of the decode+execute function. >>>> >>>> >>> vhost avoids that. >>> >> No, it doesn't avoid it. It just doesn't specify how its done, and >> relies on something else to do it on its behalf. >> > > That someone else can be in userspace, apart from the actual fast path. No, this "devcall" like decoding _is_ fast path and it can't be in userspace if you care about performance. And if you don't care about performance, you can use existing facilities (like QEMU+tuntap) so vhost and vbus alike would become unnecessary in that scenario. > >> Conversely, vbus specifies how its done, but not how to transport the >> verb "across the wire". That is the role of the vbus-connector >> abstraction. >> > > So again, vbus does everything in the kernel (since it's so easy and > cheap) but expects a vbus-connector. vhost does configuration in > userspace (since it's so clunky and fragile) but expects a couple of > eventfds. Well, we are talking about fast-path here, so I am not sure why config-space is coming up in this context. I digress. I realize you are being sarcastic, but your easy+cheap/clunky+fragile assessment is more accurate than you perhaps realize. You keep extolling that vhost does most things in userspace and that is an advantage. But the simple fact is that they both functionally do almost the same amount in-kernel, because they _have_ to. This includes the obvious stuff like signal and memory routing, but also the less obvious stuff like most of config-space. Ultimately, most of config-space needs to terminate at the device-model (the one exception is perhaps "read-only attributes", like "MACQUERY"). Therefore, even if you use a vhost like model, most of your parameters will invariably be a translation from one config space to another and passed on (e.g. pci config-cycle to ioctl()). 
The disparity of in-kernel vs userspace functionality that remains between the two implementations is basically the enumeration/hotswap and read-only attribute functions. These functions are prohibitively complex in the vhost+virtio-pci+kvm model (full ICH/pci chipset emulation, etc), so I understand why we wouldn't want to move those in-kernel. However, vbus was designed from scratch specifically for PV to be flexible and simple. As a result, the remaining functions in the kvm-connector take advantage of this simplicity and just ride on the existing model that we needed for fast-path anyway. What this means is that there is no significant consequence to doing these few minor details in-kernel, other than this long discussion. In fact, it's actually a simpler design to unify things this way because you avoid splitting the device model up. Consider how painful the vhost implementation would be if it didn't already have the userspace virtio-net to fall back on. This is effectively what we face for new devices going forward if that model is to persist. > >>>> Contrast this to vhost+virtio-pci (called simply "vhost" from here). >>>> >>>> >>> It's the wrong name. vhost implements only the data path. >>> >> Understood, but vhost+virtio-pci is what I am contrasting, and I use >> "vhost" for short from that point on because I am too lazy to type the >> whole name over and over ;) >> > > If you #define A A+B+C don't expect intelligent conversation afterwards. Fair enough, but I did attempt to declare the definition before using it. Sorry again for the confusion. > >>>> It is not immune to requiring in-kernel addressing support either, but >>>> rather it just does it differently (and its not as you might expect via >>>> qemu). >>>> >>>> Vhost relies on QEMU to render PCI objects to the guest, which the >>>> guest >>>> assigns resources (such as BARs, interrupts, etc). >>>> >>> vhost does not rely on qemu. It relies on its user to handle >>> configuration.
In one important case it's qemu+pci. It could just as >>> well be the lguest launcher. >>> >> I meant vhost=vhost+virtio-pci here. Sorry for the confusion. >> >> The point I am making specifically is that vhost in general relies on >> other in-kernel components to function. I.e. It cannot function without >> having something like the PCI model to build an IO namespace. That >> namespace (in this case, pio addresses+data tuples) are used for the >> in-kernel addressing function under KVM + virtio-pci. >> >> The case of the lguest launcher is a good one to highlight. Yes, you >> can presumably also use lguest with vhost, if the requisite facilities >> are exposed to lguest-bus, and some eventfd based thing like ioeventfd >> is written for the host (if it doesnt exist already). >> >> And when the next virt design "foo" comes out, it can make a "foo-bus" >> model, and implement foo-eventfd on the backend, etc, etc. >> > > It's exactly the same with vbus needing additional connectors for > additional transports. No, see my reply above. > >> Ira can make ira-bus, and ira-eventfd, etc, etc. >> >> Each iteration will invariably introduce duplicated parts of the stack. >> > > Invariably? As in "always" > Use libraries (virtio-shmem.ko, libvhost.so). What do you suppose vbus is? vbus-proxy.ko = virtio-shmem.ko, and you dont need libvhost.so per se since you can just use standard kernel interfaces (like configfs/sysfs). I could create an .so going forward for the new ioctl-based interface, I suppose. > > >>> For the N+1th time, no. vhost is perfectly usable without pci. Can we >>> stop raising and debunking this point? >>> >> Again, I understand vhost is decoupled from PCI, and I don't mean to >> imply anything different. I use PCI as an example here because a) its >> the only working example of vhost today (to my knowledge), and b) you >> have stated in the past that PCI is the only "right" way here, to >> paraphrase. 
Perhaps you no longer feel that way, so I apologize if you >> feel you already recanted your position on PCI and I missed it. >> > > For kvm/x86 pci definitely remains king. For full virtualization, sure. I agree. However, we are talking about PV here. For PV, PCI is not a requirement and is a technical dead-end IMO. KVM seems to be the only virt solution that thinks otherwise (*), but I believe that is primarily a condition of its maturity. I aim to help advance things here. (*) citation: xen has xenbus, lguest has lguest-bus, vmware has some vmi-esq thing (I forget what its called) to name a few. Love 'em or hate 'em, most other hypervisors do something along these lines. I'd like to try to create one for KVM, but to unify them all (at least for the Linux-based host designs). > I was talking about the two > lguest users and Ira. > >> I digress. My point here isn't PCI. The point here is the missing >> component for when PCI is not present. The component that is partially >> satisfied by vbus's devid addressing scheme. If you are going to use >> vhost, and you don't have PCI, you've gotta build something to replace >> it. >> > > Yes, that's why people have keyboards. They'll write that glue code if > they need it. If it turns out to be a hit an people start having virtio > transport module writing parties, they'll figure out a way to share code. Sigh... The party has already started. I tried to invite you months ago... > >>>> All you really need is a simple decode+execute mechanism, and a way to >>>> program it from userspace control. vbus tries to do just that: >>>> commoditize it so all you need is the transport of the control messages >>>> (like DEVCALL()), but the decode+execute itself is reuseable, even >>>> across various environments (like KVM or Iras rig). >>>> >>>> >>> If you think it should be "commodotized", write libvhostconfig.so. >>> >> I know you are probably being facetious here, but what do you propose >> for the parts that must be in-kernel? 
>> > > On the guest side, virtio-shmem.ko can unify the ring access. It > probably makes sense even today. On the host side I eventfd is the > kernel interface and libvhostconfig.so can provide the configuration > when an existing ABI is not imposed. That won't cut it. For one, creating an eventfd is only part of the equation. I.e. you need to have originate/terminate somewhere interesting (and in-kernel, otherwise use tuntap). > >>>> And your argument, I believe, is that vbus allows both to be >>>> implemented >>>> in the kernel (though to reiterate, its optional) and is therefore a >>>> bad >>>> design, so lets discuss that. >>>> >>>> I believe the assertion is that things like config-space are best left >>>> to userspace, and we should only relegate fast-path duties to the >>>> kernel. The problem is that, in my experience, a good deal of >>>> config-space actually influences the fast-path and thus needs to >>>> interact with the fast-path mechanism eventually anyway. >>>> Whats left >>>> over that doesn't fall into this category may cheaply ride on existing >>>> plumbing, so its not like we created something new or unnatural just to >>>> support this subclass of config-space. >>>> >>>> >>> Flexibility is reduced, because changing code in the kernel is more >>> expensive than in userspace, and kernel/user interfaces aren't typically >>> as wide as pure userspace interfaces. Security is reduced, since a bug >>> in the kernel affects the host, while a bug in userspace affects just on >>> guest. >>> >> For a mac-address attribute? Thats all we are really talking about >> here. These points you raise, while true of any kernel code I suppose, >> are a bit of a stretch in this context. >> > > Look at the virtio-net feature negotiation. There's a lot more there > than the MAC address, and it's going to grow. Agreed, but note that makes my point. That feature negotiation almost invariably influences the device-model, not some config-space shim. 
IOW: terminating config-space at some userspace shim is pointless. The model ultimately needs the result of whatever transpires during that negotiation anyway. > >>> Example: feature negotiation. If it happens in userspace, it's easy to >>> limit what features we expose to the guest. >>> >> Its not any harder in the kernel. I do this today. >> >> And when you are done negotiating said features, you will generally have >> to turn around and program the feature into the backend anyway (e.g. >> ioctl() to vhost module). Now you have to maintain some knowledge of >> that particular feature and how to program it in two places. >> > > No, you can leave it enabled unconditionally in vhost (the guest won't > use what it doesn't know about). Perhaps, but IMO sending a "feature-mask"-like object down is far easier than proxying/refactoring config-space and sending that down. I'd still chalk the win here to the vbus model used in AlacrityVM. FWIW: venet has the ability to enable/disable features on the host side, so clearly userspace config-space is not required for the basic premise. > >> Conversely, I am eliminating the (unnecessary) middleman by letting the >> feature negotiating take place directly between the two entities that >> will consume it. >> > > The middleman is necessary, if you want to support live migration Orchestrating live-migration has nothing to do with whether config-space is serviced by a middle-man or not. It shouldn't be required to have device-specific knowledge at all beyond what was initially needed to create/config the object at boot time. IOW, the orchestrator merely needs to know that a device-model object is present and a method to serialize and reconstitute its state (if appropriate). , or to > restrict a guest to a subset of your features. No, that is incorrect. We are not talking about directly exposing something like HW cpuid here. These are all virtual models, and they can optionally expose as much or as little as we want.
They do this under administrative control by userspace, and independent of the location of the config-space handler. > >>> If it happens in the >>> kernel, we need to add an interface to let the kernel know which >>> features it should expose to the guest. >>> >> You need this already either way for both models anyway. As an added >> bonus, vbus has generalized that interface using sysfs attributes, so >> all models are handled in a similar and community accepted way. >> > > vhost doesn't need it since userspace takes care of it. Ok, but see my related reply above. > >>> We also need to add an >>> interface to let userspace know which features were negotiated, if we >>> want to implement live migration. Something fairly trivial bloats >>> rapidly. >>> >> Can you elaborate on the requirements for live-migration? Wouldnt an >> opaque save/restore model work here? (e.g. why does userspace need to be >> able to interpret the in-kernel state, just pass it along as a blob to >> the new instance). >> > > A blob would work, if you commit to forward and backward compatibility > in the kernel side (i.e. an older kernel must be able to accept a blob > from a newer one). Thats understood and acceptable. > I don't like blobs though, they tie you to the implemenetation. What would you suggest otherwise? > >>> As you can see above, userspace needs to be involved in this, and the >>> number of interfaces required is smaller if it's in userspace: >>> >> Actually, no. My experience has been the opposite. Anytime I sat down >> and tried to satisfy your request to move things to the userspace, >> things got ugly and duplicative really quick. I suspect part of the >> reason you may think its easier because you already have part of >> virtio-net in userspace and its surrounding support, but that is not the >> case moving forward for new device types. >> > > I can't comment on your experience, but we'll definitely build on > existing code for new device types. Fair enough. 
I'll build on my experience, either reusing existing code or implementing new designs where appropriate. If you or anyone else wants to join me in my efforts, the more the merrier. > >>> you only >>> need to know which features the kernel supports (they can be enabled >>> unconditionally, just not exposed). >>> >>> Further, some devices are perfectly happy to be implemented in >>> userspace, so we need userspace configuration support anyway. Why >>> reimplement it in the kernel? >>> >> That's fine. vbus is targeted for high-performance IO. So if you have >> a robust userspace (like KVM+QEMU) and low-performance constraints (say, >> for a console or something), put it in userspace and vbus is not >> involved. I don't care. >> > > So now the hypothetical non-pci hypervisor needs to support two busses. No. The hypothetical hypervisor only needs to decide where low-performance devices should live. If that is best served by making/reusing a unique bus for them, I have no specific problem with that. Systems are typically composed of multiple buses anyway. Conversely, there is nothing wrong with putting low-performance devices on a bus designed for high-performance either, and vbus can accommodate both types. The latter is what I would advocate for simplicity's sake, but it's not a requirement. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-24 18:03 ` Gregory Haskins @ 2009-09-25 8:22 ` Avi Kivity 2009-09-25 21:32 ` Gregory Haskins 0 siblings, 1 reply; 83+ messages in thread From: Avi Kivity @ 2009-09-25 8:22 UTC (permalink / raw) To: Gregory Haskins Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/24/2009 09:03 PM, Gregory Haskins wrote: > >> I don't really see how vhost and vbus are different here. vhost expects >> signalling to happen through a couple of eventfds and requires someone >> to supply them and implement kernel support (if needed). vbus requires >> someone to write a connector to provide the signalling implementation. >> Neither will work out-of-the-box when implementing virtio-net over >> falling dominos, for example. >> > I realize in retrospect that my choice of words above implies vbus _is_ > complete, but this is not what I was saying. What I was trying to > convey is that vbus is _more_ complete. Yes, in either case some kind > of glue needs to be written. The difference is that vbus implements > more of the glue generally, and leaves less required to be customized > for each iteration. > No argument there. Since you care about non-virt scenarios and virtio doesn't, naturally vbus is a better fit for them as the code stands. But that's not a strong argument for vbus; instead of adding vbus you could make virtio more friendly to non-virt (there's a limit how far you can take this, not imposed by the code, but by virtio's charter as a virtual device driver framework). > Going back to our stack diagrams, you could think of a vhost solution > like this: > > -------------------------- > | virtio-net > -------------------------- > | virtio-ring > -------------------------- > | virtio-bus > -------------------------- > | ? undefined-1 ? 
> -------------------------- > | vhost > -------------------------- > > and you could think of a vbus solution like this > > -------------------------- > | virtio-net > -------------------------- > | virtio-ring > -------------------------- > | virtio-bus > -------------------------- > | bus-interface > -------------------------- > | ? undefined-2 ? > -------------------------- > | bus-model > -------------------------- > | virtio-net-device (vhost ported to vbus model? :) > -------------------------- > > > So the difference between vhost and vbus in this particular context is > that you need to have "undefined-1" do device discovery/hotswap, > config-space, address-decode/isolation, signal-path routing, memory-path > routing, etc. Today this function is filled by things like virtio-pci, > pci-bus, KVM/ioeventfd, and QEMU for x86. I am not as familiar with > lguest, but presumably it is filled there by components like > virtio-lguest, lguest-bus, lguest.ko, and lguest-launcher. And to use > more contemporary examples, we might have virtio-domino, domino-bus, > domino.ko, and domino-launcher as well as virtio-ira, ira-bus, ira.ko, > and ira-launcher. > > Contrast this to the vbus stack: The bus-X components (when optionally > employed by the connector designer) do device-discovery, hotswap, > config-space, address-decode/isolation, signal-path and memory-path > routing, etc in a general (and pv-centric) way. The "undefined-2" > portion is the "connector", and just needs to convey messages like > "DEVCALL" and "SHMSIGNAL". The rest is handled in other parts of the stack. > > Right. virtio assumes that it's in a virt scenario and that the guest architecture already has enumeration and hotplug mechanisms which it would prefer to use. That happens to be the case for kvm/x86. 
> So to answer your question, the difference is that the part that has to > be customized in vbus should be a fraction of what needs to be > customized with vhost because it defines more of the stack. But if you want to use the native mechanisms, vbus doesn't have any added value. > And, as > alluded to in my diagram, both virtio-net and vhost (with some > modifications to fit into the vbus framework) are potentially > complementary, not competitors. > Only theoretically. The existing installed base would have to be thrown away, or we'd need to support both. >> Without a vbus-connector-falling-dominos, vbus-venet can't do anything >> either. >> > Mostly covered above... > > However, I was addressing your assertion that vhost somehow magically > accomplishes this "container/addressing" function without any specific > kernel support. This is incorrect. I contend that this kernel support > is required and present. The difference is that it's defined elsewhere > (and typically in a transport/arch specific way). > > IOW: You can basically think of the programmed PIO addresses as forming > its "container". Only addresses explicitly added are visible, and > everything else is inaccessible. This whole discussion is merely a > question of what's been generalized versus what needs to be > re-implemented each time. > Sorry, this is too abstract for me. >> vbus doesn't do kvm guest address decoding for the fast path. It's >> still done by ioeventfd. >> > That is not correct. vbus does its own native address decoding in the > fast path, such as here: > > http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=kernel/vbus/client.c;h=e85b2d92d629734866496b67455dd307486e394a;hb=e6cbd4d1decca8e829db3b2b9b6ec65330b379e9#l331 > > All this is after kvm has decoded that vbus is addressed. It can't work without someone outside vbus deciding that.
> In fact, it's actually a simpler design to unify things this way because > you avoid splitting the device model up. Consider how painful the vhost > implementation would be if it didn't already have the userspace > virtio-net to fall-back on. This is effectively what we face for new > devices going forward if that model is to persist. > It doesn't have just virtio-net, it has userspace-based hostplug and a bunch of other devices impemented in userspace. Currently qemu has virtio bindings for pci and syborg (whatever that is), and device models for baloon, block, net, and console, so it seems implementing device support in userspace is not as disasterous as you make it to be. >> Invariably? >> > As in "always" > Refactor instead of duplicating. > >> Use libraries (virtio-shmem.ko, libvhost.so). >> > What do you suppose vbus is? vbus-proxy.ko = virtio-shmem.ko, and you > dont need libvhost.so per se since you can just use standard kernel > interfaces (like configfs/sysfs). I could create an .so going forward > for the new ioctl-based interface, I suppose. > Refactor instead of rewriting. >> For kvm/x86 pci definitely remains king. >> > For full virtualization, sure. I agree. However, we are talking about > PV here. For PV, PCI is not a requirement and is a technical dead-end IMO. > > KVM seems to be the only virt solution that thinks otherwise (*), but I > believe that is primarily a condition of its maturity. I aim to help > advance things here. > > (*) citation: xen has xenbus, lguest has lguest-bus, vmware has some > vmi-esq thing (I forget what its called) to name a few. Love 'em or > hate 'em, most other hypervisors do something along these lines. I'd > like to try to create one for KVM, but to unify them all (at least for > the Linux-based host designs). 
> VMware are throwing VMI away (won't be supported in their new product, and they've sent a patch to rip it off from Linux); Xen has to tunnel xenbus in pci for full virtualization (which is where Windows is, and where Linux will be too once people realize it's faster). lguest is meant as an example hypervisor, not an attempt to take over the world. "PCI is a dead end" could not be more wrong, it's what guests support. An right now you can have a guest using pci to access a mix of userspace-emulated devices, userspace-emulated-but-kernel-accelerated virtio devices, and real host devices. All on one dead-end bus. Try that with vbus. >>> I digress. My point here isn't PCI. The point here is the missing >>> component for when PCI is not present. The component that is partially >>> satisfied by vbus's devid addressing scheme. If you are going to use >>> vhost, and you don't have PCI, you've gotta build something to replace >>> it. >>> >>> >> Yes, that's why people have keyboards. They'll write that glue code if >> they need it. If it turns out to be a hit an people start having virtio >> transport module writing parties, they'll figure out a way to share code. >> > Sigh... The party has already started. I tried to invite you months ago... > I've been voting virtio since 2007. >> On the guest side, virtio-shmem.ko can unify the ring access. It >> probably makes sense even today. On the host side I eventfd is the >> kernel interface and libvhostconfig.so can provide the configuration >> when an existing ABI is not imposed. >> > That won't cut it. For one, creating an eventfd is only part of the > equation. I.e. you need to have originate/terminate somewhere > interesting (and in-kernel, otherwise use tuntap). > vbus needs the same thing so it cancels out. >> Look at the virtio-net feature negotiation. There's a lot more there >> than the MAC address, and it's going to grow. >> > Agreed, but note that makes my point. 
That feature negotiation almost > invariably influences the device-model, not some config-space shim. > IOW: terminating config-space at some userspace shim is pointless. The > model ultimately needs the result of whatever transpires during that > negotiation anyway. > Well, let's see. Can vbus today: - let userspace know which features are available (so it can decide if live migration is possible) - let userspace limit which features are exposed to the guest (so it can make live migration possible among hosts of different capabilities) - let userspace know which features were negotiated (so it can transfer them to the other host during live migration) - let userspace tell the kernel which features were negotiated (when live migration completes, to avoid requiring the guest to re-negotiate) - do all that from an unprivileged process - securely wrt other unprivileged processes ? What are your plans here? -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-25 8:22 ` Avi Kivity @ 2009-09-25 21:32 ` Gregory Haskins 2009-09-27 9:43 ` Avi Kivity 0 siblings, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-25 21:32 UTC (permalink / raw) To: Avi Kivity Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 21865 bytes --] Avi Kivity wrote: > On 09/24/2009 09:03 PM, Gregory Haskins wrote: >> >>> I don't really see how vhost and vbus are different here. vhost expects >>> signalling to happen through a couple of eventfds and requires someone >>> to supply them and implement kernel support (if needed). vbus requires >>> someone to write a connector to provide the signalling implementation. >>> Neither will work out-of-the-box when implementing virtio-net over >>> falling dominos, for example. >>> >> I realize in retrospect that my choice of words above implies vbus _is_ >> complete, but this is not what I was saying. What I was trying to >> convey is that vbus is _more_ complete. Yes, in either case some kind >> of glue needs to be written. The difference is that vbus implements >> more of the glue generally, and leaves less required to be customized >> for each iteration. >> > > > No argument there. Since you care about non-virt scenarios and virtio > doesn't, naturally vbus is a better fit for them as the code stands. Thanks for finally starting to acknowledge there's a benefit, at least. To be more precise, IMO virtio is designed to be a performance oriented ring-based driver interface that supports all types of hypervisors (e.g. shmem based kvm, and non-shmem based Xen). vbus is designed to be a high-performance generic shared-memory interconnect (for rings or otherwise) framework for environments where linux is the underpinning "host" (physical or virtual). 
They are distinctly different, but complementary (the former addresses the part of the front-end, and latter addresses the back-end, and a different part of the front-end). In addition, the kvm-connector used in AlacrityVM's design strives to add value and improve performance via other mechanisms, such as dynamic allocation, interrupt coalescing (thus reducing exit-ratio, which is a serious issue in KVM) and priortizable/nestable signals. Today there is a large performance disparity between what a KVM guest sees and what a native linux application sees on that same host. Just take a look at some of my graphs between "virtio", and "native", for example: http://developer.novell.com/wiki/images/b/b7/31-rc4_throughput.png A dominant vbus design principle is to try to achieve the same IO performance for all "linux applications" whether they be literally userspace applications, or things like KVM vcpus or Ira's physical boards. It also aims to solve problems not previously expressible with current technologies (even virtio), like nested real-time. And even though you repeatedly insist otherwise, the neat thing here is that the two technologies mesh (at least under certain circumstances, like when virtio is deployed on a shared-memory friendly linux backend like KVM). I hope that my stack diagram below depicts that clearly. > But that's not a strong argument for vbus; instead of adding vbus you > could make virtio more friendly to non-virt Actually, it _is_ a strong argument then because adding vbus is what helps makes virtio friendly to non-virt, at least for when performance matters. > (there's a limit how far you > can take this, not imposed by the code, but by virtio's charter as a > virtual device driver framework). 
> >> Going back to our stack diagrams, you could think of a vhost solution >> like this: >> >> -------------------------- >> | virtio-net >> -------------------------- >> | virtio-ring >> -------------------------- >> | virtio-bus >> -------------------------- >> | ? undefined-1 ? >> -------------------------- >> | vhost >> -------------------------- >> >> and you could think of a vbus solution like this >> >> -------------------------- >> | virtio-net >> -------------------------- >> | virtio-ring >> -------------------------- >> | virtio-bus >> -------------------------- >> | bus-interface >> -------------------------- >> | ? undefined-2 ? >> -------------------------- >> | bus-model >> -------------------------- >> | virtio-net-device (vhost ported to vbus model? :) >> -------------------------- >> >> >> So the difference between vhost and vbus in this particular context is >> that you need to have "undefined-1" do device discovery/hotswap, >> config-space, address-decode/isolation, signal-path routing, memory-path >> routing, etc. Today this function is filled by things like virtio-pci, >> pci-bus, KVM/ioeventfd, and QEMU for x86. I am not as familiar with >> lguest, but presumably it is filled there by components like >> virtio-lguest, lguest-bus, lguest.ko, and lguest-launcher. And to use >> more contemporary examples, we might have virtio-domino, domino-bus, >> domino.ko, and domino-launcher as well as virtio-ira, ira-bus, ira.ko, >> and ira-launcher. >> >> Contrast this to the vbus stack: The bus-X components (when optionally >> employed by the connector designer) do device-discovery, hotswap, >> config-space, address-decode/isolation, signal-path and memory-path >> routing, etc in a general (and pv-centric) way. The "undefined-2" >> portion is the "connector", and just needs to convey messages like >> "DEVCALL" and "SHMSIGNAL". The rest is handled in other parts of the >> stack. >> >> > > Right. 
virtio assumes that it's in a virt scenario and that the guest > architecture already has enumeration and hotplug mechanisms which it > would prefer to use. That happens to be the case for kvm/x86. No, virtio doesn't assume that. It's stack provides the "virtio-bus" abstraction and what it does assume is that it will be wired up to something underneath. Kvm/x86 conveniently has pci, so the virtio-pci adapter was created to reuse much of that facility. For other things like lguest and s360, something new had to be created underneath to make up for the lack of pci-like support. vbus, in conjunction with the kvm-connector, tries to unify that process a little more by creating a PV-optimized bus. The idea is that it can be reused in that situation instead of creating a new hypervisor specific bus each time. It's also designed for high-performance, so you get that important trait for free simply by tying into it. > >> So to answer your question, the difference is that the part that has to >> be customized in vbus should be a fraction of what needs to be >> customized with vhost because it defines more of the stack. > > But if you want to use the native mechanisms, vbus doesn't have any > added value. First of all, thats incorrect. If you want to use the "native" mechanisms (via the way the vbus-connector is implemented, for instance) you at least still have the benefit that the backend design is more broadly re-useable in more environments (like non-virt, for instance), because vbus does a proper job of defining the requisite layers/abstractions compared to vhost. So it adds value even in that situation. Second of all, with PV there is no such thing as "native". It's software so it can be whatever we want. Sure, you could argue that the guest may have built-in support for something like PCI protocol. However, PCI protocol itself isn't suitable for high-performance PV out of the can. 
So you will therefore invariably require new software layers on top anyway, even if part of the support is already included. And lastly, why would you _need_ to use the so called "native" mechanism? The short answer is, "you don't". Any given system (guest or bare-metal) already have a wide-range of buses (try running "tree /sys/bus" in Linux). More importantly, the concept of adding new buses is widely supported in both the Windows and Linux driver model (and probably any other guest-type that matters). Therefore, despite claims to the contrary, its not hard or even unusual to add a new bus to the mix. In summary, vbus is simply one more bus of many, purpose built to support high-end IO in a virt-like model, giving controlled access to the linux-host underneath it. You can write a high-performance layer below the OS bus-model (vbus), or above it (virtio-pci) but either way you are modifying the stack to add these capabilities, so we might as well try to get this right. With all due respect, you are making a big deal out of a minor issue. > >> And, as >> eluded to in my diagram, both virtio-net and vhost (with some >> modifications to fit into the vbus framework) are potentially >> complementary, not competitors. >> > > Only theoretically. The existing installed base would have to be thrown > away "Thrown away" is pure hyperbole. The installed base, worse case, needs to load a new driver for a missing device. This is pretty much how every machine works today, anyway. And if loading a driver was actually some insurmountable hurdle, as its sometimes implied (but its not in reality), you can alternatively make vbus look like a legacy bus if you are willing to sacrifice some of features, like exit-ratio reduction and priority. 
FWIW: AlacrityVM isn't willing to sacrifice those features, so we will provide a Linux and Windows driver for explicit bus support, as well as open-specs and community development assistance to any other guest that wants to add support in the future. > or we'd need to support both. > > No matter what model we talk about, there's always going to be a "both" since the userspace virtio models are probably not going to go away (nor should they). > > >>> Without a vbus-connector-falling-dominos, vbus-venet can't do anything >>> either. >>> >> Mostly covered above... >> >> However, I was addressing your assertion that vhost somehow magically >> accomplishes this "container/addressing" function without any specific >> kernel support. This is incorrect. I contend that this kernel support >> is required and present. The difference is that its defined elsewhere >> (and typically in a transport/arch specific way). >> >> IOW: You can basically think of the programmed PIO addresses as forming >> its "container". Only addresses explicitly added are visible, and >> everything else is inaccessible. This whole discussion is merely a >> question of what's been generalized verses what needs to be >> re-implemented each time. >> > > Sorry, this is too abstract for me. With all due respect, understanding my point above is required to have any kind of meaningful discussion here. > > > >>> vbus doesn't do kvm guest address decoding for the fast path. It's >>> still done by ioeventfd. >>> >> That is not correct. vbus does its own native address decoding in the >> fast path, such as here: >> >> http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=kernel/vbus/client.c;h=e85b2d92d629734866496b67455dd307486e394a;hb=e6cbd4d1decca8e829db3b2b9b6ec65330b379e9#l331 >> >> >> > > All this is after kvm has decoded that vbus is addresses. It can't work > without someone outside vbus deciding that. How the connector message is delivered is really not relevant. 
Some architectures will simply deliver the message point-to-point (like the original hypercall design for KVM, or something like Ira's rig), and some will need additional demuxing (like pci-bridge/pio based KVM). It's an implementation detail of the connector. However, the real point here is that something needs to establish a scoped namespace mechanism, add items to that namespace, and advertise the presence of the items to the guest. vbus has this facility built in to its stack. vhost doesn't, so it must come from elsewhere. > >> In fact, it's actually a simpler design to unify things this way because >> you avoid splitting the device model up. Consider how painful the vhost >> implementation would be if it didn't already have the userspace >> virtio-net to fall-back on. This is effectively what we face for new >> devices going forward if that model is to persist. >> > > > It doesn't have just virtio-net, it has userspace-based hostplug vbus has hotplug too: mkdir and rmdir As an added bonus, its device-model is modular. A developer can write a new device model, compile it, insmod it to the host kernel, hotplug it to the running guest with mkdir/ln, and the come back out again (hotunplug with rmdir, rmmod, etc). They may do this all without taking the guest down, and while eating QEMU based IO solutions for breakfast performance wise. Afaict, qemu can't do either of those things. > and a bunch of other devices impemented in userspace. Thats fine. I am primarily interested in the high-performance components, so most of those other items can stay there in userspace if that is their ideal location. > Currently qemu has > virtio bindings for pci and syborg (whatever that is), and device models > for baloon, block, net, and console, so it seems implementing device > support in userspace is not as disasterous as you make it to be. I intentionally qualified "device" with "new" in my statement. 
And in that context I was talking about ultimately developing/supporting in-kernel models, not pure legacy userspace ones. I have no doubt the implementation of the original userpsace devices was not a difficult or horrific endeavor. Requiring new models to be implemented (at least) twice is a poor design IMO, however. Requiring them to split such a minor portion of their functionality (like read-only attributes) is a poor design, too. I have already demonstrated there are other ways to achieve the same high-performance goals without requiring two models developed/tested each time and for each manager. For the times I went and tried to satisfy your request in this manner, developing the code and managing the resources in two places, for lack of a better description, made me want to wretch. So I gave up, resolved that my original design was better, and hoped that I could convince you and the community of the same. > >>> Invariably? >>> >> As in "always" >> > > Refactor instead of duplicating. There is no duplicating. vbus has no equivalent today as virtio doesn't define these layers. > >> >>> Use libraries (virtio-shmem.ko, libvhost.so). >>> >> What do you suppose vbus is? vbus-proxy.ko = virtio-shmem.ko, and you >> dont need libvhost.so per se since you can just use standard kernel >> interfaces (like configfs/sysfs). I could create an .so going forward >> for the new ioctl-based interface, I suppose. >> > > Refactor instead of rewriting. There is no rewriting. vbus has no equivalent today as virtio doesn't define these layers. By your own admission, you said if you wanted that capability, use a library. What I think you are not understanding is vbus _is_ that library. So what is the problem, exactly? > > > >>> For kvm/x86 pci definitely remains king. >>> >> For full virtualization, sure. I agree. However, we are talking about >> PV here. For PV, PCI is not a requirement and is a technical dead-end >> IMO. 
>> >> KVM seems to be the only virt solution that thinks otherwise (*), but I >> believe that is primarily a condition of its maturity. I aim to help >> advance things here. >> >> (*) citation: xen has xenbus, lguest has lguest-bus, vmware has some >> vmi-esq thing (I forget what its called) to name a few. Love 'em or >> hate 'em, most other hypervisors do something along these lines. I'd >> like to try to create one for KVM, but to unify them all (at least for >> the Linux-based host designs). >> > > VMware are throwing VMI away (won't be supported in their new product, > and they've sent a patch to rip it off from Linux); vmware only cares about x86 iiuc, so probably not a good example. > Xen has to tunnel > xenbus in pci for full virtualization (which is where Windows is, and > where Linux will be too once people realize it's faster). lguest is > meant as an example hypervisor, not an attempt to take over the world. So pick any other hypervisor, and the situation is often similar. > > "PCI is a dead end" could not be more wrong, it's what guests support. It's what _some_ guests support. Even for the guests that support it, it's not well designed for PV. Therefore, you have to do a bunch of dancing and waste resources on top to squeeze every last drop of performance out of your platform. In addition, it has a bunch of baggage that goes with it that is not necessary to do the job in a software environment. It is therefore burdensome to recreate if you don't already have something to leverage, like QEMU, just for the sake of creating the illusion that its there. Sounds pretty dead to me, sorry. We don't need it. Alternatively, you can just try to set a stake in the ground for looking forward and fixing those PV-specific problems hopefully once and for all, like vbus and the kvm-connector tries to do. Sure, there will be some degree of pain first as we roll out the subsystem and deploy support, but thats true for lots of things. It's simply a platform investment. 
> An right now you can have a guest using pci to access a mix of > userspace-emulated devices, userspace-emulated-but-kernel-accelerated > virtio devices, and real host devices. All on one dead-end bus. Try > that with vbus. vbus is not interested in userspace devices. The charter is to provide facilities for utilizing the host linux kernel's IO capabilities in the most efficient, yet safe, manner possible. Those devices that fit outside that charter can ride on legacy mechanisms if that suits them best. > > >>>> I digress. My point here isn't PCI. The point here is the missing >>>> component for when PCI is not present. The component that is partially >>>> satisfied by vbus's devid addressing scheme. If you are going to use >>>> vhost, and you don't have PCI, you've gotta build something to replace >>>> it. >>>> >>>> >>> Yes, that's why people have keyboards. They'll write that glue code if >>> they need it. If it turns out to be a hit an people start having virtio >>> transport module writing parties, they'll figure out a way to share >>> code. >>> >> Sigh... The party has already started. I tried to invite you months >> ago... >> > > I've been voting virtio since 2007. That doesn't have much to do with whats underneath it, since it doesn't define these layers. See my stack diagram's for details. > >>> On the guest side, virtio-shmem.ko can unify the ring access. It >>> probably makes sense even today. On the host side I eventfd is the >>> kernel interface and libvhostconfig.so can provide the configuration >>> when an existing ABI is not imposed. >>> >> That won't cut it. For one, creating an eventfd is only part of the >> equation. I.e. you need to have originate/terminate somewhere >> interesting (and in-kernel, otherwise use tuntap). >> > > vbus needs the same thing so it cancels out. No, it does not. vbus just needs a relatively simple single message pipe between the guest and host (think "hypercall tunnel", if you will). 
Per queue/device addressing is handled by the same conceptual namespace as the one that would trigger eventfds in the model you mention. And that namespace is built in to the vbus stack, and objects are registered automatically as they are created. Contrast that to vhost, which requires some other kernel interface to exist, and to be managed manually for each object that is created. Your libvhostconfig would need to somehow know how to perform this registration operation, and there would have to be something in the kernel to receive it, presumably on a per platform basis. Solving this problem generally would probably end up looking eerily like vbus, because thats what vbus does. > >>> Look at the virtio-net feature negotiation. There's a lot more there >>> than the MAC address, and it's going to grow. >>> >> Agreed, but note that makes my point. That feature negotiation almost >> invariably influences the device-model, not some config-space shim. >> IOW: terminating config-space at some userspace shim is pointless. The >> model ultimately needs the result of whatever transpires during that >> negotiation anyway. >> > > Well, let's see. Can vbus today: > > - let userspace know which features are available (so it can decide if > live migration is possible) yes, its in sysfs. > - let userspace limit which features are exposed to the guest (so it can > make live migration possible among hosts of different capabilities) yes, its in sysfs. > - let userspace know which features were negotiated (so it can transfer > them to the other host during live migration) no, but we can easily add ->save()/->restore() to the model going forward, and the negotiated features are just a subcomponent if its serialized stream. > - let userspace tell the kernel which features were negotiated (when > live migration completes, to avoid requiring the guest to re-negotiate) that would be the function of the ->restore() deserializer. 
> - do all that from an unprivileged process yes, in the upcoming alacrityvm v0.3 with the ioctl based control plane. > - securely wrt other unprivileged processes yes, same mechanism plus it has a fork-inheritance model. Bottom line: vbus isn't done, especially w.r.t. live-migration... but that is not a valid argument against the idea if you believe in release-early/release-often. kvm wasn't (isn't) done either when it was proposed/merged. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-25 21:32 ` Gregory Haskins @ 2009-09-27 9:43 ` Avi Kivity 2009-09-30 20:04 ` Gregory Haskins 0 siblings, 1 reply; 83+ messages in thread From: Avi Kivity @ 2009-09-27 9:43 UTC (permalink / raw) To: Gregory Haskins Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/26/2009 12:32 AM, Gregory Haskins wrote: >>> >>> I realize in retrospect that my choice of words above implies vbus _is_ >>> complete, but this is not what I was saying. What I was trying to >>> convey is that vbus is _more_ complete. Yes, in either case some kind >>> of glue needs to be written. The difference is that vbus implements >>> more of the glue generally, and leaves less required to be customized >>> for each iteration. >>> >>> >> >> No argument there. Since you care about non-virt scenarios and virtio >> doesn't, naturally vbus is a better fit for them as the code stands. >> > Thanks for finally starting to acknowledge there's a benefit, at least. > I think I've mentioned vbus' finer grained layers as helpful here, though I doubt the value of this. Hypervisors are added rarely, while devices and drivers are added (and modified) much more often. I don't buy the anything-to-anything promise. > To be more precise, IMO virtio is designed to be a performance oriented > ring-based driver interface that supports all types of hypervisors (e.g. > shmem based kvm, and non-shmem based Xen). vbus is designed to be a > high-performance generic shared-memory interconnect (for rings or > otherwise) framework for environments where linux is the underpinning > "host" (physical or virtual). They are distinctly different, but > complementary (the former addresses the part of the front-end, and > latter addresses the back-end, and a different part of the front-end). > They're not truly complementary since they're incompatible. 
A 2.6.27 guest, or Windows guest with the existing virtio drivers, won't work over vbus. Further, non-shmem virtio can't work over vbus. Since virtio is guest-oriented and host-agnostic, it can't ignore non-shared-memory hosts (even though it's unlikely virtio will be adopted there). > In addition, the kvm-connector used in AlacrityVM's design strives to > add value and improve performance via other mechanisms, such as dynamic > allocation, interrupt coalescing (thus reducing exit-ratio, which is a > serious issue in KVM) Do you have measurements of inter-interrupt coalescing rates (excluding intra-interrupt coalescing). > and priortizable/nestable signals. > That doesn't belong in a bus. > Today there is a large performance disparity between what a KVM guest > sees and what a native linux application sees on that same host. Just > take a look at some of my graphs between "virtio", and "native", for > example: > > http://developer.novell.com/wiki/images/b/b7/31-rc4_throughput.png > That's a red herring. The problem is not with virtio as an ABI, but with its implementation in userspace. vhost-net should offer equivalent performance to vbus. > A dominant vbus design principle is to try to achieve the same IO > performance for all "linux applications" whether they be literally > userspace applications, or things like KVM vcpus or Ira's physical > boards. It also aims to solve problems not previously expressible with > current technologies (even virtio), like nested real-time. > > And even though you repeatedly insist otherwise, the neat thing here is > that the two technologies mesh (at least under certain circumstances, > like when virtio is deployed on a shared-memory friendly linux backend > like KVM). I hope that my stack diagram below depicts that clearly. > Right, when you ignore the points where they don't fit, it's a perfect mesh. 
>> But that's not a strong argument for vbus; instead of adding vbus you >> could make virtio more friendly to non-virt >> > Actually, it _is_ a strong argument then because adding vbus is what > helps makes virtio friendly to non-virt, at least for when performance > matters. > As vhost-net shows, you can do that without vbus and without breaking compatibility. >> Right. virtio assumes that it's in a virt scenario and that the guest >> architecture already has enumeration and hotplug mechanisms which it >> would prefer to use. That happens to be the case for kvm/x86. >> > No, virtio doesn't assume that. It's stack provides the "virtio-bus" > abstraction and what it does assume is that it will be wired up to > something underneath. Kvm/x86 conveniently has pci, so the virtio-pci > adapter was created to reuse much of that facility. For other things > like lguest and s360, something new had to be created underneath to make > up for the lack of pci-like support. > Right, I was wrong there. But it does allow you to have a 1:1 mapping between native devices and virtio devices. >>> So to answer your question, the difference is that the part that has to >>> be customized in vbus should be a fraction of what needs to be >>> customized with vhost because it defines more of the stack. >>> >> But if you want to use the native mechanisms, vbus doesn't have any >> added value. >> > First of all, thats incorrect. If you want to use the "native" > mechanisms (via the way the vbus-connector is implemented, for instance) > you at least still have the benefit that the backend design is more > broadly re-useable in more environments (like non-virt, for instance), > because vbus does a proper job of defining the requisite > layers/abstractions compared to vhost. So it adds value even in that > situation. > Maybe. If vhost-net isn't sufficient I'm sure there will be patches sent. > Second of all, with PV there is no such thing as "native". 
It's > software so it can be whatever we want. Sure, you could argue that the > guest may have built-in support for something like PCI protocol. > However, PCI protocol itself isn't suitable for high-performance PV out > of the can. So you will therefore invariably require new software > layers on top anyway, even if part of the support is already included. > Of course there is such a thing as native, a pci-ready guest has tons of support built into it that doesn't need to be retrofitted. Since practically everyone (including Xen) does their paravirt drivers atop pci, the claim that pci isn't suitable for high performance is incorrect. > And lastly, why would you _need_ to use the so called "native" > mechanism? The short answer is, "you don't". Any given system (guest > or bare-metal) already have a wide-range of buses (try running "tree > /sys/bus" in Linux). More importantly, the concept of adding new buses > is widely supported in both the Windows and Linux driver model (and > probably any other guest-type that matters). Therefore, despite claims > to the contrary, its not hard or even unusual to add a new bus to the mix. > The short answer is "compatibility". > In summary, vbus is simply one more bus of many, purpose built to > support high-end IO in a virt-like model, giving controlled access to > the linux-host underneath it. You can write a high-performance layer > below the OS bus-model (vbus), or above it (virtio-pci) but either way > you are modifying the stack to add these capabilities, so we might as > well try to get this right. > > With all due respect, you are making a big deal out of a minor issue. > It's not minor to me. >>> And, as >>> eluded to in my diagram, both virtio-net and vhost (with some >>> modifications to fit into the vbus framework) are potentially >>> complementary, not competitors. >>> >>> >> Only theoretically. The existing installed base would have to be thrown >> away >> > "Thrown away" is pure hyperbole. 
The installed base, worse case, needs > to load a new driver for a missing device. Yes, we all know how fun this is. Especially if the device changed is your boot disk. You may not care about the pain caused to users, but I do, so I will continue to insist on compatibility. >> or we'd need to support both. >> >> >> > No matter what model we talk about, there's always going to be a "both" > since the userspace virtio models are probably not going to go away (nor > should they). > virtio allows you to have userspace-only, kernel-only, or start-with-userspace-and-move-to-kernel-later, all transparent to the guest. In many cases we'll stick with userspace-only. >> All this is after kvm has decoded that vbus is addresses. It can't work >> without someone outside vbus deciding that. >> > How the connector message is delivered is really not relevant. Some > architectures will simply deliver the message point-to-point (like the > original hypercall design for KVM, or something like Ira's rig), and > some will need additional demuxing (like pci-bridge/pio based KVM). > It's an implementation detail of the connector. > > However, the real point here is that something needs to establish a > scoped namespace mechanism, add items to that namespace, and advertise > the presence of the items to the guest. vbus has this facility built in > to its stack. vhost doesn't, so it must come from elsewhere. > So we have: vbus needs a connector, vhost needs a connector. vbus doesn't need userspace to program the addresses (but does need userspace to instantiate the devices and to program the bus address decode), vhost needs userspace to instantiate the devices and program the addresses. >>> In fact, it's actually a simpler design to unify things this way because >>> you avoid splitting the device model up. Consider how painful the vhost >>> implementation would be if it didn't already have the userspace >>> virtio-net to fall-back on. 
This is effectively what we face for new >>> devices going forward if that model is to persist. >>> >>> >> >> It doesn't have just virtio-net, it has userspace-based hostplug >> > vbus has hotplug too: mkdir and rmdir > Does that work from nonprivileged processes? Does it work on Windows? > As an added bonus, its device-model is modular. A developer can write a > new device model, compile it, insmod it to the host kernel, hotplug it > to the running guest with mkdir/ln, and the come back out again > (hotunplug with rmdir, rmmod, etc). They may do this all without taking > the guest down, and while eating QEMU based IO solutions for breakfast > performance wise. > > Afaict, qemu can't do either of those things. > We've seen that herring before, and it's redder than ever. >> Refactor instead of duplicating. >> > There is no duplicating. vbus has no equivalent today as virtio doesn't > define these layers. > So define them if they're missing. >>> >>> >>>> Use libraries (virtio-shmem.ko, libvhost.so). >>>> >>>> >>> What do you suppose vbus is? vbus-proxy.ko = virtio-shmem.ko, and you >>> dont need libvhost.so per se since you can just use standard kernel >>> interfaces (like configfs/sysfs). I could create an .so going forward >>> for the new ioctl-based interface, I suppose. >>> >>> >> Refactor instead of rewriting. >> > There is no rewriting. vbus has no equivalent today as virtio doesn't > define these layers. > > By your own admission, you said if you wanted that capability, use a > library. What I think you are not understanding is vbus _is_ that > library. So what is the problem, exactly? > It's not compatible. If you were truly worried about code duplication in virtio, you'd refactor it to remove the duplication, without affecting existing guests. >>>> For kvm/x86 pci definitely remains king. >>>> >>>> >>> For full virtualization, sure. I agree. However, we are talking about >>> PV here. For PV, PCI is not a requirement and is a technical dead-end >>> IMO. 
>>> >>> KVM seems to be the only virt solution that thinks otherwise (*), but I >>> believe that is primarily a condition of its maturity. I aim to help >>> advance things here. >>> >>> (*) citation: xen has xenbus, lguest has lguest-bus, vmware has some >>> vmi-esq thing (I forget what its called) to name a few. Love 'em or >>> hate 'em, most other hypervisors do something along these lines. I'd >>> like to try to create one for KVM, but to unify them all (at least for >>> the Linux-based host designs). >>> >>> >> VMware are throwing VMI away (won't be supported in their new product, >> and they've sent a patch to rip it off from Linux); >> > vmware only cares about x86 iiuc, so probably not a good example. > Well, you brought it up. Between you and me, I only care about x86 too. >> Xen has to tunnel >> xenbus in pci for full virtualization (which is where Windows is, and >> where Linux will be too once people realize it's faster). lguest is >> meant as an example hypervisor, not an attempt to take over the world. >> > So pick any other hypervisor, and the situation is often similar. > The situation is often pci. > >> An right now you can have a guest using pci to access a mix of >> userspace-emulated devices, userspace-emulated-but-kernel-accelerated >> virtio devices, and real host devices. All on one dead-end bus. Try >> that with vbus. >> > vbus is not interested in userspace devices. The charter is to provide > facilities for utilizing the host linux kernel's IO capabilities in the > most efficient, yet safe, manner possible. Those devices that fit > outside that charter can ride on legacy mechanisms if that suits them best. > vbus isn't, but I am. I would prefer not to have to expose implementation decisions (kernel vs userspace) to the guest (vbus vs pci). >>> That won't cut it. For one, creating an eventfd is only part of the >>> equation. I.e. you need to have originate/terminate somewhere >>> interesting (and in-kernel, otherwise use tuntap). 
>>> >>> >> vbus needs the same thing so it cancels out. >> > No, it does not. vbus just needs a relatively simple single message > pipe between the guest and host (think "hypercall tunnel", if you will). > That's ioeventfd. So far so similar. > Per queue/device addressing is handled by the same conceptual namespace > as the one that would trigger eventfds in the model you mention. And > that namespace is built in to the vbus stack, and objects are registered > automatically as they are created. > > Contrast that to vhost, which requires some other kernel interface to > exist, and to be managed manually for each object that is created. Your > libvhostconfig would need to somehow know how to perform this > registration operation, and there would have to be something in the > kernel to receive it, presumably on a per platform basis. Solving this > problem generally would probably end up looking eerily like vbus, > because thats what vbus does. > vbus devices aren't magically instantiated. Userspace needs to instantiate them too. Sure, there's less work on the host side since you're using vbus instead of the native interface, but more work on the guest side since you're using vbus instead of the native interface. >> Well, let's see. Can vbus today: >> >> - let userspace know which features are available (so it can decide if >> live migration is possible) >> > yes, its in sysfs. > > >> - let userspace limit which features are exposed to the guest (so it can >> make live migration possible among hosts of different capabilities) >> > yes, its in sysfs. > Per-device? non-privileged-user capable? >> - let userspace know which features were negotiated (so it can transfer >> them to the other host during live migration) >> > no, but we can easily add ->save()/->restore() to the model going > forward, and the negotiated features are just a subcomponent if its > serialized stream. 
> > >> - let userspace tell the kernel which features were negotiated (when >> live migration completes, to avoid requiring the guest to re-negotiate) >> > that would be the function of the ->restore() deserializer. > > >> - do all that from an unprivileged process >> > yes, in the upcoming alacrityvm v0.3 with the ioctl based control plane. > Ah, so you have two control planes. > Bottom line: vbus isn't done, especially w.r.t. live-migration..but that > is not an valid argument against the idea if you believe in > release-early/release-often. kvm wasn't (isn't) done either when it was > proposed/merged. > > kvm didn't have an existing counterpart in Linux when it was proposed/merged. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-27 9:43 ` Avi Kivity @ 2009-09-30 20:04 ` Gregory Haskins 2009-10-01 8:34 ` Avi Kivity 0 siblings, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-09-30 20:04 UTC (permalink / raw) To: Avi Kivity Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 35235 bytes --] Avi Kivity wrote: > On 09/26/2009 12:32 AM, Gregory Haskins wrote: >>>> >>>> I realize in retrospect that my choice of words above implies vbus _is_ >>>> complete, but this is not what I was saying. What I was trying to >>>> convey is that vbus is _more_ complete. Yes, in either case some kind >>>> of glue needs to be written. The difference is that vbus implements >>>> more of the glue generally, and leaves less required to be customized >>>> for each iteration. >>>> >>>> >>> >>> No argument there. Since you care about non-virt scenarios and virtio >>> doesn't, naturally vbus is a better fit for them as the code stands. >>> >> Thanks for finally starting to acknowledge there's a benefit, at least. >> > > I think I've mentioned vbus' finer grained layers as helpful here, > though I doubt the value of this. Hypervisors are added rarely, while > devices and drivers are added (and modified) much more often. I don't > buy the anything-to-anything promise. The ease in which a new hypervisor should be able to integrate into the stack is only one of vbus's many benefits. > >> To be more precise, IMO virtio is designed to be a performance oriented >> ring-based driver interface that supports all types of hypervisors (e.g. >> shmem based kvm, and non-shmem based Xen). vbus is designed to be a >> high-performance generic shared-memory interconnect (for rings or >> otherwise) framework for environments where linux is the underpinning >> "host" (physical or virtual). 
They are distinctly different, but >> complementary (the former addresses the part of the front-end, and >> latter addresses the back-end, and a different part of the front-end). >> > > They're not truly complementary since they're incompatible. No, that is incorrect. Not to be rude, but for clarity: Complementary \Com`ple*men"ta*ry\, a. Serving to fill out or to complete; as, complementary numbers. [1913 Webster] Citation: www.dict.org IOW: Something being complementary has nothing to do with guest/host binary compatibility. virtio-pci and virtio-vbus are both equally complementary to virtio since they fill in the bottom layer of the virtio stack. So yes, vbus is truly complementary to virtio afaict. > A 2.6.27 guest, or Windows guest with the existing virtio drivers, won't work > over vbus. Binary compatibility with existing virtio drivers, while nice to have, is not a specific requirement nor goal. We will simply load an updated KMP/MSI into those guests and they will work again. As previously discussed, this is how more or less any system works today. It's like we are removing an old adapter card and adding a new one to "uprev the silicon". > Further, non-shmem virtio can't work over vbus. Actually I misspoke earlier when I said virtio works over non-shmem. Thinking about it some more, both virtio and vbus fundamentally require shared-memory, since sharing their metadata concurrently on both sides is their raison d'être. The difference is that virtio utilizes a pre-translation/mapping (via ->add_buf) from the guest side. OTOH, vbus uses a post translation scheme (via memctx) from the host-side. If anything, vbus is actually more flexible because it doesn't assume the entire guest address space is directly mappable. In summary, your statement is incorrect (though it is my fault for putting that idea in your head). 
> Since > virtio is guest-oriented and host-agnostic, it can't ignore > non-shared-memory hosts (even though it's unlikely virtio will be > adopted there) Well, to be fair no one said it has to ignore them. Either virtio-vbus transport is present and available to the virtio stack, or it isn't. If its present, it may or may not publish objects for consumption. Providing a virtio-vbus transport in no way limits or degrades the existing capabilities of the virtio stack. It only enhances them. I digress. The whole point is moot since I realized that the non-shmem distinction isn't accurate anyway. They both require shared-memory for the metadata, and IIUC virtio requires the entire address space to be mappable whereas vbus only assumes the metadata is. > >> In addition, the kvm-connector used in AlacrityVM's design strives to >> add value and improve performance via other mechanisms, such as dynamic >> allocation, interrupt coalescing (thus reducing exit-ratio, which is a >> serious issue in KVM) > > Do you have measurements of inter-interrupt coalescing rates (excluding > intra-interrupt coalescing). I actually do not have a rig setup to explicitly test inter-interrupt rates at the moment. Once things stabilize for me, I will try to re-gather some numbers here. Last time I looked, however, there were some decent savings for inter as well. Inter rates are interesting because they are what tends to ramp up with IO load more than intra since guest interrupt mitigation techniques like NAPI often quell intra-rates naturally. This is especially true for data-center, cloud, hpc-grid, etc, kind of workloads (vs vanilla desktops, etc) that tend to have multiple IO ports (multi-homed nics, disk-io, etc). Those various ports tend to be workload-related to one another (e.g. 3-tier web stack may use multi-homed network and disk-io at the same time, trigged by one IO event). 
An interesting thing here is that you don't even need a fancy multi-homed setup to see the effects of my exit-ratio reduction work: even single port configurations suffer from the phenomenon since many devices have multiple signal-flows (e.g. network adapters tend to have at least 3 flows: rx-ready, tx-complete, and control-events (link-state, etc). Whats worse, is that the flows often are indirectly related (for instance, many host adapters will free tx skbs during rx operations, so you tend to get bursts of tx-completes at the same time as rx-ready. If the flows map 1:1 with IDT, they will suffer the same problem. In any case, here is an example run of a simple single-homed guest over standard GigE. Whats interesting here is that .qnotify to .notify ratio, as this is the interrupt-to-signal ratio. In this case, its 170047/151918, which comes out to about 11% savings in interrupt injections: vbus-guest:/home/ghaskins # netperf -H dev TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to dev.laurelwood.net (192.168.1.10) port 0 AF_INET Recv Send Send Socket Socket Message Elapsed Size Size Size Time Throughput bytes bytes bytes secs. 
10^6bits/sec 1048576 16384 16384 10.01 940.77 vbus-guest:/home/ghaskins # cat /sys/kernel/debug/pci-to-vbus-bridge .events : 170048 .qnotify : 151918 .qinject : 0 .notify : 170047 .inject : 18238 .bridgecalls : 18 .buscalls : 12 vbus-guest:/home/ghaskins # cat /proc/interrupts CPU0 0: 87 IO-APIC-edge timer 1: 6 IO-APIC-edge i8042 4: 733 IO-APIC-edge serial 6: 2 IO-APIC-edge floppy 7: 0 IO-APIC-edge parport0 8: 0 IO-APIC-edge rtc0 9: 0 IO-APIC-fasteoi acpi 10: 0 IO-APIC-fasteoi virtio1 12: 90 IO-APIC-edge i8042 14: 3041 IO-APIC-edge ata_piix 15: 1008 IO-APIC-edge ata_piix 24: 151933 PCI-MSI-edge vbus 25: 0 PCI-MSI-edge virtio0-config 26: 190 PCI-MSI-edge virtio0-input 27: 28 PCI-MSI-edge virtio0-output NMI: 0 Non-maskable interrupts LOC: 9854 Local timer interrupts SPU: 0 Spurious interrupts CNT: 0 Performance counter interrupts PND: 0 Performance pending work RES: 0 Rescheduling interrupts CAL: 0 Function call interrupts TLB: 0 TLB shootdowns TRM: 0 Thermal event interrupts THR: 0 Threshold APIC interrupts MCE: 0 Machine check exceptions MCP: 1 Machine check polls ERR: 0 MIS: 0 Its important to note here that we are actually looking at the interrupt rate, not the exit rate (which is usually a multiple of the interrupt rate, since you have to factor in as many as three exits per interrupt (IPI, window, EOI). Therefore we saved about 18k interrupts in this 10 second burst, but we may have actually saved up to 54k exits in the process. This is only over a 10 second window at GigE rates, so YMMV. These numbers get even more dramatic on higher end hardware, but I haven't had a chance to generate new numbers yet. Looking at some external stats paints an even bleaker picture: "exits" as reported by kvm_stat for virtio-pci based virtio-net tip the scales at 65k/s vs 36k/s for vbus based venet. And virtio is consuming ~30% of my quad-core's cpu, vs 19% for venet during the test. 
Its hard to know which innovation or innovations may be responsible for the entire reduction, but certainly the interrupt-to-signal ratio mentioned above is probably helping. The even worse news for 1:1 models is that the ratio of exits-per-interrupt climbs with load (exactly when it hurts the most) since that is when the probability that the vcpu will need all three exits is the highest. > >> and priortizable/nestable signals. >> > > That doesn't belong in a bus. Everyone is of course entitled to an opinion, but the industry as a whole would disagree with you. Signal path routing (1:1, aggregated, etc) is at the discretion of the bus designer. Most buses actually do _not_ support 1:1 with IDT (think USB, SCSI, IDE, etc). PCI is somewhat of an outlier in that regard afaict. Its actually a nice feature of PCI when its used within its design spec (HW). For SW/PV, 1:1 suffers from, among other issues, that "triple-exit scaling" issue in the signal path I mentioned above. This is one of the many reasons I think PCI is not the best choice for PV. > >> Today there is a large performance disparity between what a KVM guest >> sees and what a native linux application sees on that same host. Just >> take a look at some of my graphs between "virtio", and "native", for >> example: >> >> http://developer.novell.com/wiki/images/b/b7/31-rc4_throughput.png >> > > That's a red herring. The problem is not with virtio as an ABI, but > with its implementation in userspace. vhost-net should offer equivalent > performance to vbus. That's pure speculation. I would advise you to reserve such statements until after a proper bakeoff can be completed. This is not to mention that vhost-net does nothing to address our other goals, like scheduler coordination and non-802.x fabrics. 
> >> A dominant vbus design principle is to try to achieve the same IO >> performance for all "linux applications" whether they be literally >> userspace applications, or things like KVM vcpus or Ira's physical >> boards. It also aims to solve problems not previously expressible with >> current technologies (even virtio), like nested real-time. >> >> And even though you repeatedly insist otherwise, the neat thing here is >> that the two technologies mesh (at least under certain circumstances, >> like when virtio is deployed on a shared-memory friendly linux backend >> like KVM). I hope that my stack diagram below depicts that clearly. >> > > Right, when you ignore the points where they don't fit, it's a perfect > mesh. Where doesn't it fit? > >>> But that's not a strong argument for vbus; instead of adding vbus you >>> could make virtio more friendly to non-virt >>> >> Actually, it _is_ a strong argument then because adding vbus is what >> helps makes virtio friendly to non-virt, at least for when performance >> matters. >> > > As vhost-net shows, you can do that without vbus Citation please. Afaict, the one use case that we looked at for vhost outside of KVM failed to adapt properly, so I do not see how this is true. > and without breaking compatibility. Compatibility with what? vhost hasn't even been officially deployed in KVM environments afaict, nevermind non-virt. Therefore, how could it possibly have compatibility constraints with something non-virt already? Citation please. > > > >>> Right. virtio assumes that it's in a virt scenario and that the guest >>> architecture already has enumeration and hotplug mechanisms which it >>> would prefer to use. That happens to be the case for kvm/x86. >>> >> No, virtio doesn't assume that. It's stack provides the "virtio-bus" >> abstraction and what it does assume is that it will be wired up to >> something underneath. Kvm/x86 conveniently has pci, so the virtio-pci >> adapter was created to reuse much of that facility. 
For other things >> like lguest and s360, something new had to be created underneath to make >> up for the lack of pci-like support. >> > > Right, I was wrong there. But it does allow you to have a 1:1 mapping > between native devices and virtio devices. vbus allows you to have 1:1 if that is what you want, but we strive to do better. > > >>>> So to answer your question, the difference is that the part that has to >>>> be customized in vbus should be a fraction of what needs to be >>>> customized with vhost because it defines more of the stack. >>>> >>> But if you want to use the native mechanisms, vbus doesn't have any >>> added value. >>> >> First of all, thats incorrect. If you want to use the "native" >> mechanisms (via the way the vbus-connector is implemented, for instance) >> you at least still have the benefit that the backend design is more >> broadly re-useable in more environments (like non-virt, for instance), >> because vbus does a proper job of defining the requisite >> layers/abstractions compared to vhost. So it adds value even in that >> situation. >> > > Maybe. If vhost-net isn't sufficient I'm sure there will be patches sent. It isn't, and I've already done that. > >> Second of all, with PV there is no such thing as "native". It's >> software so it can be whatever we want. Sure, you could argue that the >> guest may have built-in support for something like PCI protocol. [1] >> However, PCI protocol itself isn't suitable for high-performance PV out >> of the can. So you will therefore invariably require new software >> layers on top anyway, even if part of the support is already included. >> > > Of course there is such a thing as native, a pci-ready guest has tons of > support built into it I specifically mentioned that already ([1]). You are also overstating its role, since the basic OS is what implements the native support for bus-objects, hotswap, etc, _not_ PCI. 
PCI just rides underneath and feeds trivial events up, as do other bus-types (usb, scsi, vbus, etc). And once those events are fed, you still need a PV layer to actually handle the bus interface in a high-performance manner so its not like you really have a "native" stack in either case. > that doesn't need to be retrofitted. No, that is incorrect. You have to heavily modify the pci model with layers on top to get any kind of performance out of it. Otherwise, we would just use realtek emulation, which is technically the native PCI you are apparently so enamored with. Not to mention there are things you just plain can't do in PCI today, like dynamically assign signal-paths, priority, and coalescing, etc. > Since > practically everyone (including Xen) does their paravirt drivers atop > pci, the claim that pci isn't suitable for high performance is incorrect. Actually IIUC, I think Xen bridges to their own bus as well (and only where they have to), just like vbus. They don't use PCI natively. PCI is perfectly suited as a bridge transport for PV, as I think the Xen and vbus examples have demonstrated. Its the 1:1 device-model where PCI has the most problems. > > >> And lastly, why would you _need_ to use the so called "native" >> mechanism? The short answer is, "you don't". Any given system (guest >> or bare-metal) already have a wide-range of buses (try running "tree >> /sys/bus" in Linux). More importantly, the concept of adding new buses >> is widely supported in both the Windows and Linux driver model (and >> probably any other guest-type that matters). Therefore, despite claims >> to the contrary, its not hard or even unusual to add a new bus to the >> mix. >> > > The short answer is "compatibility". There was a point in time where the same could be said for virtio-pci based drivers vs realtek and e1000, so that argument is demonstrably silly. 
No one tried to make virtio work in a binary compatible way with realtek emulation, yet we all survived the requirement for loading a virtio driver to my knowledge. The bottom line is: Binary device compatibility is not required in any other system (as long as you follow sensible versioning/id rules), so why is KVM considered special? The fact is, it isn't special (at least not in this regard). What _is_ required is "support" and we fully intend to support these proposed components. I assure you that at least the users that care about maximum performance will not generally mind loading a driver. Most of them would have to anyway if they want to get beyond realtek emulation. > > >> In summary, vbus is simply one more bus of many, purpose built to >> support high-end IO in a virt-like model, giving controlled access to >> the linux-host underneath it. You can write a high-performance layer >> below the OS bus-model (vbus), or above it (virtio-pci) but either way >> you are modifying the stack to add these capabilities, so we might as >> well try to get this right. >> >> With all due respect, you are making a big deal out of a minor issue. >> > > It's not minor to me. I am certainly in no position to tell you how to feel, but this declaration would seem from my perspective to be more of a means to an end than a legitimate concern. Otherwise we would never have had virtio support in the first place, since it was not "compatible" with previous releases. > >>>> And, as >>>> alluded to in my diagram, both virtio-net and vhost (with some >>>> modifications to fit into the vbus framework) are potentially >>>> complementary, not competitors. >>>> >>>> >>> Only theoretically. The existing installed base would have to be thrown >>> away >>> >> "Thrown away" is pure hyperbole. The installed base, worst case, needs >> to load a new driver for a missing device. > > Yes, we all know how fun this is. Making systems perform 5x faster _is_ fun, yes. I love what I do for a living.
> Especially if the device changed is your boot disk. If and when that becomes a priority concern, that would be a function transparently supported in the BIOS shipped with the hypervisor, and would thus be invisible to the user. > You may not care about the pain caused to users, but I do, so I will > continue to insist on compatibility. No, you are incorrect on two counts. 1) Of course I care about pain to users or I wouldn't be funded. Right now the pain from my perspective is caused to users in the high-performance community who want to deploy KVM based solutions. They are unable to do so due to its performance disparity compared to bare-metal, outside of pass-through hardware which is not widely available in a lot of existing deployments. I aim to fix that disparity while reusing the existing hardware investment by writing smarter software, and I assure you that these users won't mind loading a driver in the guest to take advantage of it. For the users that don't care about maximum performance, there is no change (and thus zero pain) required. They can use realtek or virtio if they really want to. Neither is going away to my knowledge, and lets face it: 2.6Gb/s out of virtio to userspace isn't *that* bad. But "good enough" isn't good enough, and I won't rest till we get to native performance. Additionally, I want to support previously unavailable modes of operations (e.g. real-time) and advanced fabrics (e.g. IB). 2) True pain to users is not caused by lack of binary compatibility. Its caused by lack of support. And its a good thing or we would all be emulating 8086 architecture forever... ..oh wait, I guess we kind of do that already ;). But at least we can slip in something more advanced once in a while (APIC vs PIC, USB vs uart, iso9660 vs floppy, for instance) and update the guest stack instead of insisting it must look like ISA forever for compatibility's sake. > >>> or we'd need to support both. 
>>> >>> >>> >> No matter what model we talk about, there's always going to be a "both" >> since the userspace virtio models are probably not going to go away (nor >> should they). >> > > virtio allows you to have userspace-only, kernel-only, or > start-with-userspace-and-move-to-kernel-later, all transparent to the > guest. In many cases we'll stick with userspace-only. The user will not care where the model lives, per se. Only that it is supported, and it works well. Likewise, I know from experience that the developer will not like writing the same code twice, so the "runs in both" model is not necessarily a great design trait either. > >>> All this is after kvm has decoded that vbus is addresses. It can't work >>> without someone outside vbus deciding that. >>> >> How the connector message is delivered is really not relevant. Some >> architectures will simply deliver the message point-to-point (like the >> original hypercall design for KVM, or something like Ira's rig), and >> some will need additional demuxing (like pci-bridge/pio based KVM). >> It's an implementation detail of the connector. >> >> However, the real point here is that something needs to establish a >> scoped namespace mechanism, add items to that namespace, and advertise >> the presence of the items to the guest. vbus has this facility built in >> to its stack. vhost doesn't, so it must come from elsewhere. >> > > So we have: vbus needs a connector, vhost needs a connector. vbus > doesn't need userspace to program the addresses (but does need userspace > to instantiate the devices and to program the bus address decode) First of all, bus-decode is substantially easier than per-device decode (you have to track all those per-device/per-signal fds somewhere, integrate with hotswap, etc), and its only done once per guest at startup and left alone. So its already not apples to apples. 
Second, while its true that the general kvm-connector bus-decode needs to be programmed, that is a function of adapting to the environment that _you_ created for me. The original kvm-connector was discovered via cpuid and hypercalls, and didn't need userspace at all to set it up. Therefore it would be entirely unfair of you to turn around and somehow try to use that trait of the design against me since you yourself imposed it. As an additional data point, our other connectors have no such bus-decode programming requirement. Therefore, this is clearly just a property of the KVM environment, not a function of the overall vbus design. > vhost needs userspace to instantiate the devices and program the addresses. > Right. And among other shortcomings it also requires a KVM-esque memory model (which is not always going to work as we recently discussed), and a redundant device-model to back it up in userspace, which is a development and maintenance burden, and an external bus-model (filled by pio-bus in KVM today). >>>> In fact, it's actually a simpler design to unify things this way >>>> because >>>> you avoid splitting the device model up. Consider how painful the vhost >>>> implementation would be if it didn't already have the userspace >>>> virtio-net to fall-back on. This is effectively what we face for new >>>> devices going forward if that model is to persist. >>>> >>>> >>> >>> It doesn't have just virtio-net, it has userspace-based hostplug >>> >> vbus has hotplug too: mkdir and rmdir >> > > Does that work from nonprivileged processes? It will with the ioctl based control interface that I'll merge shortly. > Does it work on Windows? This question doesn't make sense. Hotswap control occurs on the host, which is always Linux. If you were asking about whether a windows guest will support hotswap: the answer is "yes". 
Our windows driver presents a unique PDO/FDO pair for each logical device instance that is pushed out (just like the built in usb, pci, scsi bus drivers that windows supports natively). > >> As an added bonus, its device-model is modular. A developer can write a >> new device model, compile it, insmod it to the host kernel, hotplug it >> to the running guest with mkdir/ln, and then come back out again >> (hotunplug with rmdir, rmmod, etc). They may do this all without taking >> the guest down, and while eating QEMU based IO solutions for breakfast >> performance wise. >> >> Afaict, qemu can't do either of those things. >> > > We've seen that herring before, Citation? > and it's redder than ever. This is more hyperbole. I doubt that there would be many that would argue that a modular architecture (that we get for free with LKM support) is not desirable, even if it's never used dynamically with a running guest. OTOH, I actually use this dynamic feature all the time as I test my components, so it's at least useful to me. > > > >>> Refactor instead of duplicating. >>> >> There is no duplicating. vbus has no equivalent today as virtio doesn't >> define these layers. >> > > So define them if they're missing. I just did. > > >>>> >>>> >>>>> Use libraries (virtio-shmem.ko, libvhost.so). >>>>> >>>>> >>>> What do you suppose vbus is? vbus-proxy.ko = virtio-shmem.ko, and you >>>> don't need libvhost.so per se since you can just use standard kernel >>>> interfaces (like configfs/sysfs). I could create an .so going forward >>>> for the new ioctl-based interface, I suppose. >>>> >>>> >>> Refactor instead of rewriting. >>> >> There is no rewriting. vbus has no equivalent today as virtio doesn't >> define these layers. >> >> By your own admission, you said if you wanted that capability, use a >> library. What I think you are not understanding is vbus _is_ that >> library. So what is the problem, exactly? >> > > It's not compatible. No, that is incorrect.
What you are apparently not understanding is that not only is vbus that library, but its extensible. So even if compatibility is your goal (it doesn't need to be IMO) it can be accommodated by how you interface to the library. > If you were truly worried about code duplication > in virtio, you'd refactor it to remove the duplication, My primary objective is creating an extensible, high-performance, shared-memory interconnect for systems that utilize a Linux host as their IO-hub. It just so happens that virtio can sit nicely on top of such a model because shmem-rings are a subclass of shmem. As a result of its design, vbus also helps to reduce code duplication in the stack for new environments due to its extensible nature. However, vbus also has goals beyond what virtio is providing today that are of more concern, and part of that is designing a connector/bus that eliminates the shortcomings in the current pci-based design. > without affecting existing guests. Already covered above. > >>>>> For kvm/x86 pci definitely remains king. >>>>> >>>>> >>>> For full virtualization, sure. I agree. However, we are talking about >>>> PV here. For PV, PCI is not a requirement and is a technical dead-end >>>> IMO. >>>> >>>> KVM seems to be the only virt solution that thinks otherwise (*), but I >>>> believe that is primarily a condition of its maturity. I aim to help >>>> advance things here. >>>> >>>> (*) citation: xen has xenbus, lguest has lguest-bus, vmware has some >>>> vmi-esq thing (I forget what its called) to name a few. Love 'em or >>>> hate 'em, most other hypervisors do something along these lines. I'd >>>> like to try to create one for KVM, but to unify them all (at least for >>>> the Linux-based host designs). >>>> >>>> >>> VMware are throwing VMI away (won't be supported in their new product, >>> and they've sent a patch to rip it off from Linux); >>> >> vmware only cares about x86 iiuc, so probably not a good example. >> > > Well, you brought it up. 
Between you and me, I only care about x86 too. Fair enough. > >>> Xen has to tunnel >>> xenbus in pci for full virtualization (which is where Windows is, and >>> where Linux will be too once people realize it's faster). lguest is >>> meant as an example hypervisor, not an attempt to take over the world. >>> >> So pick any other hypervisor, and the situation is often similar. >> > > The situation is often pci. Even if that were true, which is debatable, do not confuse "convenient" with "optimal". If you don't care about maximum performance and advanced features like QOS, sure go ahead and use PCI. Why not. > >> >>> And right now you can have a guest using pci to access a mix of >>> userspace-emulated devices, userspace-emulated-but-kernel-accelerated >>> virtio devices, and real host devices. All on one dead-end bus. Try >>> that with vbus. >>> >> vbus is not interested in userspace devices. The charter is to provide >> facilities for utilizing the host linux kernel's IO capabilities in the >> most efficient, yet safe, manner possible. Those devices that fit >> outside that charter can ride on legacy mechanisms if that suits them >> best. >> > > vbus isn't, but I am. I would prefer not to have to expose > implementation decisions (kernel vs userspace) to the guest (vbus vs pci). > >>>> That won't cut it. For one, creating an eventfd is only part of the >>>> equation. I.e. you need to have originate/terminate somewhere >>>> interesting (and in-kernel, otherwise use tuntap). >>>> >>>> >>> vbus needs the same thing so it cancels out. >>> >> No, it does not. vbus just needs a relatively simple single message >> pipe between the guest and host (think "hypercall tunnel", if you will). >> > > That's ioeventfd. So far so similar. No, that is incorrect. For one, vhost uses them on a per-signal path basis, whereas vbus only has one channel for the entire guest->host. Second, I do not use ioeventfd anymore because it has too many problems with the surrounding technology.
However, that is a topic for a different thread. > >> Per queue/device addressing is handled by the same conceptual namespace >> as the one that would trigger eventfds in the model you mention. And >> that namespace is built in to the vbus stack, and objects are registered >> automatically as they are created. >> >> Contrast that to vhost, which requires some other kernel interface to >> exist, and to be managed manually for each object that is created. Your >> libvhostconfig would need to somehow know how to perform this >> registration operation, and there would have to be something in the >> kernel to receive it, presumably on a per platform basis. Solving this >> problem generally would probably end up looking eerily like vbus, >> because that's what vbus does. >> > > vbus devices aren't magically instantiated. Userspace needs to > instantiate them too. Sure, there's less work on the host side since > you're using vbus instead of the native interface, but more work on the > guest side since you're using vbus instead of the native interface. No, that is incorrect. The amount of "work" that a guest does is actually the same in both cases, since the guest OS performs the hotswap handling natively for all bus types (at least for Linux and Windows). You still need to have a PV layer to interface with those objects in both cases, as well, so there is no such thing as "native interface" for PV. It's only a matter of where it occurs in the stack. > > > >>> Well, let's see. Can vbus today: >>> >>> - let userspace know which features are available (so it can decide if >>> live migration is possible) >>> >> yes, it's in sysfs. >> >> >>> - let userspace limit which features are exposed to the guest (so it can >>> make live migration possible among hosts of different capabilities) >>> >> yes, it's in sysfs. >> > > Per-device? Yes, see /sys/vbus/devices/$dev/ to get per-instance attributes > non-privileged-user capable? The short answer is "not yet (I think)".
I need to write a patch to properly set the mode attribute in sysfs, but I think this will be trivial. > >>> - let userspace know which features were negotiated (so it can transfer >>> them to the other host during live migration) >>> >> no, but we can easily add ->save()/->restore() to the model going >> forward, and the negotiated features are just a subcomponent if its >> serialized stream. >> >> >>> - let userspace tell the kernel which features were negotiated (when >>> live migration completes, to avoid requiring the guest to re-negotiate) >>> >> that would be the function of the ->restore() deserializer. >> >> >>> - do all that from an unprivileged process >>> >> yes, in the upcoming alacrityvm v0.3 with the ioctl based control plane. >> > > Ah, so you have two control planes. So what? If anything, it goes to show how extensible the framework is that a new plane could be added in 119 lines of code: ~/git/linux-2.6> stg show vbus-add-admin-ioctls.patch | diffstat Makefile | 3 - config-ioctl.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 1 deletion(-) if and when having two control planes exceeds its utility, I will submit a simple patch that removes the useless one. > >> Bottom line: vbus isn't done, especially w.r.t. live-migration..but that >> is not an valid argument against the idea if you believe in >> release-early/release-often. kvm wasn't (isn't) done either when it was >> proposed/merged. >> >> > > kvm didn't have an existing counterpart in Linux when it was > proposed/merged. > And likewise, neither does vbus. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-30 20:04 ` Gregory Haskins @ 2009-10-01 8:34 ` Avi Kivity 2009-10-01 9:28 ` Michael S. Tsirkin 2009-10-01 19:24 ` Gregory Haskins 0 siblings, 2 replies; 83+ messages in thread From: Avi Kivity @ 2009-10-01 8:34 UTC (permalink / raw) To: Gregory Haskins Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/30/2009 10:04 PM, Gregory Haskins wrote: >> A 2.6.27 guest, or Windows guest with the existing virtio drivers, won't work >> over vbus. >> > Binary compatibility with existing virtio drivers, while nice to have, > is not a specific requirement nor goal. We will simply load an updated > KMP/MSI into those guests and they will work again. As previously > discussed, this is how more or less any system works today. It's like > we are removing an old adapter card and adding a new one to "uprev the > silicon". > Virtualization is about not doing that. Sometimes it's necessary (when you have made unfixable design mistakes), but just to replace a bus, with no advantages to the guest that has to be changed (other hypervisors or hypervisorless deployment scenarios aren't). >> Further, non-shmem virtio can't work over vbus. >> > Actually I misspoke earlier when I said virtio works over non-shmem. > Thinking about it some more, both virtio and vbus fundamentally require > shared-memory, since sharing their metadata concurrently on both sides > is their raison d'être. > > The difference is that virtio utilizes a pre-translation/mapping (via > ->add_buf) from the guest side. OTOH, vbus uses a post translation > scheme (via memctx) from the host-side. If anything, vbus is actually > more flexible because it doesn't assume the entire guest address space > is directly mappable. > > In summary, your statement is incorrect (though it is my fault for > putting that idea in your head). 
> Well, Xen requires pre-translation (since the guest has to give the host (which is just another guest) permissions to access the data). So neither is a superset of the other, they're just different. It doesn't really matter since Xen is unlikely to adopt virtio. > An interesting thing here is that you don't even need a fancy > multi-homed setup to see the effects of my exit-ratio reduction work: > even single port configurations suffer from the phenomenon since many > devices have multiple signal-flows (e.g. network adapters tend to have > at least 3 flows: rx-ready, tx-complete, and control-events (link-state, > etc). Whats worse, is that the flows often are indirectly related (for > instance, many host adapters will free tx skbs during rx operations, so > you tend to get bursts of tx-completes at the same time as rx-ready. If > the flows map 1:1 with IDT, they will suffer the same problem. > You can simply use the same vector for both rx and tx and poll both at every interrupt. > In any case, here is an example run of a simple single-homed guest over > standard GigE. Whats interesting here is that .qnotify to .notify > ratio, as this is the interrupt-to-signal ratio. In this case, its > 170047/151918, which comes out to about 11% savings in interrupt injections: > > vbus-guest:/home/ghaskins # netperf -H dev > TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to > dev.laurelwood.net (192.168.1.10) port 0 AF_INET > Recv Send Send > Socket Socket Message Elapsed > Size Size Size Time Throughput > bytes bytes bytes secs. 
10^6bits/sec > > 1048576 16384 16384 10.01 940.77 > vbus-guest:/home/ghaskins # cat /sys/kernel/debug/pci-to-vbus-bridge > .events : 170048 > .qnotify : 151918 > .qinject : 0 > .notify : 170047 > .inject : 18238 > .bridgecalls : 18 > .buscalls : 12 > vbus-guest:/home/ghaskins # cat /proc/interrupts > CPU0 > 0: 87 IO-APIC-edge timer > 1: 6 IO-APIC-edge i8042 > 4: 733 IO-APIC-edge serial > 6: 2 IO-APIC-edge floppy > 7: 0 IO-APIC-edge parport0 > 8: 0 IO-APIC-edge rtc0 > 9: 0 IO-APIC-fasteoi acpi > 10: 0 IO-APIC-fasteoi virtio1 > 12: 90 IO-APIC-edge i8042 > 14: 3041 IO-APIC-edge ata_piix > 15: 1008 IO-APIC-edge ata_piix > 24: 151933 PCI-MSI-edge vbus > 25: 0 PCI-MSI-edge virtio0-config > 26: 190 PCI-MSI-edge virtio0-input > 27: 28 PCI-MSI-edge virtio0-output > NMI: 0 Non-maskable interrupts > LOC: 9854 Local timer interrupts > SPU: 0 Spurious interrupts > CNT: 0 Performance counter interrupts > PND: 0 Performance pending work > RES: 0 Rescheduling interrupts > CAL: 0 Function call interrupts > TLB: 0 TLB shootdowns > TRM: 0 Thermal event interrupts > THR: 0 Threshold APIC interrupts > MCE: 0 Machine check exceptions > MCP: 1 Machine check polls > ERR: 0 > MIS: 0 > > Its important to note here that we are actually looking at the interrupt > rate, not the exit rate (which is usually a multiple of the interrupt > rate, since you have to factor in as many as three exits per interrupt > (IPI, window, EOI). Therefore we saved about 18k interrupts in this 10 > second burst, but we may have actually saved up to 54k exits in the > process. This is only over a 10 second window at GigE rates, so YMMV. > These numbers get even more dramatic on higher end hardware, but I > haven't had a chance to generate new numbers yet. 
> (irq window exits should only be required on a small percentage of interrupt injections, since the guest will try to disable interrupts for short periods only) > Looking at some external stats paints an even bleaker picture: "exits" > as reported by kvm_stat for virtio-pci based virtio-net tip the scales > at 65k/s vs 36k/s for vbus based venet. And virtio is consuming ~30% of > my quad-core's cpu, vs 19% for venet during the test. It's hard to know > which innovation or innovations may be responsible for the entire > reduction, but certainly the interrupt-to-signal ratio mentioned above > is probably helping. > Can you please stop comparing userspace-based virtio hosts to kernel-based venet hosts? We know the userspace implementation sucks. > The even worse news for 1:1 models is that the ratio of > exits-per-interrupt climbs with load (exactly when it hurts the most) > since that is when the probability that the vcpu will need all three > exits is the highest. > Requiring all three exits means the guest is spending most of its time with interrupts disabled; that's unlikely. Thanks for the numbers. Are those 11% attributable to rx/tx piggybacking from the same interface? Also, 170K interrupts -> 17K interrupts/sec -> 55kbit/interrupt -> 6.8kB/interrupt. Ignoring interrupt merging and assuming equal rx/tx distribution, that's about 13kB/interrupt. Seems rather low for a saturated link. >> >>> and prioritizable/nestable signals. >>> >>> >> That doesn't belong in a bus. >> > Everyone is of course entitled to an opinion, but the industry as a > whole would disagree with you. Signal path routing (1:1, aggregated, > etc) is at the discretion of the bus designer. Most buses actually do > _not_ support 1:1 with IDT (think USB, SCSI, IDE, etc). > With standard PCI, they do not. But all modern host adapters support MSI and they will happily give you one interrupt per queue. > PCI is somewhat of an outlier in that regard afaict.
Its actually a > nice feature of PCI when its used within its design spec (HW). For > SW/PV, 1:1 suffers from, among other issues, that "triple-exit scaling" > issue in the signal path I mentioned above. This is one of the many > reasons I think PCI is not the best choice for PV. > Look at the vmxnet3 submission (recently posted on virtualization@). It's a perfectly ordinary PCI NIC driver, apart from having so many 'V's in the code. 16 rx queues, 8 tx queues, 25 MSIs, BARs for the registers. So while the industry as a whole might disagree with me, it seems VMware does not. >>> http://developer.novell.com/wiki/images/b/b7/31-rc4_throughput.png >>> >>> >> That's a red herring. The problem is not with virtio as an ABI, but >> with its implementation in userspace. vhost-net should offer equivalent >> performance to vbus. >> > That's pure speculation. I would advise you to reserve such statements > until after a proper bakeoff can be completed. Let's do that then. Please reserve the corresponding comparisons from your side as well. > This is not to mention > that vhost-net does nothing to address our other goals, like scheduler > coordination and non-802.x fabrics. > What are scheduler coordination and non-802.x fabrics? >> Right, when you ignore the points where they don't fit, it's a perfect >> mesh. >> > Where doesn't it fit? > (avoiding infinite loop) >>>> But that's not a strong argument for vbus; instead of adding vbus you >>>> could make virtio more friendly to non-virt >>>> >>>> >>> Actually, it _is_ a strong argument then because adding vbus is what >>> helps makes virtio friendly to non-virt, at least for when performance >>> matters. >>> >>> >> As vhost-net shows, you can do that without vbus >> > Citation please. Afaict, the one use case that we looked at for vhost > outside of KVM failed to adapt properly, so I do not see how this is true. > I think Ira said he can make vhost work? >> and without breaking compatibility. >> > Compatibility with what? 
vhost hasn't even been officially deployed in > KVM environments afaict, nevermind non-virt. Therefore, how could it > possibly have compatibility constraints with something non-virt already? > Citation please. > virtio-net over pci is deployed. Replacing the backend with vhost-net will require no guest modifications. Replacing the frontend with venet or virt-net/vbus-pci will require guest modifications. Obviously virtio-net isn't deployed in non-virt. But if we adopt vbus, we have to migrate guests. >> Of course there is such a thing as native, a pci-ready guest has tons of >> support built into it >> > I specifically mentioned that already ([1]). > > You are also overstating its role, since the basic OS is what implements > the native support for bus-objects, hotswap, etc, _not_ PCI. PCI just > rides underneath and feeds trivial events up, as do other bus-types > (usb, scsi, vbus, etc). But we have to implement vbus for each guest we want to support. That includes Windows and older Linux which has a different internal API, so we have to port the code multiple times, to get existing functionality. > And once those events are fed, you still need a > PV layer to actually handle the bus interface in a high-performance > manner so it's not like you really have a "native" stack in either case. > virtio-net doesn't use any pv layer. >> that doesn't need to be retrofitted. >> > No, that is incorrect. You have to heavily modify the pci model with > layers on top to get any kind of performance out of it. Otherwise, we > would just use realtek emulation, which is technically the native PCI > you are apparently so enamored with. > virtio-net doesn't modify the PCI model. And if you look at vmxnet3, they mention that it conforms to something called UPT, which allows hardware vendors to implement parts of their NIC model. So vmxnet3 is apparently suitable to both hardware and software implementations.
> Not to mention there are things you just plain can't do in PCI today, > like dynamically assign signal-paths, You can have dynamic MSI/queue routing with virtio, and each MSI can be routed to a vcpu at will. > priority, and coalescing, etc. > Do you mean interrupt priority? Well, apic allows interrupt priorities and Windows uses them; Linux doesn't. I don't see a reason to provide more than native hardware. >> Since >> practically everyone (including Xen) does their paravirt drivers atop >> pci, the claim that pci isn't suitable for high performance is incorrect. >> > Actually IIUC, I think Xen bridges to their own bus as well (and only > where they have to), just like vbus. They don't use PCI natively. PCI > is perfectly suited as a bridge transport for PV, as I think the Xen and > vbus examples have demonstrated. It's the 1:1 device-model where PCI has > the most problems. > N:1 breaks down on large guests since one vcpu will have to process all events. You could do N:M, with commands to change routings, but where's your userspace interface? You can't tell from /proc/interrupts which vbus interrupts are active, and irqbalance can't steer them towards less busy cpus since they're invisible to the interrupt controller. >>> And lastly, why would you _need_ to use the so called "native" >>> mechanism? The short answer is, "you don't". Any given system (guest >>> or bare-metal) already have a wide-range of buses (try running "tree >>> /sys/bus" in Linux). More importantly, the concept of adding new buses >>> is widely supported in both the Windows and Linux driver model (and >>> probably any other guest-type that matters). Therefore, despite claims >>> to the contrary, it's not hard or even unusual to add a new bus to the >>> mix. >>> >>> >> The short answer is "compatibility". >> > There was a point in time where the same could be said for virtio-pci > based drivers vs realtek and e1000, so that argument is demonstrably > silly.
No one tried to make virtio work in a binary compatible way with > realtek emulation, yet we all survived the requirement for loading a > virtio driver to my knowledge. > The larger your installed base, the more difficult it is. Of course it's doable, but I prefer not doing it and instead improving things in a binary backwards compatible manner. If there is no choice we will bow to the inevitable and make our users upgrade. But at this point there is a choice, and I prefer to stick with vhost-net until it is proven that it won't work. > The bottom line is: Binary device compatibility is not required in any > other system (as long as you follow sensible versioning/id rules), so > why is KVM considered special? > One of the benefits of virtualization is that the guest model is stable. You can live-migrate guests and upgrade the hardware underneath. You can have a single guest image that you clone to provision new guests. If you switch to a new model, you give up those benefits, or you support both models indefinitely. Note even hardware nowadays is binary compatible. One e1000 driver supports a ton of different cards, and I think (not sure) newer cards will work with older drivers, just without all their features. > The fact is, it isn't special (at least not in this regard). What _is_ > required is "support" and we fully intend to support these proposed > components. I assure you that at least the users that care about > maximum performance will not generally mind loading a driver. Most of > them would have to anyway if they want to get beyond realtek emulation. > For a new install, sure. I'm talking about existing deployments (and those that will exist by the time vbus is ready for roll out). > I am certainly in no position to tell you how to feel, but this > declaration would seem from my perspective to be more of a means to an > end than a legitimate concern. 
Otherwise we would never have had virtio > support in the first place, since it was not "compatible" with previous > releases. > virtio was certainly not pain free, needing Windows drivers, updates to management tools (you can't enable it by default, so you have to offer it as a choice), mkinitrd, etc. I'd rather not have to go through that again. >> Especially if the device changed is your boot disk. >> > If and when that becomes a priority concern, that would be a function > transparently supported in the BIOS shipped with the hypervisor, and > would thus be invisible to the user. > No, you have to update the driver in your initrd (for Linux) or properly install the new driver (for Windows). It's especially difficult for Windows. >> You may not care about the pain caused to users, but I do, so I will >> continue to insist on compatibility. >> > For the users that don't care about maximum performance, there is no > change (and thus zero pain) required. They can use realtek or virtio if > they really want to. Neither is going away to my knowledge, and lets > face it: 2.6Gb/s out of virtio to userspace isn't *that* bad. But "good > enough" isn't good enough, and I won't rest till we get to native > performance. I don't want to support both virtio and vbus in parallel. There's enough work already. If we adopt vbus, we'll have to deprecate and eventually kill off virtio. > 2) True pain to users is not caused by lack of binary compatibility. > Its caused by lack of support. And its a good thing or we would all be > emulating 8086 architecture forever... > > ..oh wait, I guess we kind of do that already ;). But at least we can > slip in something more advanced once in a while (APIC vs PIC, USB vs > uart, iso9660 vs floppy, for instance) and update the guest stack > instead of insisting it must look like ISA forever for compatibility's sake. > PCI is continuously updated, with MSI, MSI-X, and IOMMU support being some recent updates. 
I'd like to ride on top of that instead of having to clone it for every guest I support. >> So we have: vbus needs a connector, vhost needs a connector. vbus >> doesn't need userspace to program the addresses (but does need userspace >> to instantiate the devices and to program the bus address decode) >> > First of all, bus-decode is substantially easier than per-device decode > (you have to track all those per-device/per-signal fds somewhere, > integrate with hotswap, etc), and its only done once per guest at > startup and left alone. So its already not apples to apples. > Right, it means you can hand off those eventfds to other qemus or other pure userspace servers. It's more flexible. > Second, while its true that the general kvm-connector bus-decode needs > to be programmed, that is a function of adapting to the environment > that _you_ created for me. The original kvm-connector was discovered > via cpuid and hypercalls, and didn't need userspace at all to set it up. > Therefore it would be entirely unfair of you to turn around and somehow > try to use that trait of the design against me since you yourself > imposed it. > No kvm feature will ever be exposed to a guest without userspace intervention. It's a basic requirement. If it causes complexity (and it does) we have to live with it. >> Does it work on Windows? >> > This question doesn't make sense. Hotswap control occurs on the host, > which is always Linux. > > If you were asking about whether a windows guest will support hotswap: > the answer is "yes". Our windows driver presents a unique PDO/FDO pair > for each logical device instance that is pushed out (just like the built > in usb, pci, scsi bus drivers that windows supports natively). > Ah, you have a Windows venet driver? >>> As an added bonus, its device-model is modular. 
A developer can write a >>> new device model, compile it, insmod it to the host kernel, hotplug it >>> to the running guest with mkdir/ln, and the come back out again >>> (hotunplug with rmdir, rmmod, etc). They may do this all without taking >>> the guest down, and while eating QEMU based IO solutions for breakfast >>> performance wise. >>> >>> Afaict, qemu can't do either of those things. >>> >>> >> We've seen that herring before, >> > Citation? > It's the compare venet-in-kernel to virtio-in-userspace thing again. Let's defer that until mst complete vhost-net mergable buffers, it which time we can compare vhost-net to venet and see how much vbus contributes to performance and how much of it comes from being in-kernel. >>>> Refactor instead of duplicating. >>>> >>>> >>> There is no duplicating. vbus has no equivalent today as virtio doesn't >>> define these layers. >>> >>> >> So define them if they're missing. >> > I just did. > Since this is getting confusing to me, I'll start from scratch looking at the vbus layers, top to bottom: Guest side: 1. venet guest kernel driver - AFAICT, duplicates the virtio-net guest driver functionality 2. vbus guest driver (config and hotplug) - duplicates pci, or if you need non-pci support, virtio config and its pci bindings; needs reimplementation for all supported guests 3. vbus guest driver (interrupt coalescing, priority) - if needed, should be implemented as an irqchip (and be totally orthogonal to the driver); needs reimplementation for all supported guests 4. vbus guest driver (shm/ioq) - finder grained layering than virtio (which only supports the combination, due to the need for Xen support); can be retrofitted to virtio at some cost Host side: 1. venet host kernel driver - is duplicated by vhost-net; doesn't support live migration, unprivileged users, or slirp 2. vbus host driver (config and hotplug) - duplicates pci support in userspace (which will need to be kept in any case); already has two userspace interfaces 3. 
vbus host driver (interrupt coalescing, priority) - if we think we need it (and I don't), should be part of kvm core, not a bus 4. vbus host driver (shm) - partially duplicated by vhost memory slots 5. vbus host driver (ioq) - duplicates userspace virtio, duplicated by vhost >>> There is no rewriting. vbus has no equivalent today as virtio doesn't >>> define these layers. >>> >>> By your own admission, you said if you wanted that capability, use a >>> library. What I think you are not understanding is vbus _is_ that >>> library. So what is the problem, exactly? >>> >>> >> It's not compatible. >> > No, that is incorrect. What you are apparently not understanding is > that not only is vbus that library, but its extensible. So even if > compatibility is your goal (it doesn't need to be IMO) it can be > accommodated by how you interface to the library. > To me, compatible means I can live migrate an image to a new system without the user knowing about the change. You'll be able to do that with vhost-net. >>>> >>>> >>> No, it does not. vbus just needs a relatively simple single message >>> pipe between the guest and host (think "hypercall tunnel", if you will). >>> >>> >> That's ioeventfd. So far so similar. >> > No, that is incorrect. For one, vhost uses them on a per-signal path > basis, whereas vbus only has one channel for the entire guest->host. > You'll probably need to change that as you start running smp guests. > Second, I do not use ioeventfd anymore because it has too many problems > with the surrounding technology. However, that is a topic for a > different thread. > Please post your issues. I see ioeventfd/irqfd as critical kvm interfaces. >> vbus devices aren't magically instantiated. Userspace needs to >> instantiate them too. Sure, there's less work on the host side since >> you're using vbus instead of the native interface, but more work on the >> guest side since you're using vbus instead of the native interface. >> > > No, that is incorrect. 
The amount of "work" that a guest does is > actually the same in both cases, since the guest OS peforms the hotswap > handling natively for all bus types (at least for Linux and Windows). > You still need to have a PV layer to interface with those objects in > both cases, as well, so there is no such thing as "native interface" for > PV. Its only a matter of where it occurs in the stack. > I'm missing something. Where's the pv layer for virtio-net? Linux drivers have an abstraction layer to deal with non-pci. But the Windows drivers are ordinary pci drivers with nothing that looks pv-ish. You could implement virtio-net hardware if you wanted to. >> non-privileged-user capable? >> > The short answer is "not yet (I think)". I need to write a patch to > properly set the mode attribute in sysfs, but I think this will be trivial. > > (and selinux label) >> Ah, so you have two control planes. >> > So what? If anything, it goes to show how extensible the framework is > that a new plane could be added in 119 lines of code: > > ~/git/linux-2.6> stg show vbus-add-admin-ioctls.patch | diffstat > Makefile | 3 - > config-ioctl.c | 117 > +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 2 files changed, 119 insertions(+), 1 deletion(-) > > if and when having two control planes exceeds its utility, I will submit > a simple patch that removes the useless one. > It always begins with a 119-line patch and then grows, that's life. >> kvm didn't have an existing counterpart in Linux when it was >> proposed/merged. >> > And likewise, neither does vbus. > > For virt uses, I don't see the need. For non-virt, I have no opinion. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-10-01 8:34 ` Avi Kivity @ 2009-10-01 9:28 ` Michael S. Tsirkin 2009-10-01 19:24 ` Gregory Haskins 1 sibling, 0 replies; 83+ messages in thread From: Michael S. Tsirkin @ 2009-10-01 9:28 UTC (permalink / raw) To: Avi Kivity Cc: Gregory Haskins, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Thu, Oct 01, 2009 at 10:34:17AM +0200, Avi Kivity wrote: >> Second, I do not use ioeventfd anymore because it has too many problems >> with the surrounding technology. However, that is a topic for a >> different thread. >> > > Please post your issues. I see ioeventfd/irqfd as critical kvm interfaces. I second that. AFAIK ioeventfd/irqfd got exposed to userspace in 2.6.32-rc1, if there are issues we better nail them before 2.6.32 is out. And yes, please start a different thread. -- MST -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-10-01 8:34 ` Avi Kivity 2009-10-01 9:28 ` Michael S. Tsirkin @ 2009-10-01 19:24 ` Gregory Haskins 2009-10-03 10:00 ` Avi Kivity 1 sibling, 1 reply; 83+ messages in thread From: Gregory Haskins @ 2009-10-01 19:24 UTC (permalink / raw) To: Avi Kivity Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 42845 bytes --] Avi Kivity wrote: > On 09/30/2009 10:04 PM, Gregory Haskins wrote: > > >>> A 2.6.27 guest, or Windows guest with the existing virtio drivers, >>> won't work >>> over vbus. >>> >> Binary compatibility with existing virtio drivers, while nice to have, >> is not a specific requirement nor goal. We will simply load an updated >> KMP/MSI into those guests and they will work again. As previously >> discussed, this is how more or less any system works today. It's like >> we are removing an old adapter card and adding a new one to "uprev the >> silicon". >> > > Virtualization is about not doing that. Sometimes it's necessary (when > you have made unfixable design mistakes), but just to replace a bus, > with no advantages to the guest that has to be changed (other > hypervisors or hypervisorless deployment scenarios aren't). The problem is that your continued assertion that there is no advantage to the guest is a completely unsubstantiated claim. As it stands right now, I have a public git tree that, to my knowledge, is the fastest KVM PV networking implementation around. It also has capabilities that are demonstrably not found elsewhere, such as the ability to render generic shared-memory interconnects (scheduling, timers), interrupt-priority (qos), and interrupt-coalescing (exit-ratio reduction). I designed each of these capabilities after carefully analyzing where KVM was coming up short. Those are facts. 
I can't easily prove which of my new features alone are what makes it special per se, because I don't have unit tests for each part that breaks it down. What I _can_ state is that its the fastest and most feature rich KVM-PV tree that I am aware of, and others may download and test it themselves to verify my claims. The disproof, on the other hand, would be in a counter example that still meets all the performance and feature criteria under all the same conditions while maintaining the existing ABI. To my knowledge, this doesn't exist. Therefore, if you believe my work is irrelevant, show me a git tree that accomplishes the same feats in a binary compatible way, and I'll rethink my position. Until then, complaining about lack of binary compatibility is pointless since it is not an insurmountable proposition, and the one and only available solution declares it a required casualty. > >>> Further, non-shmem virtio can't work over vbus. >>> >> Actually I misspoke earlier when I said virtio works over non-shmem. >> Thinking about it some more, both virtio and vbus fundamentally require >> shared-memory, since sharing their metadata concurrently on both sides >> is their raison d'être. >> >> The difference is that virtio utilizes a pre-translation/mapping (via >> ->add_buf) from the guest side. OTOH, vbus uses a post translation >> scheme (via memctx) from the host-side. If anything, vbus is actually >> more flexible because it doesn't assume the entire guest address space >> is directly mappable. >> >> In summary, your statement is incorrect (though it is my fault for >> putting that idea in your head). >> > > Well, Xen requires pre-translation (since the guest has to give the host > (which is just another guest) permissions to access the data). Actually I am not sure that it does require pre-translation. You might be able to use the memctx->copy_to/copy_from scheme in post translation as well, since those would be able to communicate to something like the xen kernel. 
But I suppose either method would result in extra exits, so there is no distinct benefit using vbus there..as you say below "they're just different". The biggest difference is that my proposed model gets around the notion that the entire guest address space can be represented by an arbitrary pointer. For instance, the copy_to/copy_from routines take a GPA, but may use something indirect like a DMA controller to access that GPA. On the other hand, virtio fully expects a viable pointer to come out of the interface iiuc. This is in part what makes vbus more adaptable to non-virt. > So neither is a superset of the other, they're just different. > > It doesn't really matter since Xen is unlikely to adopt virtio. Agreed. > >> An interesting thing here is that you don't even need a fancy >> multi-homed setup to see the effects of my exit-ratio reduction work: >> even single port configurations suffer from the phenomenon since many >> devices have multiple signal-flows (e.g. network adapters tend to have >> at least 3 flows: rx-ready, tx-complete, and control-events (link-state, >> etc). Whats worse, is that the flows often are indirectly related (for >> instance, many host adapters will free tx skbs during rx operations, so >> you tend to get bursts of tx-completes at the same time as rx-ready. If >> the flows map 1:1 with IDT, they will suffer the same problem. >> > > You can simply use the same vector for both rx and tx and poll both at > every interrupt. Yes, but that has its own problems: e.g. additional exits or at least additional overhead figuring out what happens each time. This is even more important as we scale out to MQ which may have dozens of queue pairs. You really want finer grained signal-path decode if you want peak performance. > >> In any case, here is an example run of a simple single-homed guest over >> standard GigE. Whats interesting here is that .qnotify to .notify >> ratio, as this is the interrupt-to-signal ratio. 
In this case, its >> 170047/151918, which comes out to about 11% savings in interrupt >> injections: >> >> vbus-guest:/home/ghaskins # netperf -H dev >> TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to >> dev.laurelwood.net (192.168.1.10) port 0 AF_INET >> Recv Send Send >> Socket Socket Message Elapsed >> Size Size Size Time Throughput >> bytes bytes bytes secs. 10^6bits/sec >> >> 1048576 16384 16384 10.01 940.77 >> vbus-guest:/home/ghaskins # cat /sys/kernel/debug/pci-to-vbus-bridge >> .events : 170048 >> .qnotify : 151918 >> .qinject : 0 >> .notify : 170047 >> .inject : 18238 >> .bridgecalls : 18 >> .buscalls : 12 >> vbus-guest:/home/ghaskins # cat /proc/interrupts >> CPU0 >> 0: 87 IO-APIC-edge timer >> 1: 6 IO-APIC-edge i8042 >> 4: 733 IO-APIC-edge serial >> 6: 2 IO-APIC-edge floppy >> 7: 0 IO-APIC-edge parport0 >> 8: 0 IO-APIC-edge rtc0 >> 9: 0 IO-APIC-fasteoi acpi >> 10: 0 IO-APIC-fasteoi virtio1 >> 12: 90 IO-APIC-edge i8042 >> 14: 3041 IO-APIC-edge ata_piix >> 15: 1008 IO-APIC-edge ata_piix >> 24: 151933 PCI-MSI-edge vbus >> 25: 0 PCI-MSI-edge virtio0-config >> 26: 190 PCI-MSI-edge virtio0-input >> 27: 28 PCI-MSI-edge virtio0-output >> NMI: 0 Non-maskable interrupts >> LOC: 9854 Local timer interrupts >> SPU: 0 Spurious interrupts >> CNT: 0 Performance counter interrupts >> PND: 0 Performance pending work >> RES: 0 Rescheduling interrupts >> CAL: 0 Function call interrupts >> TLB: 0 TLB shootdowns >> TRM: 0 Thermal event interrupts >> THR: 0 Threshold APIC interrupts >> MCE: 0 Machine check exceptions >> MCP: 1 Machine check polls >> ERR: 0 >> MIS: 0 >> >> Its important to note here that we are actually looking at the interrupt >> rate, not the exit rate (which is usually a multiple of the interrupt >> rate, since you have to factor in as many as three exits per interrupt >> (IPI, window, EOI). Therefore we saved about 18k interrupts in this 10 >> second burst, but we may have actually saved up to 54k exits in the >> process. 
This is only over a 10 second window at GigE rates, so YMMV. >> These numbers get even more dramatic on higher end hardware, but I >> haven't had a chance to generate new numbers yet. >> > > (irq window exits should only be required on a small percentage of > interrupt injections, since the guest will try to disable interrupts for > short periods only) Good point. You are probably right. Certainly the other 2 remain, however. Ultimately, the fastest exit is the one you do not take. That is what I am trying to achieve. > >> Looking at some external stats paints an even bleaker picture: "exits" >> as reported by kvm_stat for virtio-pci based virtio-net tip the scales >> at 65k/s vs 36k/s for vbus based venet. And virtio is consuming ~30% of >> my quad-core's cpu, vs 19% for venet during the test. Its hard to know >> which innovation or innovations may be responsible for the entire >> reduction, but certainly the interrupt-to-signal ratio mentioned above >> is probably helping. >> > > Can you please stop comparing userspace-based virtio hosts to > kernel-based venet hosts? We know the userspace implementation sucks. Sorry, but its all I have right now. Last time I tried vhost it required a dedicated adapter which was a non-starter for my lab rig since I share it with others. I didn't want to tear apart the bridge setup, especially since mst told me the performance was worse than userspace. Therefore, there was no real point in working hard to get it running. I figured I would wait till the config and performance issues were resolved and there is a git tree to pull in. > >> The even worse news for 1:1 models is that the ratio of >> exits-per-interrupt climbs with load (exactly when it hurts the most) >> since that is when the probability that the vcpu will need all three >> exits is the highest. >> > > Requiring all three exits means the guest is spending most of its time > with interrupts disabled; that's unlikely. (see "softirqs" above) > > Thanks for the numbers. 
Are those 11% attributable to rx/tx > piggybacking from the same interface? Its hard to tell, since I am not instrumented to discern the difference in this run. I do know from previous traces on the 10GE rig that the chelsio T3 that I am running reaps the pending-tx ring at the same time as a rx polling, so its very likely that both events are often coincident at least there. > > Also, 170K interupts -> 17K interrupts/sec -> 55kbit/interrupt -> > 6.8kB/interrupt. Ignoring interrupt merging and assuming equal rx/tx > distribution, that's about 13kB/interrupt. Seems rather low for a > saturated link. I am not following: Do you suspect that I have too few interrupts to represent 940Mb/s, or that I have too little data/interrupt and this ratio should be improved? > >>> >>>> and priortizable/nestable signals. >>>> >>>> >>> That doesn't belong in a bus. >>> >> Everyone is of course entitled to an opinion, but the industry as a >> whole would disagree with you. Signal path routing (1:1, aggregated, >> etc) is at the discretion of the bus designer. Most buses actually do >> _not_ support 1:1 with IDT (think USB, SCSI, IDE, etc). >> > > With standard PCI, they do not. But all modern host adapters support > MSI and they will happily give you one interrupt per queue. While MSI is a good technological advancement for PCI, I was referring to signal:IDT ratio. MSI would still classify as 1:1. > >> PCI is somewhat of an outlier in that regard afaict. Its actually a >> nice feature of PCI when its used within its design spec (HW). For >> SW/PV, 1:1 suffers from, among other issues, that "triple-exit scaling" >> issue in the signal path I mentioned above. This is one of the many >> reasons I think PCI is not the best choice for PV. >> > > Look at the vmxnet3 submission (recently posted on virtualization@). > It's a perfectly ordinary PCI NIC driver, apart from having so many 'V's > in the code. 16 rx queues, 8 tx queues, 25 MSIs, BARs for the > registers. 
So while the industry as a whole might disagree with me, it > seems VMware does not. At the very least, BARs for the registers is worrisome, but I will reserve judgment until I see the numbers and review the code. > > >>>> http://developer.novell.com/wiki/images/b/b7/31-rc4_throughput.png >>>> >>>> >>> That's a red herring. The problem is not with virtio as an ABI, but >>> with its implementation in userspace. vhost-net should offer equivalent >>> performance to vbus. >>> >> That's pure speculation. I would advise you to reserve such statements >> until after a proper bakeoff can be completed. > > Let's do that then. Please reserve the corresponding comparisons from > your side as well. That is quite the odd request. My graphs are all built using readily available code and open tools and do not speculate as to what someone else may come up with in the future. They reflect what is available today. Do you honestly think I should wait indefinitely for a competing idea to try to catch up before I talk about my results? That's certainly an interesting perspective. With all due respect, the only red-herring is your unsubstantiated claims that my results do not matter. > >> This is not to mention >> that vhost-net does nothing to address our other goals, like scheduler >> coordination and non-802.x fabrics. >> > > What are scheduler coordination and non-802.x fabrics? We are working on real-time, IB and QOS, for examples, in addition to the now well known 802.x venet driver. > >>> Right, when you ignore the points where they don't fit, it's a perfect >>> mesh. >>> >> Where doesn't it fit? >> > > (avoiding infinite loop) I'm serious. Where doesn't it fit? Point me at a URL if its already discussed. 
> >>>>> But that's not a strong argument for vbus; instead of adding vbus you >>>>> could make virtio more friendly to non-virt >>>>> >>>>> >>>> Actually, it _is_ a strong argument then because adding vbus is what >>>> helps makes virtio friendly to non-virt, at least for when performance >>>> matters. >>>> >>>> >>> As vhost-net shows, you can do that without vbus >>> >> Citation please. Afaict, the one use case that we looked at for vhost >> outside of KVM failed to adapt properly, so I do not see how this is >> true. >> > > I think Ira said he can make vhost work? > Not exactly. It kind of works for 802.x only (albeit awkwardly) because there is no strong distinction between "resource" and "consumer" with ethernet. So you can run it inverted without any serious consequences (at least, not from consequences of the inversion). Since the x86 boards are the actual resource providers in his system, other device types will fail to map to the vhost model properly, like disk-io or consoles for instance. >>> and without breaking compatibility. >>> >> Compatibility with what? vhost hasn't even been officially deployed in >> KVM environments afaict, nevermind non-virt. Therefore, how could it >> possibly have compatibility constraints with something non-virt already? >> Citation please. >> > > virtio-net over pci is deployed. Replacing the backend with vhost-net > will require no guest modifications. That _is_ a nice benefit, I agree. I just do not agree its a hard requirement. > Replacing the frontend with venet or virt-net/vbus-pci will require guest modifications. Understood, and I am ok with that. I think its necessary to gain critical performance enhancing features, and I think it will help in the long term to support more guests. I have not yet been proven wrong. > > Obviously virtio-net isn't deployed in non-virt. But if we adopt vbus, > we have to migrate guests. As a first step, lets just shoot for "support" instead of "adopt". 
I'll continue to push patches to you
Otherwise, we >> would just use realtek emulation, which is technically the native PCI >> you are apparently so enamored with. >> > > virtio-net doesn't modify the PCI model. Sure it does. It doesn't use MMIO/PIO bars for registers, it uses vq->kick(). It doesn't use pci-config-space, it uses virtio->features. It doesn't use PCI interrupts, it uses a callback on the vq etc, etc. You would never use raw "registers", as the exit rate would crush you. You would never use raw interrupts, as you need a shared-memory based mitigation scheme. IOW: Virtio has a device model layer that tunnels over PCI. It doesn't actually use PCI directly. This is in fact what allows the linux version to work over lguest, s390 and vbus in addition to PCI. > And if you look at vmxnet3, > they mention that it conforms to somthing called UPT, which allows > hardware vendors to implement parts of their NIC model. So vmxnet3 is > apparently suitable to both hardware and software implementations. > That's interesting and all, but the charter for vbus is for optimal software-to-software interfaces to a linux host, so I don't mind if my spec doesn't look conducive to a hardware implementation. As it turns out, I'm sure it would work there as well, but some of the optimizations wouldn't matter as much since hardware behaves differently. >> Not to mention there are things you just plain can't do in PCI today, >> like dynamically assign signal-paths, > > You can have dynamic MSI/queue routing with virtio, and each MSI can be > routed to a vcpu at will. Can you arbitrarily create a new MSI/queue on a per-device basis on the fly? We want to do this for some upcoming designs. Or do you need to predeclare the vectors when the device is hot-added? > >> priority, and coalescing, etc. >> > > Do you mean interrupt priority? Well, apic allows interrupt priorities > and Windows uses them; Linux doesn't. I don't see a reason to provide > more than native hardware. 
The APIC model is not optimal for PV given the exits required for a basic operation like an interrupt injection, and has scaling/flexibility issues with its 16:16 priority mapping. OTOH, you don't necessarily want to rip it out because of all the additional features it has like the IPI facility and the handling of many low-performance data-paths. Therefore, I am of the opinion that the optimal placement for advanced signal handling is directly at the bus that provides the high-performance resources. I could be convinced otherwise with a compelling argument, but I think this is the path of least resistance. > >>> Since >>> practically everyone (including Xen) does their paravirt drivers atop >>> pci, the claim that pci isn't suitable for high performance is >>> incorrect. >>> >> Actually IIUC, I think Xen bridges to their own bus as well (and only >> where they have to), just like vbus. They don't use PCI natively. PCI >> is perfectly suited as a bridge transport for PV, as I think the Xen and >> vbus examples have demonstrated. Its the 1:1 device-model where PCI has >> the most problems. >> > > N:1 breaks down on large guests since one vcpu will have to process all > events. Well, first of all that is not necessarily true. Some high performance buses like SCSI and FC work fine with an aggregated model, so its not a foregone conclusion that aggregation kills SMP IO performance. This is especially true when you adding coalescing on top, like AlacrityVM does. I do agree that other subsystems, like networking for instance, may sometimes benefit from flexible signal-routing because of multiqueue, etc, for particularly large guests. However, the decision to make the current kvm-connector used in AlacrityVM aggregate one priority FIFO per IRQ was an intentional design tradeoff. My experience with my target user base is that these data-centers are typically deploying 1-4 vcpu guests, so I optimized for that. 
YMMV, so we can design a different connector, or a different mode of the existing connector, to accommodate large guests as well if that was something desirable. > You could do N:M, with commands to change routings, but where's > your userspace interface? Well, we should be able to add that when/if its needed. I just don't think the need is there yet. KVM tops out at 16 IIUC anyway. > you can't tell from /proc/interrupts which > vbus interupts are active This should be trivial to add some kind of *fs display. I will fix this shortly. > and irqbalance can't steer them towards less > busy cpus since they're invisible to the interrupt controller. (see N:M above) > > >>>> And lastly, why would you _need_ to use the so called "native" >>>> mechanism? The short answer is, "you don't". Any given system (guest >>>> or bare-metal) already have a wide-range of buses (try running "tree >>>> /sys/bus" in Linux). More importantly, the concept of adding new buses >>>> is widely supported in both the Windows and Linux driver model (and >>>> probably any other guest-type that matters). Therefore, despite claims >>>> to the contrary, its not hard or even unusual to add a new bus to the >>>> mix. >>>> >>>> >>> The short answer is "compatibility". >>> >> There was a point in time where the same could be said for virtio-pci >> based drivers vs realtek and e1000, so that argument is demonstrably >> silly. No one tried to make virtio work in a binary compatible way with >> realtek emulation, yet we all survived the requirement for loading a >> virtio driver to my knowledge. >> > > The larger your installed base, the more difficult it is. Of course > it's doable, but I prefer not doing it and instead improving things in a > binary backwards compatible manner. If there is no choice we will bow > to the inevitable and make our users upgrade. But at this point there > is a choice, and I prefer to stick with vhost-net until it is proven > that it won't work. Fair enough. 
But note you are likely going to need to respin your existing drivers anyway to gain peak performance, since there are known shortcomings in the virtio-pci ABI today (like queue identification in the interrupt hotpath) as it stands. So that pain is coming one way or the other. > >> The bottom line is: Binary device compatibility is not required in any >> other system (as long as you follow sensible versioning/id rules), so >> why is KVM considered special? >> > > One of the benefits of virtualization is that the guest model is > stable. You can live-migrate guests and upgrade the hardware > underneath. You can have a single guest image that you clone to > provision new guests. If you switch to a new model, you give up those > benefits, or you support both models indefinitely. I understand what you are saying, but I don't buy it. If you add a new feature to an existing model even without something as drastic as a new bus, you still have the same exact dilemma: The migration target needs feature parity with consumed features in the guest. Its really the same no matter what unless you never add guest-visible features. > > Note even hardware nowadays is binary compatible. One e1000 driver > supports a ton of different cards, and I think (not sure) newer cards > will work with older drivers, just without all their features. Noted, but that is not really the same thing. Thats more like adding a feature bit to virtio, not replacing GigE with 10GE. > >> The fact is, it isn't special (at least not in this regard). What _is_ >> required is "support" and we fully intend to support these proposed >> components. I assure you that at least the users that care about >> maximum performance will not generally mind loading a driver. Most of >> them would have to anyway if they want to get beyond realtek emulation. >> > > For a new install, sure. I'm talking about existing deployments (and > those that will exist by the time vbus is ready for roll out). 
The user will either specify "-net nic,model=venet", or they won't. Its their choice. Changing those parameters, vbus or otherwise, has ramifications w.r.t. what drivers must be loaded, and the user will understand this. > >> I am certainly in no position to tell you how to feel, but this >> declaration would seem from my perspective to be more of a means to an >> end than a legitimate concern. Otherwise we would never have had virtio >> support in the first place, since it was not "compatible" with previous >> releases. >> > > virtio was certainly not pain free, needing Windows drivers, updates to > management tools (you can't enable it by default, so you have to offer > it as a choice), mkinitrd, etc. I'd rather not have to go through that > again. No general argument here, other than to reiterate that the driver is going to have to be redeployed anyway, since it will likely need new feature bits to fix the current ABI. > >>> Especially if the device changed is your boot disk. >>> >> If and when that becomes a priority concern, that would be a function >> transparently supported in the BIOS shipped with the hypervisor, and >> would thus be invisible to the user. >> > > No, you have to update the driver in your initrd (for Linux) Thats fine, the distros generally do this automatically when you load the updated KMP package. > or properly install the new driver (for Windows). It's especially > difficult for Windows. What is difficult here? I never seem to have any problems and I have all kinds of guests from XP to Win7. >>> You may not care about the pain caused to users, but I do, so I will >>> continue to insist on compatibility. >>> >> For the users that don't care about maximum performance, there is no >> change (and thus zero pain) required. They can use realtek or virtio if >> they really want to. Neither is going away to my knowledge, and lets >> face it: 2.6Gb/s out of virtio to userspace isn't *that* bad. 
But "good >> enough" isn't good enough, and I won't rest till we get to native >> performance. > > I don't want to support both virtio and vbus in parallel. There's > enough work already. Until I find some compelling reason that indicates I was wrong about all of this, I will continue building a community around the vbus code base and developing support for its components anyway. So that effort is going to happen in parallel regardless. This is purely a question about whether you will work with me to make vbus an available option in upstream KVM or not. > If we adopt vbus, we'll have to deprecate and eventually kill off virtio. Thats more hyperbole. virtio is technically fine and complementary as it is. No one says you have to do anything drastic w.r.t. virtio. If you _did_ adopt vbus, perhaps you would want to optionally deprecate vhost or possibly the virtio-pci adapter, but that is about it. The rest of the infrastructure should be preserved if it was designed properly. > >> 2) True pain to users is not caused by lack of binary compatibility. >> Its caused by lack of support. And its a good thing or we would all be >> emulating 8086 architecture forever... >> >> ..oh wait, I guess we kind of do that already ;). But at least we can >> slip in something more advanced once in a while (APIC vs PIC, USB vs >> uart, iso9660 vs floppy, for instance) and update the guest stack >> instead of insisting it must look like ISA forever for compatibility's >> sake. >> > > PCI is continuously updated, with MSI, MSI-X, and IOMMU support being > some recent updates. I'd like to ride on top of that instead of having > to clone it for every guest I support. While a noble goal, one of the points I keep making though, as someone who has built the stack both ways, is almost none of the PCI stack is actually needed to get the PV job done. The part you do need is primarily a function of the generic OS stack and trivial to interface with anyway. 
Plus, as a lesser point: it doesn't work everywhere so you end up solving the same kind of vbus-like design problem again and again when PCI is missing. > >>> So we have: vbus needs a connector, vhost needs a connector. vbus >>> doesn't need userspace to program the addresses (but does need userspace >>> to instantiate the devices and to program the bus address decode) >>> >> First of all, bus-decode is substantially easier than per-device decode >> (you have to track all those per-device/per-signal fds somewhere, >> integrate with hotswap, etc), and its only done once per guest at >> startup and left alone. So its already not apples to apples. >> > > Right, it means you can hand off those eventfds to other qemus or other > pure userspace servers. It's more flexible. > >> Second, while its true that the general kvm-connector bus-decode needs >> to be programmed, that is a function of adapting to the environment >> that _you_ created for me. The original kvm-connector was discovered >> via cpuid and hypercalls, and didn't need userspace at all to set it up. >> Therefore it would be entirely unfair of you to turn around and somehow >> try to use that trait of the design against me since you yourself >> imposed it. >> > > No kvm feature will ever be exposed to a guest without userspace > intervention. It's a basic requirement. If it causes complexity (and > it does) we have to live with it. Right. cpuid is exposed by userspace, so that was the control point in the original design. The presence of the PCI_BRIDGE in the new code (again exported by userspace) is what controls it now. From there, there are various mechanisms we employ to control that features the guest may see, such as the sysfs/attribute system, the revision of the bridge, and the feature bits that it and its subordinate devices expose. > >>> Does it work on Windows? >>> >> This question doesn't make sense. Hotswap control occurs on the host, >> which is always Linux. 
>> >> If you were asking about whether a windows guest will support hotswap: >> the answer is "yes". Our windows driver presents a unique PDO/FDO pair >> for each logical device instance that is pushed out (just like the built >> in usb, pci, scsi bus drivers that windows supports natively). >> > > Ah, you have a Windows venet driver? Almost. It's WIP, but hopefully soon, along with core support for the bus, etc. > > >>>> As an added bonus, its device-model is modular. A developer can >>>> write a >>>> new device model, compile it, insmod it to the host kernel, hotplug it >>>> to the running guest with mkdir/ln, and the come back out again >>>> (hotunplug with rmdir, rmmod, etc). They may do this all without >>>> taking >>>> the guest down, and while eating QEMU based IO solutions for breakfast >>>> performance wise. >>>> >>>> Afaict, qemu can't do either of those things. >>>> >>>> >>> We've seen that herring before, >>> >> Citation? >> > > It's the compare venet-in-kernel to virtio-in-userspace thing again. No, you said KVM has "userspace hotplug". I retorted that vbus not only has hotplug, it also has a modular architecture. You then countered that this feature is a red-herring. If this was previously discussed and rejected for some reason, I would like to know the history. Or did I misunderstand you? Or if you are somehow implying that the lack of modularity has to do with virtio-in-userspace, I beg to differ. Even with vhost, you still have to have a paired model in qemu, so it will not be a modular architecture by virtue of the vhost patch series either. You would need qemu to support modular devices as well, which I've been told that isn't going to happen any time soon. > Let's defer that until mst complete vhost-net mergable buffers, it which > time we can compare vhost-net to venet and see how much vbus contributes > to performance and how much of it comes from being in-kernel. I look forward to it. > >>>>> Refactor instead of duplicating. 
>>>>> >>>>> >>>> There is no duplicating. vbus has no equivalent today as virtio >>>> doesn't >>>> define these layers. >>>> >>>> >>> So define them if they're missing. >>> >> I just did. >> > > Since this is getting confusing to me, I'll start from scratch looking > at the vbus layers, top to bottom: I wouldn't describe it like this > > Guest side: > 1. venet guest kernel driver - AFAICT, duplicates the virtio-net guest > driver functionality > 2. vbus guest driver (config and hotplug) - duplicates pci, or if you > need non-pci support, virtio config and its pci bindings; needs > reimplementation for all supported guests > 3. vbus guest driver (interrupt coalescing, priority) - if needed, > should be implemented as an irqchip (and be totally orthogonal to the > driver); needs reimplementation for all supported guests > 4. vbus guest driver (shm/ioq) - finder grained layering than virtio > (which only supports the combination, due to the need for Xen support); > can be retrofitted to virtio at some cost > > Host side: > 1. venet host kernel driver - is duplicated by vhost-net; doesn't > support live migration, unprivileged users, or slirp > 2. vbus host driver (config and hotplug) - duplicates pci support in > userspace (which will need to be kept in any case); already has two > userspace interfaces > 3. vbus host driver (interrupt coalescing, priority) - if we think we > need it (and I don't), should be part of kvm core, not a bus > 4. vbus host driver (shm) - partially duplicated by vhost memory slots > 5. vbus host driver (ioq) - duplicates userspace virtio, duplicated by > vhost For one, we have the common layer of shm-signal, and IOQ. These libraries were designed to be reused on both sides of the link. Generally shm-signal has no counterpart in the existing model, though its functionality is integrated into the virtqueue. IOQ is duplicated by virtqueue, but I think its a better design at least in this role, so I use it pervasively throughout the stack. 
We can discuss that in a separate thread. From there, going down the stack, it looks like (guest-side) |------------------------- | venet (competes with virtio-net) |------------------------- | vbus-proxy (competes with pci-bus, config+hotplug, sync/async) |------------------------- | vbus-pcibridge (interrupt coalescing + priority, fastpath) |------------------------- | |------------------------- | vbus-kvmconnector (interrupt coalescing + priority, fast-path) |------------------------- | vbus-core (hotplug, address decoding, etc) |------------------------- | venet-device (ioq frame/deframe to tap/macvlan/vmdq, etc) |------------------------- If you want to use virtio, insert a virtio layer between the "driver" and "device" components at the outer edges of the stack. > >>>> There is no rewriting. vbus has no equivalent today as virtio doesn't >>>> define these layers. >>>> >>>> By your own admission, you said if you wanted that capability, use a >>>> library. What I think you are not understanding is vbus _is_ that >>>> library. So what is the problem, exactly? >>>> >>>> >>> It's not compatible. >>> >> No, that is incorrect. What you are apparently not understanding is >> that not only is vbus that library, but its extensible. So even if >> compatibility is your goal (it doesn't need to be IMO) it can be >> accommodated by how you interface to the library. >> > > To me, compatible means I can live migrate an image to a new system > without the user knowing about the change. You'll be able to do that > with vhost-net. As soon as you add any new guest-visible feature, you are in the same exact boat. > >>>>> >>>>> >>>> No, it does not. vbus just needs a relatively simple single message >>>> pipe between the guest and host (think "hypercall tunnel", if you >>>> will). >>>> >>>> >>> That's ioeventfd. So far so similar. >>> >> No, that is incorrect. For one, vhost uses them on a per-signal path >> basis, whereas vbus only has one channel for the entire guest->host. 
>> > > You'll probably need to change that as you start running smp guests. The hypercall channel is already SMP optimized over a single PIO path, so I think we are covered there. See "fastcall" in my code for details: http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=drivers/vbus/pci-bridge.c;h=81f7cdd2167ae2f53406850ebac448a2183842f2;hb=fd1c156be7735f8b259579f18268a756beccfc96#l102 It just passes the cpuid into the PIO write so we can have parallel, lockless "hypercalls". This forms the basis of our guest scheduler support, for instance. > >> Second, I do not use ioeventfd anymore because it has too many problems >> with the surrounding technology. However, that is a topic for a >> different thread. >> > > Please post your issues. I see ioeventfd/irqfd as critical kvm interfaces. Will do. It would be nice to come back to this interface. > >>> vbus devices aren't magically instantiated. Userspace needs to >>> instantiate them too. Sure, there's less work on the host side since >>> you're using vbus instead of the native interface, but more work on the >>> guest side since you're using vbus instead of the native interface. >>> >> >> No, that is incorrect. The amount of "work" that a guest does is >> actually the same in both cases, since the guest OS peforms the hotswap >> handling natively for all bus types (at least for Linux and Windows). >> You still need to have a PV layer to interface with those objects in >> both cases, as well, so there is no such thing as "native interface" for >> PV. Its only a matter of where it occurs in the stack. >> > > I'm missing something. Where's the pv layer for virtio-net? covered above > > Linux drivers have an abstraction layer to deal with non-pci. But the > Windows drivers are ordinary pci drivers with nothing that looks > pv-ish. They certainly do not have to be since Windows supports a similar notion as the LDM in Linux. In fact, we are exploiting that Windows facility in our drivers. 
It's rather unfortunate if its true that your drivers were not designed this way, since virtio has a rather nice stack model on Linux that could work in Windows as well. > You could implement virtio-net hardware if you wanted to. Technically you could build vbus in hardware too, I suppose, since the bridge is PCI compliant. I would never advocate it, however, since many of our tricks do not matter if its real hardware (e.g. they are optimized for the costs associated with VM). > >>> non-privileged-user capable? >>> >> The short answer is "not yet (I think)". I need to write a patch to >> properly set the mode attribute in sysfs, but I think this will be >> trivial. >> >> > > (and selinux label) If any of these things that are problems, they can simply be exposed via the new ioctl admin interface, I suppose. > >>> Ah, so you have two control planes. >>> >> So what? If anything, it goes to show how extensible the framework is >> that a new plane could be added in 119 lines of code: >> >> ~/git/linux-2.6> stg show vbus-add-admin-ioctls.patch | diffstat >> Makefile | 3 - >> config-ioctl.c | 117 >> +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ >> 2 files changed, 119 insertions(+), 1 deletion(-) >> >> if and when having two control planes exceeds its utility, I will submit >> a simple patch that removes the useless one. >> > > It always begins with a 119-line patch and then grows, that's life. > I can't argue with that. >>> kvm didn't have an existing counterpart in Linux when it was >>> proposed/merged. >>> >> And likewise, neither does vbus. >> >> > > For virt uses, I don't see the need. For non-virt, I have no opinion. > > Well, I hope to change your mind on both counts, then. Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-10-01 19:24 ` Gregory Haskins @ 2009-10-03 10:00 ` Avi Kivity 0 siblings, 0 replies; 83+ messages in thread From: Avi Kivity @ 2009-10-03 10:00 UTC (permalink / raw) To: Gregory Haskins Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 10/01/2009 09:24 PM, Gregory Haskins wrote: > >> Virtualization is about not doing that. Sometimes it's necessary (when >> you have made unfixable design mistakes), but just to replace a bus, >> with no advantages to the guest that has to be changed (other >> hypervisors or hypervisorless deployment scenarios aren't). >> > The problem is that your continued assertion that there is no advantage > to the guest is a completely unsubstantiated claim. As it stands right > now, I have a public git tree that, to my knowledge, is the fastest KVM > PV networking implementation around. It also has capabilities that are > demonstrably not found elsewhere, such as the ability to render generic > shared-memory interconnects (scheduling, timers), interrupt-priority > (qos), and interrupt-coalescing (exit-ratio reduction). I designed each > of these capabilities after carefully analyzing where KVM was coming up > short. > > Those are facts. > > I can't easily prove which of my new features alone are what makes it > special per se, because I don't have unit tests for each part that > breaks it down. What I _can_ state is that its the fastest and most > feature rich KVM-PV tree that I am aware of, and others may download and > test it themselves to verify my claims. > If you wish to introduce a feature which has downsides (and to me, vbus has downsides) then you must prove it is necessary on its own merits. venet is pretty cool but I need proof before I believe its performance is due to vbus and not to venet-host. 
> The disproof, on the other hand, would be in a counter example that > still meets all the performance and feature criteria under all the same > conditions while maintaining the existing ABI. To my knowledge, this > doesn't exist. > mst is working on it and we should have it soon. > Therefore, if you believe my work is irrelevant, show me a git tree that > accomplishes the same feats in a binary compatible way, and I'll rethink > my position. Until then, complaining about lack of binary compatibility > is pointless since it is not an insurmountable proposition, and the one > and only available solution declares it a required casualty. > Fine, let's defer it until vhost-net is up and running. >> Well, Xen requires pre-translation (since the guest has to give the host >> (which is just another guest) permissions to access the data). >> > Actually I am not sure that it does require pre-translation. You might > be able to use the memctx->copy_to/copy_from scheme in post translation > as well, since those would be able to communicate to something like the > xen kernel. But I suppose either method would result in extra exits, so > there is no distinct benefit using vbus there..as you say below "they're > just different". > > The biggest difference is that my proposed model gets around the notion > that the entire guest address space can be represented by an arbitrary > pointer. For instance, the copy_to/copy_from routines take a GPA, but > may use something indirect like a DMA controller to access that GPA. On > the other hand, virtio fully expects a viable pointer to come out of the > interface iiuc. This is in part what makes vbus more adaptable to non-virt. > No, virtio doesn't expect a pointer (this is what makes Xen possible). vhost does; but nothing prevents an interested party from adapting it. 
>>> An interesting thing here is that you don't even need a fancy >>> multi-homed setup to see the effects of my exit-ratio reduction work: >>> even single port configurations suffer from the phenomenon since many >>> devices have multiple signal-flows (e.g. network adapters tend to have >>> at least 3 flows: rx-ready, tx-complete, and control-events (link-state, >>> etc). Whats worse, is that the flows often are indirectly related (for >>> instance, many host adapters will free tx skbs during rx operations, so >>> you tend to get bursts of tx-completes at the same time as rx-ready. If >>> the flows map 1:1 with IDT, they will suffer the same problem. >>> >>> >> You can simply use the same vector for both rx and tx and poll both at >> every interrupt. >> > Yes, but that has its own problems: e.g. additional exits or at least > additional overhead figuring out what happens each time. If you're just coalescing tx and rx, it's an additional memory read (which you have anyway in the vbus interrupt queue). > This is even > more important as we scale out to MQ which may have dozens of queue > pairs. You really want finer grained signal-path decode if you want > peak performance. > MQ definitely wants per-queue or per-queue-pair vectors, and it definitely doesn't want all interrupts to be serviced by a single interrupt queue (you could/should make the queue per-vcpu). >>> Its important to note here that we are actually looking at the interrupt >>> rate, not the exit rate (which is usually a multiple of the interrupt >>> rate, since you have to factor in as many as three exits per interrupt >>> (IPI, window, EOI). Therefore we saved about 18k interrupts in this 10 >>> second burst, but we may have actually saved up to 54k exits in the >>> process. This is only over a 10 second window at GigE rates, so YMMV. >>> These numbers get even more dramatic on higher end hardware, but I >>> haven't had a chance to generate new numbers yet. 
>>> >>> >> (irq window exits should only be required on a small percentage of >> interrupt injections, since the guest will try to disable interrupts for >> short periods only) >> > Good point. You are probably right. Certainly the other 2 remain, however. > > You can easily eliminate most of the EOI exits by patching ack_APIC_irq() to do the following: if (atomic_inc_return(&vapic->eoi_count) < 0) null_hypercall(); Where eoi_count is a per-cpu shared counter that indicates how many EOIs were performed by the guest, with the sign bit a signal from the hypervisor that an lower-priority interrupt is pending. We do something similar for the TPR, which is heavily exercised by Windows XP. Note that svm provides a mechanism to queue interrupts without requiring the interrupt window; we don't use it in kvm (primarily because only a small fraction of injections would benefit). > Ultimately, the fastest exit is the one you do not take. That is what I > am trying to achieve. > The problem is that all those paravirtualizations bring their own problems and are quickly obsoleted by hardware advances. Intel and AMD also see what you're seeing. Sure, it takes hardware a long time to propagate to the field, but the same holds for software. > >>> The even worse news for 1:1 models is that the ratio of >>> exits-per-interrupt climbs with load (exactly when it hurts the most) >>> since that is when the probability that the vcpu will need all three >>> exits is the highest. >>> >>> >> Requiring all three exits means the guest is spending most of its time >> with interrupts disabled; that's unlikely. >> > (see "softirqs" above) > There are no softirqs above, please clarify. >> Thanks for the numbers. Are those 11% attributable to rx/tx >> piggybacking from the same interface? >> > Its hard to tell, since I am not instrumented to discern the difference > in this run. 
I do know from previous traces on the 10GE rig that the > chelsio T3 that I am running reaps the pending-tx ring at the same time > as a rx polling, so it's very likely that both events are often > coincident at least there. > I assume you had only two active interrupts? In that case, tx and rx mitigation should have prevented the same interrupt from coalescing with itself, so that leaves rx/tx coalescing as the only option? >> Also, 170K interrupts -> 17K interrupts/sec -> 55kbit/interrupt -> >> 6.8kB/interrupt. Ignoring interrupt merging and assuming equal rx/tx >> distribution, that's about 13kB/interrupt. Seems rather low for a >> saturated link. >> > I am not following: Do you suspect that I have too few interrupts to > represent 940Mb/s, or that I have too little data/interrupt and this > ratio should be improved? > Too few bits/interrupt. With tso, your "packets" should be 64KB at least, and you should expect multiple packets per tx interrupt. Maybe these are all acks? >>>> >>>> >>> Everyone is of course entitled to an opinion, but the industry as a >>> whole would disagree with you. Signal path routing (1:1, aggregated, >>> etc) is at the discretion of the bus designer. Most buses actually do >>> _not_ support 1:1 with IDT (think USB, SCSI, IDE, etc). >>> >>> >> With standard PCI, they do not. But all modern host adapters support >> MSI and they will happily give you one interrupt per queue. >> > While MSI is a good technological advancement for PCI, I was referring > to signal:IDT ratio. MSI would still classify as 1:1. > I meant, a multiqueue SCSI or network adapter is not N:1 but N:M since each queue would get its own interrupt. So it looks like modern cards try to disaggregate, not aggregate. Previously, non-MSI PCI forced them to aggregate by providing a small number of irq pins. >> Let's do that then. Please reserve the corresponding comparisons from >> your side as well. >> > That is quite the odd request.
My graphs are all built using readily > available code and open tools and do not speculate as to what someone > else may come up with in the future. They reflect what is available > today. Do you honestly think I should wait indefinitely for a competing > idea to try to catch up before I talk about my results? That's > certainly an interesting perspective. > Your results are excellent and I'm not asking you to hide them. But you can't compare (more) complete code to incomplete code and state that this proves you are right, or use results for an entire stack as proof that one component is what made it possible. > With all due respect, the only red-herring is your unsubstantiated > claims that my results do not matter. > My claim is that your results are mostly due to venet-host. I don't have a proof but you don't have a counterproof. That is why I ask you to wait for vhost-net, it will give us more data so we can see what's what. >>> This is not to mention >>> that vhost-net does nothing to address our other goals, like scheduler >>> coordination and non-802.x fabrics. >>> >>> >> What are scheduler coordination and non-802.x fabrics? >> > We are working on real-time, IB and QOS, for examples, in addition to > the now well known 802.x venet driver. > Won't QoS require a departure from aggregated interrupts? Suppose a low priority interrupt arrives and the guest starts processing, then a high priority interrupt. Don't you need a real (IDT) interrupt to make the guest process the high-priority event? >>>> Right, when you ignore the points where they don't fit, it's a perfect >>>> mesh. >>>> >>>> >>> Where doesn't it fit? >>> >>> >> (avoiding infinite loop) >> > I'm serious. Where doesn't it fit? Point me at a URL if its already > discussed. > Sorry, I lost the context; also my original comment wasn't very constructive, consider it retracted. >>> Citation please.
Afaict, the one use case that we looked at for vhost >>> outside of KVM failed to adapt properly, so I do not see how this is >>> true. >>> >>> >> I think Ira said he can make vhost work? >> >> > Not exactly. It kind of works for 802.x only (albeit awkwardly) because > there is no strong distinction between "resource" and "consumer" with > ethernet. So you can run it inverted without any serious consequences > (at least, not from consequences of the inversion). Since the x86 > boards are the actual resource providers in his system, other device > types will fail to map to the vhost model properly, like disk-io or > consoles for instance. > In that case vhost will have to be adapted or they will have to use something else. >> virtio-net over pci is deployed. Replacing the backend with vhost-net >> will require no guest modifications. >> > That _is_ a nice benefit, I agree. I just do not agree its a hard > requirement. > Consider a cloud where the hypervisor is updated without the knowledge of the guest admins. Either we break the guests and require the guest admins to login (without networking) to upgrade their drivers during production and then look for a new cloud, or we maintain both device models and ask the guest admins to upgrade their drivers so we can drop support for the old device, a request which they will rightly ignore. >> Obviously virtio-net isn't deployed in non-virt. But if we adopt vbus, >> we have to migrate guests. >> > As a first step, lets just shoot for "support" instead of "adopt". > "support" means eventually "adopt", it isn't viable to maintain two models in parallel. > Ill continue to push patches to you that help interfacing with the guest > in a vbus neutral way (like irqfd/ioeventfd) and we can go from there. > Are you open to this work assuming it passes normal review cycles, etc? > It would presumably be of use to others that want to interface to a > guest (e.g. vhost) as well. 
> Neutral interfaces are great, and I've already received feedback from third parties that they ought to work well for their uses. I don't really like xinterface since I think it's too intrusive locking wise, especially when there's currently churn in kvm memory management. But feel free to post your ideas or patches, maybe we can work something out. >>> And once those events are fed, you still need a >>> PV layer to actually handle the bus interface in a high-performance >>> manner so its not like you really have a "native" stack in either case. >>> >>> >> virtio-net doesn't use any pv layer. >> > Well, it does when you really look closely at how it works. For one, it > has the virtqueues library that would be (or at least _should be_) > common for all virtio-X adapters, etc etc. Even if this layer is > collapsed into each driver on the Windows platform, its still there > nonetheless. > By "pv layer" I meant something that is visible along the guest/host interface. virtio devices are completely independent from one another and (using virtio-pci) only talk through interfaces exposed by the relevant card. If you wanted to, you could implement a virtio-pci card in silicon. Practically the only difference between ordinary NICs and virtio-net is that interrupt status and enable/disable are stored in memory instead of NIC registers, but a real NIC could have done it the virtio way. >>>> that doesn't need to be retrofitted. >>>> >>>> >>> No, that is incorrect. You have to heavily modify the pci model with >>> layers on top to get any kind of performance out of it. Otherwise, we >>> would just use realtek emulation, which is technically the native PCI >>> you are apparently so enamored with. >>> >>> >> virtio-net doesn't modify the PCI model. >> > Sure it does. It doesn't use MMIO/PIO bars for registers, it uses > vq->kick(). Which translates to a BAR register. > It doesn't use pci-config-space, it uses virtio->features. > Which translates to a BAR. 
> It doesn't use PCI interrupts, it uses a callback on the vq etc, etc. > You would never use raw "registers", as the exit rate would crush you. > You would never use raw interrupts, as you need a shared-memory based > mitigation scheme. > > IOW: Virtio has a device model layer that tunnels over PCI. It doesn't > actually use PCI directly. This is in fact what allows the linux > version to work over lguest, s390 and vbus in addition to PCI. > That's just a nice way to reuse the driver across multiple busses. Kind of like isa/pci drivers that might even still exist in the source tree. On x86, virtio doesn't bypass PCI, just adds a layer above it. >> You can have dynamic MSI/queue routing with virtio, and each MSI can be >> routed to a vcpu at will. >> > Can you arbitrarily create a new MSI/queue on a per-device basis on the > fly? We want to do this for some upcoming designs. Or do you need to > predeclare the vectors when the device is hot-added? > You need to predeclare the number of vectors, but queue/interrupt assignment is runtime. >>> priority, and coalescing, etc. >>> >>> >> Do you mean interrupt priority? Well, apic allows interrupt priorities >> and Windows uses them; Linux doesn't. I don't see a reason to provide >> more than native hardware. >> > The APIC model is not optimal for PV given the exits required for a > basic operation like an interrupt injection, and has scaling/flexibility > issues with its 16:16 priority mapping. > > OTOH, you don't necessarily want to rip it out because of all the > additional features it has like the IPI facility and the handling of > many low-performance data-paths. Therefore, I am of the opinion that > the optimal placement for advanced signal handling is directly at the > bus that provides the high-performance resources. I could be convinced > otherwise with a compelling argument, but I think this is the path of > least resistance. 
> With EOI PV you can reduce the cost of interrupt injection to slightly more than one exit/interrupt. vbus might reduce it to slightly less than one exit/interrupt. wrt priority, if you have 12 or fewer realtime interrupt sources you can map them to available priorities. If you have more then you take extra interrupts, but at a ratio of 12:1 (so 24 realtime interrupts mean you may take a single extra exit). The advantages of this is that all interrupts (not just vbus) are prioritized, and bare metal benefits as well. If 12 is too low for you, pressure Intel to increase the TPR to 8 r/w bits, too bad they missed a chance with x2apic (which btw reduces the apic exit costs significantly). >> N:1 breaks down on large guests since one vcpu will have to process all >> events. >> > Well, first of all that is not necessarily true. Some high performance > buses like SCSI and FC work fine with an aggregated model, so its not a > foregone conclusion that aggregation kills SMP IO performance. This is > especially true when you adding coalescing on top, like AlacrityVM does. > Nevertheless, the high performance adaptors provide multiqueue and MSI; one of the reasons is to distribute processing. > I do agree that other subsystems, like networking for instance, may > sometimes benefit from flexible signal-routing because of multiqueue, > etc, for particularly large guests. However, the decision to make the > current kvm-connector used in AlacrityVM aggregate one priority FIFO per > IRQ was an intentional design tradeoff. My experience with my target > user base is that these data-centers are typically deploying 1-4 vcpu > guests, so I optimized for that. YMMV, so we can design a different > connector, or a different mode of the existing connector, to accommodate > large guests as well if that was something desirable. > > >> You could do N:M, with commands to change routings, but where's >> your userspace interface? >> > Well, we should be able to add that when/if its needed. 
I just don't > think the need is there yet. KVM tops out at 16 IIUC anyway. > My feeling is that 16 will definitely need multiqueue, and perhaps even 4. (we can probably up the 16, certainly with Marcelo's srcu work). >> you can't tell from /proc/interrupts which >> vbus interupts are active >> > This should be trivial to add some kind of *fs display. I will fix this > shortly. > And update irqbalance and other tools? What about the Windows equivalent? What happens when (say) Linux learns to migrate interrupts to where they're actually used? This should really be done at the irqchip level, but before that, we need to be 100% certain it's worthwhile. >> The larger your installed base, the more difficult it is. Of course >> it's doable, but I prefer not doing it and instead improving things in a >> binary backwards compatible manner. If there is no choice we will bow >> to the inevitable and make our users upgrade. But at this point there >> is a choice, and I prefer to stick with vhost-net until it is proven >> that it won't work. >> > Fair enough. But note you are likely going to need to respin your > existing drivers anyway to gain peak performance, since there are known > shortcomings in the virtio-pci ABI today (like queue identification in > the interrupt hotpath) as it stands. So that pain is coming one way or > the other. > We'll update the drivers but we won't require users to update. The majority will not notice an upgrade; those who are interested in getting more performance will update their drivers (at their own schedule). >> One of the benefits of virtualization is that the guest model is >> stable. You can live-migrate guests and upgrade the hardware >> underneath. You can have a single guest image that you clone to >> provision new guests. If you switch to a new model, you give up those >> benefits, or you support both models indefinitely. >> > I understand what you are saying, but I don't buy it. 
If you add a new > feature to an existing model even without something as drastic as a new > bus, you still have the same exact dilemma: The migration target needs > feature parity with consumed features in the guest. It's really the same > no matter what unless you never add guest-visible features. > When you upgrade your data center, you start upgrading your hypervisors (one by one, with live migration making it transparent) and certainly not exposing new features to running guests. Once you are done you can expose the new features, and guests which are interested in them can upgrade their drivers and see them. >> Note even hardware nowadays is binary compatible. One e1000 driver >> supports a ton of different cards, and I think (not sure) newer cards >> will work with older drivers, just without all their features. >> > Noted, but that is not really the same thing. That's more like adding a > feature bit to virtio, not replacing GigE with 10GE. > Right, and that's what virtio-net changes look like. >>> If and when that becomes a priority concern, that would be a function >>> transparently supported in the BIOS shipped with the hypervisor, and >>> would thus be invisible to the user. >>> >>> >> No, you have to update the driver in your initrd (for Linux) >> > That's fine, the distros generally do this automatically when you load > the updated KMP package. > So it's not invisible to the user. You update your hypervisor and now need to tell your users to add the new driver to their initrd and reboot. They're not going to like you. >> or properly install the new driver (for Windows). It's especially >> difficult for Windows. >> > What is difficult here? I never seem to have any problems and I have > all kinds of guests from XP to Win7. > If you accidentally reboot before you install the new driver, you won't boot; and there are issues with loading a new driver without the hardware present (not sure what exactly).
>> I don't want to support both virtio and vbus in parallel. There's >> enough work already. >> > Until I find some compelling reason that indicates I was wrong about all > of this, I will continue building a community around the vbus code base > and developing support for its components anyway. So that effort is > going to happen in parallel regardless. > > This is purely a question about whether you will work with me to make > vbus an available option in upstream KVM or not. > Without xinterface there's no need for vbus support in kvm, so nothing's blocking you there. I'm open to extending the host-side kvm interfaces to improve kernel integration. However I still think vbus is the wrong design and shouldn't be merged. >> If we adopt vbus, we'll have to deprecate and eventually kill off virtio. >> > Thats more hyperbole. virtio is technically fine and complementary as > it is. No one says you have to do anything drastic w.r.t. virtio. If > you _did_ adopt vbus, perhaps you would want to optionally deprecate > vhost or possibly the virtio-pci adapter, but that is about it. The > rest of the infrastructure should be preserved if it was designed properly. > virtio-pci is what makes existing guests work (and vhost-net will certainly need to be killed off). But I really don't see the point of layering virtio on top of vbus. >> PCI is continuously updated, with MSI, MSI-X, and IOMMU support being >> some recent updates. I'd like to ride on top of that instead of having >> to clone it for every guest I support. >> > While a noble goal, one of the points I keep making though, as someone > who has built the stack both ways, is almost none of the PCI stack is > actually needed to get the PV job done. The part you do need is > primarily a function of the generic OS stack and trivial to interface > with anyway. > PCI doesn't stand in the way of pv, and allows us to have a uniform interface to purely emulated, pv, and assigned devices, with minimal changes to the guest. 
To me that's the path of least resistance. >> >>>>> As an added bonus, its device-model is modular. A developer can >>>>> write a >>>>> new device model, compile it, insmod it to the host kernel, hotplug it >>>>> to the running guest with mkdir/ln, and the come back out again >>>>> (hotunplug with rmdir, rmmod, etc). They may do this all without >>>>> taking >>>>> the guest down, and while eating QEMU based IO solutions for breakfast >>>>> performance wise. >>>>> >>>>> Afaict, qemu can't do either of those things. >>>>> >>>>> >>>>> >>>> We've seen that herring before, >>>> >>>> >>> Citation? >>> >>> >> It's the compare venet-in-kernel to virtio-in-userspace thing again. >> > No, you said KVM has "userspace hotplug". I retorted that vbus not only > has hotplug, it also has a modular architecture. You then countered > that this feature is a red-herring. If this was previously discussed > and rejected for some reason, I would like to know the history. Or did > I misunderstand you? > I was talking about your breakfast (the performance comparison again). > For one, we have the common layer of shm-signal, and IOQ. These > libraries were designed to be reused on both sides of the link. > Generally shm-signal has no counterpart in the existing model, though > its functionality is integrated into the virtqueue. I agree that ioq/shm separation is a nice feature. 
> From there, going down the stack, it looks like > > (guest-side) > |------------------------- > | venet (competes with virtio-net) > |------------------------- > | vbus-proxy (competes with pci-bus, config+hotplug, sync/async) > |------------------------- > | vbus-pcibridge (interrupt coalescing + priority, fastpath) > |------------------------- > | > |------------------------- > | vbus-kvmconnector (interrupt coalescing + priority, fast-path) > |------------------------- > | vbus-core (hotplug, address decoding, etc) > |------------------------- > | venet-device (ioq frame/deframe to tap/macvlan/vmdq, etc) > |------------------------- > > If you want to use virtio, insert a virtio layer between the "driver" > and "device" components at the outer edges of the stack. > But then it adds no value. It's just another shim. >> To me, compatible means I can live migrate an image to a new system >> without the user knowing about the change. You'll be able to do that >> with vhost-net. >> > As soon as you add any new guest-visible feature, you are in the same > exact boat. > No. You support two-way migration while hiding new features. You support one-way migration if you expose new features (suitable for data center upgrade). You don't support any migration if you switch models. >>> No, that is incorrect. For one, vhost uses them on a per-signal path >>> basis, whereas vbus only has one channel for the entire guest->host. >>> >>> >> You'll probably need to change that as you start running smp guests. >> > The hypercall channel is already SMP optimized over a single PIO path, > so I think we are covered there. See "fastcall" in my code for details: > > http://git.kernel.org/?p=linux/kernel/git/ghaskins/alacrityvm/linux-2.6.git;a=blob;f=drivers/vbus/pci-bridge.c;h=81f7cdd2167ae2f53406850ebac448a2183842f2;hb=fd1c156be7735f8b259579f18268a756beccfc96#l102 > > It just passes the cpuid into the PIO write so we can have parallel, > lockless "hypercalls". 
This forms the basis of our guest scheduler > support, for instance. > This is... weird. Scheduler support should be part of kvm core and done using ordinary hypercalls, not as part of a bus model. >> You could implement virtio-net hardware if you wanted to. >> > Technically you could build vbus in hardware too, I suppose, since the > bridge is PCI compliant. I would never advocate it, however, since many > of our tricks do not matter if it's real hardware (e.g. they are > optimized for the costs associated with VM). > No, you can't. You won't get the cpuid in your pio writes, for one. And if multiple vbus cards are plugged into different PCI slots, they either lose inter-card interrupt coalescing, or you have to connect them in a side-channel. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-24 7:18 ` Avi Kivity 2009-09-24 18:03 ` Gregory Haskins @ 2009-09-24 19:27 ` Ira W. Snyder 2009-09-25 7:43 ` Avi Kivity 1 sibling, 1 reply; 83+ messages in thread From: Ira W. Snyder @ 2009-09-24 19:27 UTC (permalink / raw) To: Avi Kivity Cc: Gregory Haskins, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Thu, Sep 24, 2009 at 10:18:28AM +0300, Avi Kivity wrote: > On 09/24/2009 12:15 AM, Gregory Haskins wrote: > > > >>> There are various aspects about designing high-performance virtual > >>> devices such as providing the shortest paths possible between the > >>> physical resources and the consumers. Conversely, we also need to > >>> ensure that we meet proper isolation/protection guarantees at the same > >>> time. What this means is there are various aspects to any > >>> high-performance PV design that require to be placed in-kernel to > >>> maximize the performance yet properly isolate the guest. > >>> > >>> For instance, you are required to have your signal-path (interrupts and > >>> hypercalls), your memory-path (gpa translation), and > >>> addressing/isolation model in-kernel to maximize performance. > >>> > >>> > >> Exactly. That's what vhost puts into the kernel and nothing more. > >> > > Actually, no. Generally, _KVM_ puts those things into the kernel, and > > vhost consumes them. Without KVM (or something equivalent), vhost is > > incomplete. One of my goals with vbus is to generalize the "something > > equivalent" part here. > > > > I don't really see how vhost and vbus are different here. vhost expects > signalling to happen through a couple of eventfds and requires someone > to supply them and implement kernel support (if needed). vbus requires > someone to write a connector to provide the signalling implementation. 
> Neither will work out-of-the-box when implementing virtio-net over > falling dominos, for example. > > >>> Vbus accomplishes its in-kernel isolation model by providing a > >>> "container" concept, where objects are placed into this container by > >>> userspace. The host kernel enforces isolation/protection by using a > >>> namespace to identify objects that is only relevant within a specific > >>> container's context (namely, a "u32 dev-id"). The guest addresses the > >>> objects by its dev-id, and the kernel ensures that the guest can't > >>> access objects outside of its dev-id namespace. > >>> > >>> > >> vhost manages to accomplish this without any kernel support. > >> > > No, vhost manages to accomplish this because of KVMs kernel support > > (ioeventfd, etc). Without a KVM-like in-kernel support, vhost is a > > merely a kind of "tuntap"-like clone signalled by eventfds. > > > > Without a vbus-connector-falling-dominos, vbus-venet can't do anything > either. Both vhost and vbus need an interface, vhost's is just narrower > since it doesn't do configuration or enumeration. > > > This goes directly to my rebuttal of your claim that vbus places too > > much in the kernel. I state that, one way or the other, address decode > > and isolation _must_ be in the kernel for performance. Vbus does this > > with a devid/container scheme. vhost+virtio-pci+kvm does it with > > pci+pio+ioeventfd. > > > > vbus doesn't do kvm guest address decoding for the fast path. It's > still done by ioeventfd. > > >> The guest > >> simply has not access to any vhost resources other than the guest->host > >> doorbell, which is handed to the guest outside vhost (so it's somebody > >> else's problem, in userspace). > >> > > You mean _controlled_ by userspace, right? Obviously, the other side of > > the kernel still needs to be programmed (ioeventfd, etc). Otherwise, > > vhost would be pointless: e.g. just use vanilla tuntap if you don't need > > fast in-kernel decoding. 
> > > > Yes (though for something like level-triggered interrupts we're probably > keeping it in userspace, enjoying the benefits of vhost data path while > paying more for signalling). > > >>> All that is required is a way to transport a message with a "devid" > >>> attribute as an address (such as DEVCALL(devid)) and the framework > >>> provides the rest of the decode+execute function. > >>> > >>> > >> vhost avoids that. > >> > > No, it doesn't avoid it. It just doesn't specify how its done, and > > relies on something else to do it on its behalf. > > > > That someone else can be in userspace, apart from the actual fast path. > > > Conversely, vbus specifies how its done, but not how to transport the > > verb "across the wire". That is the role of the vbus-connector abstraction. > > > > So again, vbus does everything in the kernel (since it's so easy and > cheap) but expects a vbus-connector. vhost does configuration in > userspace (since it's so clunky and fragile) but expects a couple of > eventfds. > > >>> Contrast this to vhost+virtio-pci (called simply "vhost" from here). > >>> > >>> > >> It's the wrong name. vhost implements only the data path. > >> > > Understood, but vhost+virtio-pci is what I am contrasting, and I use > > "vhost" for short from that point on because I am too lazy to type the > > whole name over and over ;) > > > > If you #define A A+B+C don't expect intelligent conversation afterwards. > > >>> It is not immune to requiring in-kernel addressing support either, but > >>> rather it just does it differently (and its not as you might expect via > >>> qemu). > >>> > >>> Vhost relies on QEMU to render PCI objects to the guest, which the guest > >>> assigns resources (such as BARs, interrupts, etc). > >>> > >> vhost does not rely on qemu. It relies on its user to handle > >> configuration. In one important case it's qemu+pci. It could just as > >> well be the lguest launcher. > >> > > I meant vhost=vhost+virtio-pci here. Sorry for the confusion. 
> > > > The point I am making specifically is that vhost in general relies on > > other in-kernel components to function. I.e. It cannot function without > > having something like the PCI model to build an IO namespace. That > > namespace (in this case, pio addresses+data tuples) are used for the > > in-kernel addressing function under KVM + virtio-pci. > > > > The case of the lguest launcher is a good one to highlight. Yes, you > > can presumably also use lguest with vhost, if the requisite facilities > > are exposed to lguest-bus, and some eventfd based thing like ioeventfd > > is written for the host (if it doesnt exist already). > > > > And when the next virt design "foo" comes out, it can make a "foo-bus" > > model, and implement foo-eventfd on the backend, etc, etc. > > > > It's exactly the same with vbus needing additional connectors for > additional transports. > > > Ira can make ira-bus, and ira-eventfd, etc, etc. > > > > Each iteration will invariably introduce duplicated parts of the stack. > > > > Invariably? Use libraries (virtio-shmem.ko, libvhost.so). > Referencing libraries that don't yet exist doesn't seem like a good argument against vbus from my point of view. I'm not speficially advocating for vbus; I'm just letting you know how it looks to another developer in the trenches. If you'd like to see the amount of duplication present, look at the code I'm currently working on. It mostly works at this point, though I haven't finished my userspace, nor figured out how to actually transfer data. The current question I have (just to let you know where I am in development) is: I have the physical address of the remote data, but how do I get it into a userspace buffer, so I can pass it to tun? http://www.mmarray.org/~iws/virtio-phys/ Ira -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-24 19:27 ` Ira W. Snyder @ 2009-09-25 7:43 ` Avi Kivity 0 siblings, 0 replies; 83+ messages in thread From: Avi Kivity @ 2009-09-25 7:43 UTC (permalink / raw) To: Ira W. Snyder Cc: Gregory Haskins, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/24/2009 10:27 PM, Ira W. Snyder wrote: >>> Ira can make ira-bus, and ira-eventfd, etc, etc. >>> >>> Each iteration will invariably introduce duplicated parts of the stack. >>> >>> >> Invariably? Use libraries (virtio-shmem.ko, libvhost.so). >> >> > Referencing libraries that don't yet exist doesn't seem like a good > argument against vbus from my point of view. I'm not specifically > advocating for vbus; I'm just letting you know how it looks to another > developer in the trenches. > My argument is that we shouldn't write a new framework instead of fixing or extending an existing one. > If you'd like to see the amount of duplication present, look at the code > I'm currently working on. Yes, virtio-phys-guest looks pretty much duplicated. Looks like it should be pretty easy to deduplicate. > It mostly works at this point, though I > haven't finished my userspace, nor figured out how to actually transfer > data. > > The current question I have (just to let you know where I am in > development) is: > > I have the physical address of the remote data, but how do I get it into > a userspace buffer, so I can pass it to tun? > vhost does guest physical address to host userspace address translation (in your scenario, remote physical to local virtual) using a table of memory slots; there's an ioctl that allows userspace to initialize that table. -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-23 19:37 ` Avi Kivity 2009-09-23 21:15 ` Gregory Haskins @ 2009-09-24 8:03 ` Avi Kivity 2009-09-24 18:04 ` Gregory Haskins 1 sibling, 1 reply; 83+ messages in thread From: Avi Kivity @ 2009-09-24 8:03 UTC (permalink / raw) To: Gregory Haskins Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On 09/23/2009 10:37 PM, Avi Kivity wrote: > > Example: feature negotiation. If it happens in userspace, it's easy > to limit what features we expose to the guest. If it happens in the > kernel, we need to add an interface to let the kernel know which > features it should expose to the guest. We also need to add an > interface to let userspace know which features were negotiated, if we > want to implement live migration. Something fairly trivial bloats > rapidly. btw, we have this issue with kvm reporting cpuid bits to the guest. Instead of letting kvm talk directly to the hardware and the guest, kvm gets the cpuid bits from the hardware, strips away features it doesn't support, exposes that to userspace, and expects userspace to program the cpuid bits it wants to expose to the guest (which may be different than what kvm exposed to userspace, and different from guest to guest). -- Do not meddle in the internals of kernels, for they are subtle and quick to panic. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-24 8:03 ` Avi Kivity @ 2009-09-24 18:04 ` Gregory Haskins 0 siblings, 0 replies; 83+ messages in thread From: Gregory Haskins @ 2009-09-24 18:04 UTC (permalink / raw) To: Avi Kivity Cc: Ira W. Snyder, Michael S. Tsirkin, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 1091 bytes --] Avi Kivity wrote: > On 09/23/2009 10:37 PM, Avi Kivity wrote: >> >> Example: feature negotiation. If it happens in userspace, it's easy >> to limit what features we expose to the guest. If it happens in the >> kernel, we need to add an interface to let the kernel know which >> features it should expose to the guest. We also need to add an >> interface to let userspace know which features were negotiated, if we >> want to implement live migration. Something fairly trivial bloats >> rapidly. > > btw, we have this issue with kvm reporting cpuid bits to the guest. > Instead of letting kvm talk directly to the hardware and the guest, kvm > gets the cpuid bits from the hardware, strips away features it doesn't > support, exposes that to userspace, and expects userspace to program the > cpuid bits it wants to expose to the guest (which may be different than > what kvm exposed to userspace, and different from guest to guest). > This issue doesn't exist in the model I am referring to, as these are all virtual-devices anyway. See my last reply -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-16 14:10 ` Gregory Haskins 2009-09-16 15:59 ` Avi Kivity @ 2009-09-17 3:57 ` Michael S. Tsirkin 2009-09-17 4:13 ` Gregory Haskins 1 sibling, 1 reply; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-17 3:57 UTC (permalink / raw) To: Gregory Haskins Cc: Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel On Wed, Sep 16, 2009 at 10:10:55AM -0400, Gregory Haskins wrote: > > There is no role reversal. > > So if I have virtio-blk driver running on the x86 and vhost-blk device > running on the ppc board, I can use the ppc board as a block-device. > What if I really wanted to go the other way? It seems ppc is the only one that can initiate DMA to an arbitrary address, so you can't do this really, or you can by tunneling each request back to ppc, or doing an extra data copy, but it's unlikely to work well. The limitation comes from hardware, not from the API we use. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-17 3:57 ` Michael S. Tsirkin @ 2009-09-17 4:13 ` Gregory Haskins 0 siblings, 0 replies; 83+ messages in thread From: Gregory Haskins @ 2009-09-17 4:13 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Avi Kivity, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze, alacrityvm-devel [-- Attachment #1: Type: text/plain, Size: 1561 bytes --] Michael S. Tsirkin wrote: > On Wed, Sep 16, 2009 at 10:10:55AM -0400, Gregory Haskins wrote: >>> There is no role reversal. >> So if I have virtio-blk driver running on the x86 and vhost-blk device >> running on the ppc board, I can use the ppc board as a block-device. >> What if I really wanted to go the other way? > > It seems ppc is the only one that can initiate DMA to an arbitrary > address, so you can't do this really, or you can by tunneling each > request back to ppc, or doing an extra data copy, but it's unlikely to > work well. > > The limitation comes from hardware, not from the API we use. Understood, but presumably it can be exposed as a sub-function of the ppc board's register file as a DMA-controller service to the x86. This would fall into the "tunnel requests back" category you mention above, though I think "tunnel" implies a heavier protocol than it would actually require. This would look more like a PIO cycle to a DMA controller than some higher layer protocol. You would then utilize that DMA service inside the memctx, and the rest of vbus would work transparently with the existing devices/drivers. I do agree it would require some benchmarking to determine its feasibility, which is why I was careful to say things like "may work" ;). I also do not even know if it's possible to expose the service this way on his system. If this design is not possible or performs poorly, I admit vbus is just as hosed as vhost in regard to the "role correction" benefit.
Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-14 16:47 ` Michael S. Tsirkin 2009-09-14 19:14 ` Gregory Haskins @ 2009-09-15 12:32 ` Avi Kivity 1 sibling, 0 replies; 83+ messages in thread From: Avi Kivity @ 2009-09-15 12:32 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Gregory Haskins, Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze On 09/14/2009 07:47 PM, Michael S. Tsirkin wrote: > On Mon, Sep 14, 2009 at 12:08:55PM -0400, Gregory Haskins wrote: > >> For Ira's example, the addresses would represent a physical address on >> the PCI boards, and would follow any kind of relevant rules for >> converting a "GPA" to a host accessible address (even if indirectly, via >> a dma controller). >> > I don't think limiting addresses to PCI physical addresses will work > well. From what I remember, Ira's x86 can not initiate burst > transactions on PCI, and it's the ppc that initiates all DMA. > vhost-net would run on the PPC then. >>> But we can't let the guest specify physical addresses. >>> >> Agreed. Neither your proposal nor mine operate this way afaict. >> > But this seems to be what Ira needs. > In Ira's scenario, the "guest" (x86 host) specifies x86 physical addresses, and the ppc dmas to them. It's the virtio model without any change. A normal guest also specifies physical addresses. -- error compiling committee.c: too many arguments to function -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-14 16:08 ` Gregory Haskins 2009-09-14 16:47 ` Michael S. Tsirkin @ 2009-09-14 16:53 ` Michael S. Tsirkin 2009-09-14 19:28 ` Gregory Haskins 1 sibling, 1 reply; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-14 16:53 UTC (permalink / raw) To: Gregory Haskins Cc: Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze On Mon, Sep 14, 2009 at 12:08:55PM -0400, Gregory Haskins wrote: > Michael S. Tsirkin wrote: > > On Fri, Sep 11, 2009 at 12:00:21PM -0400, Gregory Haskins wrote: > >> FWIW: VBUS handles this situation via the "memctx" abstraction. IOW, > >> the memory is not assumed to be a userspace address. Rather, it is a > >> memctx-specific address, which can be userspace, or any other type > >> (including hardware, dma-engine, etc). As long as the memctx knows how > >> to translate it, it will work. > > > > How would permissions be handled? > > Same as anything else, really. Read on for details. > > > it's easy to allow an app to pass in virtual addresses in its own address space. > > Agreed, and this is what I do. > > The guest always passes its own physical addresses (using things like > __pa() in linux). This address passed is memctx specific, but generally > would fall into the category of "virtual-addresses" from the hosts > perspective. > > For a KVM/AlacrityVM guest example, the addresses are GPAs, accessed > internally to the context via a gfn_to_hva conversion (you can see this > occuring in the citation links I sent) > > For Ira's example, the addresses would represent a physical address on > the PCI boards, and would follow any kind of relevant rules for > converting a "GPA" to a host accessible address (even if indirectly, via > a dma controller). So vbus can let an application access either its own virtual memory or a physical memory on a PCI device. 
My question is, is any application that's allowed to do the former also granted rights to do the latter? > > But we can't let the guest specify physical addresses. > > Agreed. Neither your proposal nor mine operate this way afaict. > > HTH > > Kind Regards, > -Greg > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-14 16:53 ` Michael S. Tsirkin @ 2009-09-14 19:28 ` Gregory Haskins 0 siblings, 0 replies; 83+ messages in thread From: Gregory Haskins @ 2009-09-14 19:28 UTC (permalink / raw) To: Michael S. Tsirkin Cc: Ira W. Snyder, netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, Rusty Russell, s.hetze [-- Attachment #1: Type: text/plain, Size: 2169 bytes --] Michael S. Tsirkin wrote: > On Mon, Sep 14, 2009 at 12:08:55PM -0400, Gregory Haskins wrote: >> Michael S. Tsirkin wrote: >>> On Fri, Sep 11, 2009 at 12:00:21PM -0400, Gregory Haskins wrote: >>>> FWIW: VBUS handles this situation via the "memctx" abstraction. IOW, >>>> the memory is not assumed to be a userspace address. Rather, it is a >>>> memctx-specific address, which can be userspace, or any other type >>>> (including hardware, dma-engine, etc). As long as the memctx knows how >>>> to translate it, it will work. >>> How would permissions be handled? >> Same as anything else, really. Read on for details. >> >>> it's easy to allow an app to pass in virtual addresses in its own address space. >> Agreed, and this is what I do. >> >> The guest always passes its own physical addresses (using things like >> __pa() in linux). This address passed is memctx specific, but generally >> would fall into the category of "virtual-addresses" from the hosts >> perspective. >> >> For a KVM/AlacrityVM guest example, the addresses are GPAs, accessed >> internally to the context via a gfn_to_hva conversion (you can see this >> occuring in the citation links I sent) >> >> For Ira's example, the addresses would represent a physical address on >> the PCI boards, and would follow any kind of relevant rules for >> converting a "GPA" to a host accessible address (even if indirectly, via >> a dma controller). > > So vbus can let an application "application" means KVM guest, or ppc board, right? 
> access either its own virtual memory or a physical memory on a PCI device. To reiterate from the last reply: the model is the "guest" owns the memory. The host is granted access to that memory by means of a memctx object, which must be admitted to the host kernel and accessed according to standard access-policy mechanisms. Generally the "application" or guest would never be accessing anything other than its own memory. > My question is, is any application > that's allowed to do the former also granted rights to do the latter? If I understand your question, no. Can you elaborate? Kind Regards, -Greg [-- Attachment #2: OpenPGP digital signature --] [-- Type: application/pgp-signature, Size: 267 bytes --] ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-08-27 16:07 ` [PATCHv5 3/3] vhost_net: a kernel-level virtio server Michael S. Tsirkin 2009-09-03 18:39 ` Ira W. Snyder @ 2009-09-25 17:01 ` Ira W. Snyder 2009-09-27 7:43 ` Michael S. Tsirkin 1 sibling, 1 reply; 83+ messages in thread From: Ira W. Snyder @ 2009-09-25 17:01 UTC (permalink / raw) To: Michael S. Tsirkin Cc: netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, gregory.haskins, Rusty Russell, s.hetze On Thu, Aug 27, 2009 at 07:07:50PM +0300, Michael S. Tsirkin wrote: > What it is: vhost net is a character device that can be used to reduce > the number of system calls involved in virtio networking. > Existing virtio net code is used in the guest without modification. > > There's similarity with vringfd, with some differences and reduced scope > - uses eventfd for signalling > - structures can be moved around in memory at any time (good for migration) > - support memory table and not just an offset (needed for kvm) > > common virtio related code has been put in a separate file vhost.c and > can be made into a separate module if/when more backends appear. I used > Rusty's lguest.c as the source for developing this part : this supplied > me with witty comments I wouldn't be able to write myself. > > What it is not: vhost net is not a bus, and not a generic new system > call. No assumptions are made on how guest performs hypercalls. > Userspace hypervisors are supported as well as kvm. > > How it works: Basically, we connect virtio frontend (configured by > userspace) to a backend. The backend could be a network device, or a > tun-like device. In this version I only support raw socket as a backend, > which can be bound to e.g. SR IOV, or to macvlan device. Backend is > also configured by userspace, including vlan/mac etc. > > Status: > This works for me, and I haven't see any crashes. 
> I have done some light benchmarking (with v4), compared to userspace, I > see improved latency (as I save up to 4 system calls per packet) but not > bandwidth/CPU (as TSO and interrupt mitigation are not supported). For > ping benchmark (where there's no TSO) troughput is also improved. > > Features that I plan to look at in the future: > - tap support > - TSO > - interrupt mitigation > - zero copy > > Acked-by: Arnd Bergmann <arnd@arndb.de> > Signed-off-by: Michael S. Tsirkin <mst@redhat.com> > > --- > MAINTAINERS | 10 + > arch/x86/kvm/Kconfig | 1 + > drivers/Makefile | 1 + > drivers/vhost/Kconfig | 11 + > drivers/vhost/Makefile | 2 + > drivers/vhost/net.c | 475 ++++++++++++++++++++++++++++++ > drivers/vhost/vhost.c | 688 ++++++++++++++++++++++++++++++++++++++++++++ > drivers/vhost/vhost.h | 122 ++++++++ > include/linux/Kbuild | 1 + > include/linux/miscdevice.h | 1 + > include/linux/vhost.h | 101 +++++++ > 11 files changed, 1413 insertions(+), 0 deletions(-) > create mode 100644 drivers/vhost/Kconfig > create mode 100644 drivers/vhost/Makefile > create mode 100644 drivers/vhost/net.c > create mode 100644 drivers/vhost/vhost.c > create mode 100644 drivers/vhost/vhost.h > create mode 100644 include/linux/vhost.h > > diff --git a/MAINTAINERS b/MAINTAINERS > index b1114cf..de4587f 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -5431,6 +5431,16 @@ S: Maintained > F: Documentation/filesystems/vfat.txt > F: fs/fat/ > > +VIRTIO HOST (VHOST) > +P: Michael S. 
Tsirkin > +M: mst@redhat.com > +L: kvm@vger.kernel.org > +L: virtualization@lists.osdl.org > +L: netdev@vger.kernel.org > +S: Maintained > +F: drivers/vhost/ > +F: include/linux/vhost.h > + > VIA RHINE NETWORK DRIVER > M: Roger Luethi <rl@hellgate.ch> > S: Maintained > diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig > index b84e571..94f44d9 100644 > --- a/arch/x86/kvm/Kconfig > +++ b/arch/x86/kvm/Kconfig > @@ -64,6 +64,7 @@ config KVM_AMD > > # OK, it's a little counter-intuitive to do this, but it puts it neatly under > # the virtualization menu. > +source drivers/vhost/Kconfig > source drivers/lguest/Kconfig > source drivers/virtio/Kconfig > > diff --git a/drivers/Makefile b/drivers/Makefile > index bc4205d..1551ae1 100644 > --- a/drivers/Makefile > +++ b/drivers/Makefile > @@ -105,6 +105,7 @@ obj-$(CONFIG_HID) += hid/ > obj-$(CONFIG_PPC_PS3) += ps3/ > obj-$(CONFIG_OF) += of/ > obj-$(CONFIG_SSB) += ssb/ > +obj-$(CONFIG_VHOST_NET) += vhost/ > obj-$(CONFIG_VIRTIO) += virtio/ > obj-$(CONFIG_VLYNQ) += vlynq/ > obj-$(CONFIG_STAGING) += staging/ > diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig > new file mode 100644 > index 0000000..d955406 > --- /dev/null > +++ b/drivers/vhost/Kconfig > @@ -0,0 +1,11 @@ > +config VHOST_NET > + tristate "Host kernel accelerator for virtio net" > + depends on NET && EVENTFD > + ---help--- > + This kernel module can be loaded in host kernel to accelerate > + guest networking with virtio_net. Not to be confused with virtio_net > + module itself which needs to be loaded in guest kernel. > + > + To compile this driver as a module, choose M here: the module will > + be called vhost_net. 
> + > diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile > new file mode 100644 > index 0000000..72dd020 > --- /dev/null > +++ b/drivers/vhost/Makefile > @@ -0,0 +1,2 @@ > +obj-$(CONFIG_VHOST_NET) += vhost_net.o > +vhost_net-y := vhost.o net.o > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c > new file mode 100644 > index 0000000..2210eaa > --- /dev/null > +++ b/drivers/vhost/net.c > @@ -0,0 +1,475 @@ > +/* Copyright (C) 2009 Red Hat, Inc. > + * Author: Michael S. Tsirkin <mst@redhat.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. > + * > + * virtio-net server in host kernel. > + */ > + > +#include <linux/compat.h> > +#include <linux/eventfd.h> > +#include <linux/vhost.h> > +#include <linux/virtio_net.h> > +#include <linux/mmu_context.h> > +#include <linux/miscdevice.h> > +#include <linux/module.h> > +#include <linux/mutex.h> > +#include <linux/workqueue.h> > +#include <linux/rcupdate.h> > +#include <linux/file.h> > + > +#include <linux/net.h> > +#include <linux/if_packet.h> > +#include <linux/if_arp.h> > + > +#include <net/sock.h> > + > +#include "vhost.h" > + > +enum { > + VHOST_NET_VQ_RX = 0, > + VHOST_NET_VQ_TX = 1, > + VHOST_NET_VQ_MAX = 2, > +}; > + > +struct vhost_net { > + struct vhost_dev dev; > + struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; > + /* We use a kind of RCU to access sock pointer. > + * All readers access it from workqueue, which makes it possible to > + * flush the workqueue instead of synchronize_rcu. Therefore readers do > + * not need to call rcu_read_lock/rcu_read_unlock: the beginning of > + * work item execution acts instead of rcu_read_lock() and the end of > + * work item execution acts instead of rcu_read_lock(). > + * Writers use device mutex. */ > + struct socket *sock; > + struct vhost_poll poll[VHOST_NET_VQ_MAX]; > +}; > + > +/* Pop first len bytes from iovec. Return number of segments used. 
*/ > +static int move_iovec_hdr(struct iovec *from, struct iovec *to, > + size_t len, int iov_count) > +{ > + int seg = 0; > + size_t size; > + while (len && seg < iov_count) { > + size = min(from->iov_len, len); > + to->iov_base = from->iov_base; > + to->iov_len = size; > + from->iov_len -= size; > + from->iov_base += size; > + len -= size; > + ++from; > + ++to; > + ++seg; > + } > + return seg; > +} > + > +/* Expects to be always run from workqueue - which acts as > + * read-size critical section for our kind of RCU. */ > +static void handle_tx(struct vhost_net *net) > +{ > + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; > + unsigned head, out, in, s; > + struct msghdr msg = { > + .msg_name = NULL, > + .msg_namelen = 0, > + .msg_control = NULL, > + .msg_controllen = 0, > + .msg_iov = vq->iov, > + .msg_flags = MSG_DONTWAIT, > + }; > + size_t len; > + int err; > + struct socket *sock = rcu_dereference(net->sock); > + if (!sock || !sock_writeable(sock->sk)) > + return; > + > + use_mm(net->dev.mm); > + mutex_lock(&vq->mutex); > + for (;;) { > + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, &out, &in); > + /* Nothing new? Wait for eventfd to tell us they refilled. */ > + if (head == vq->num) > + break; > + if (in) { > + vq_err(vq, "Unexpected descriptor format for TX: " > + "out %d, int %d\n", out, in); > + break; > + } > + /* Skip header. TODO: support TSO. */ > + s = move_iovec_hdr(vq->iov, vq->hdr, > + sizeof(struct virtio_net_hdr), out); > + msg.msg_iovlen = out; > + len = iov_length(vq->iov, out); > + /* Sanity check */ > + if (!len) { > + vq_err(vq, "Unexpected header len for TX: " > + "%ld expected %zd\n", > + iov_length(vq->hdr, s), > + sizeof(struct virtio_net_hdr)); > + break; > + } > + /* TODO: Check specific error and bomb out unless ENOBUFS? 
*/ > + err = sock->ops->sendmsg(NULL, sock, &msg, len); > + if (err < 0) { > + vhost_discard_vq_desc(vq); > + break; > + } > + if (err != len) > + pr_err("Truncated TX packet: " > + " len %d != %zd\n", err, len); > + vhost_add_used_and_trigger(&net->dev, vq, head, 0); > + } > + > + mutex_unlock(&vq->mutex); > + unuse_mm(net->dev.mm); > +} > + > +/* Expects to be always run from workqueue - which acts as > + * read-size critical section for our kind of RCU. */ > +static void handle_rx(struct vhost_net *net) > +{ > + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; > + unsigned head, out, in, s; > + struct msghdr msg = { > + .msg_name = NULL, > + .msg_namelen = 0, > + .msg_control = NULL, /* FIXME: get and handle RX aux data. */ > + .msg_controllen = 0, > + .msg_iov = vq->iov, > + .msg_flags = MSG_DONTWAIT, > + }; > + > + struct virtio_net_hdr hdr = { > + .flags = 0, > + .gso_type = VIRTIO_NET_HDR_GSO_NONE > + }; > + > + size_t len; > + int err; > + struct socket *sock = rcu_dereference(net->sock); > + if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) > + return; > + > + use_mm(net->dev.mm); > + mutex_lock(&vq->mutex); > + vhost_no_notify(vq); > + > + for (;;) { > + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, &out, &in); > + /* OK, now we need to know about added descriptors. */ > + if (head == vq->num && vhost_notify(vq)) > + /* They could have slipped one in as we were doing that: > + * check again. */ > + continue; > + /* Nothing new? Wait for eventfd to tell us they refilled. */ > + if (head == vq->num) > + break; > + /* We don't need to be notified again. */ > + vhost_no_notify(vq); > + if (out) { > + vq_err(vq, "Unexpected descriptor format for RX: " > + "out %d, int %d\n", > + out, in); > + break; > + } > + /* Skip header. TODO: support TSO/mergeable rx buffers. 
*/ > + s = move_iovec_hdr(vq->iov, vq->hdr, sizeof hdr, in); > + msg.msg_iovlen = in; > + len = iov_length(vq->iov, in); > + /* Sanity check */ > + if (!len) { > + vq_err(vq, "Unexpected header len for RX: " > + "%zd expected %zd\n", > + iov_length(vq->hdr, s), sizeof hdr); > + break; > + } > + err = sock->ops->recvmsg(NULL, sock, &msg, > + len, MSG_DONTWAIT | MSG_TRUNC); > + /* TODO: Check specific error and bomb out unless EAGAIN? */ > + if (err < 0) { > + vhost_discard_vq_desc(vq); > + break; > + } > + /* TODO: Should check and handle checksum. */ > + if (err > len) { > + pr_err("Discarded truncated rx packet: " > + " len %d > %zd\n", err, len); > + vhost_discard_vq_desc(vq); > + continue; > + } > + len = err; > + err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, sizeof hdr); > + if (err) { > + vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n", > + vq->iov->iov_base, err); > + break; > + } > + vhost_add_used_and_trigger(&net->dev, vq, head, > + len + sizeof hdr); > + } > + > + mutex_unlock(&vq->mutex); > + unuse_mm(net->dev.mm); > +} > + > +static void handle_tx_kick(struct work_struct *work) > +{ > + struct vhost_virtqueue *vq; > + struct vhost_net *net; > + vq = container_of(work, struct vhost_virtqueue, poll.work); > + net = container_of(vq->dev, struct vhost_net, dev); > + handle_tx(net); > +} > + > +static void handle_rx_kick(struct work_struct *work) > +{ > + struct vhost_virtqueue *vq; > + struct vhost_net *net; > + vq = container_of(work, struct vhost_virtqueue, poll.work); > + net = container_of(vq->dev, struct vhost_net, dev); > + handle_rx(net); > +} > + > +static void handle_tx_net(struct work_struct *work) > +{ > + struct vhost_net *net; > + net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); > + handle_tx(net); > +} > + > +static void handle_rx_net(struct work_struct *work) > +{ > + struct vhost_net *net; > + net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); > + handle_rx(net); > +} > + > +static 
int vhost_net_open(struct inode *inode, struct file *f) > +{ > + struct vhost_net *n = kzalloc(sizeof *n, GFP_KERNEL); > + int r; > + if (!n) > + return -ENOMEM; > + f->private_data = n; > + n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; > + n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; > + r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX); > + if (r < 0) { > + kfree(n); > + return r; > + } > + > + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); > + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); > + return 0; > +} > + > +static struct socket *vhost_net_stop(struct vhost_net *n) > +{ > + struct socket *sock = n->sock; > + rcu_assign_pointer(n->sock, NULL); > + if (sock) { > + vhost_poll_flush(n->poll + VHOST_NET_VQ_TX); > + vhost_poll_flush(n->poll + VHOST_NET_VQ_RX); > + } > + return sock; > +} > + > +static int vhost_net_release(struct inode *inode, struct file *f) > +{ > + struct vhost_net *n = f->private_data; > + struct socket *sock; > + > + sock = vhost_net_stop(n); > + vhost_dev_cleanup(&n->dev); > + if (sock) > + fput(sock->file); > + kfree(n); > + return 0; > +} > + > +static void vhost_net_flush(struct vhost_net *n) > +{ > + vhost_poll_flush(n->poll + VHOST_NET_VQ_TX); > + vhost_poll_flush(n->poll + VHOST_NET_VQ_RX); > + vhost_poll_flush(&n->dev.vqs[VHOST_NET_VQ_TX].poll); > + vhost_poll_flush(&n->dev.vqs[VHOST_NET_VQ_RX].poll); > +} > + > +static long vhost_net_set_socket(struct vhost_net *n, int fd) > +{ > + struct { > + struct sockaddr_ll sa; > + char buf[MAX_ADDR_LEN]; > + } uaddr; > + struct socket *sock, *oldsock = NULL; > + int uaddr_len = sizeof uaddr, r; > + > + mutex_lock(&n->dev.mutex); > + r = vhost_dev_check_owner(&n->dev); > + if (r) > + goto done; > + > + if (fd == -1) { > + /* Disconnect from socket and device. 
*/ > + oldsock = vhost_net_stop(n); > + goto done; > + } > + > + sock = sockfd_lookup(fd, &r); > + if (!sock) { > + r = -ENOTSOCK; > + goto done; > + } > + > + /* Parameter checking */ > + if (sock->sk->sk_type != SOCK_RAW) { > + r = -ESOCKTNOSUPPORT; > + goto done; > + } > + > + r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, > + &uaddr_len, 0); > + if (r) > + goto done; > + > + if (uaddr.sa.sll_family != AF_PACKET) { > + r = -EPFNOSUPPORT; > + goto done; > + } > + > + /* start polling new socket */ > + if (sock == oldsock) > + goto done; > + > + if (oldsock) { > + vhost_poll_stop(n->poll + VHOST_NET_VQ_TX); > + vhost_poll_stop(n->poll + VHOST_NET_VQ_RX); > + } > + oldsock = n->sock; > + rcu_assign_pointer(n->sock, sock); > + vhost_poll_start(n->poll + VHOST_NET_VQ_TX, sock->file); > + vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file); > +done: > + mutex_unlock(&n->dev.mutex); > + if (oldsock) { > + vhost_net_flush(n); > + fput(oldsock->file); > + } > + return r; > +} > + > +static long vhost_net_reset_owner(struct vhost_net *n) > +{ > + struct socket *sock = NULL; > + long r; > + mutex_lock(&n->dev.mutex); > + r = vhost_dev_check_owner(&n->dev); > + if (r) > + goto done; > + sock = vhost_net_stop(n); > + r = vhost_dev_reset_owner(&n->dev); > +done: > + mutex_unlock(&n->dev.mutex); > + if (sock) > + fput(sock->file); > + return r; > +} > + > +static void vhost_net_set_features(struct vhost_net *n, u64 features) > +{ > + mutex_unlock(&n->dev.mutex); > + n->dev.acked_features = features; > + mutex_unlock(&n->dev.mutex); > + vhost_net_flush(n); > +} > + > +static long vhost_net_ioctl(struct file *f, unsigned int ioctl, > + unsigned long arg) > +{ > + struct vhost_net *n = f->private_data; > + void __user *argp = (void __user *)arg; > + u32 __user *featurep = argp; > + int __user *fdp = argp; > + u64 features; > + int fd, r; > + switch (ioctl) { > + case VHOST_NET_SET_SOCKET: > + r = get_user(fd, fdp); > + if (r < 0) > + return r; > + return 
vhost_net_set_socket(n, fd); > + case VHOST_GET_FEATURES: > + features = VHOST_FEATURES; > + return put_user(features, featurep); > + case VHOST_ACK_FEATURES: > + r = get_user(features, featurep); > + /* No features for now */ > + if (r < 0) > + return r; > + if (features & ~VHOST_FEATURES) > + return -EOPNOTSUPP; > + vhost_net_set_features(n, features); > + return 0; > + case VHOST_RESET_OWNER: > + return vhost_net_reset_owner(n); > + default: > + return vhost_dev_ioctl(&n->dev, ioctl, arg); > + } > +} > + > +#ifdef CONFIG_COMPAT > +static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, > + unsigned long arg) > +{ > + return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); > +} > +#endif > + > +const static struct file_operations vhost_net_fops = { > + .owner = THIS_MODULE, > + .release = vhost_net_release, > + .unlocked_ioctl = vhost_net_ioctl, > +#ifdef CONFIG_COMPAT > + .compat_ioctl = vhost_net_compat_ioctl, > +#endif > + .open = vhost_net_open, > +}; > + > +static struct miscdevice vhost_net_misc = { > + VHOST_NET_MINOR, > + "vhost-net", > + &vhost_net_fops, > +}; > + > +int vhost_net_init(void) > +{ > + int r = vhost_init(); > + if (r) > + goto err_init; > + r = misc_register(&vhost_net_misc); > + if (r) > + goto err_reg; > + return 0; > +err_reg: > + vhost_cleanup(); > +err_init: > + return r; > + > +} > +module_init(vhost_net_init); > + > +void vhost_net_exit(void) > +{ > + misc_deregister(&vhost_net_misc); > + vhost_cleanup(); > +} > +module_exit(vhost_net_exit); > + > +MODULE_VERSION("0.0.1"); > +MODULE_LICENSE("GPL v2"); > +MODULE_AUTHOR("Michael S. Tsirkin"); > +MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c > new file mode 100644 > index 0000000..6925cc1 > --- /dev/null > +++ b/drivers/vhost/vhost.c > @@ -0,0 +1,688 @@ > +/* Copyright (C) 2009 Red Hat, Inc. > + * Copyright (C) 2006 Rusty Russell IBM Corporation > + * > + * Author: Michael S. 
Tsirkin <mst@redhat.com> > + * > + * Inspiration, some code, and most witty comments come from > + * Documentation/lguest/lguest.c, by Rusty Russell > + * > + * This work is licensed under the terms of the GNU GPL, version 2. > + * > + * Generic code for virtio server in host kernel. > + */ > + > +#include <linux/eventfd.h> > +#include <linux/vhost.h> > +#include <linux/virtio_net.h> > +#include <linux/mm.h> > +#include <linux/miscdevice.h> > +#include <linux/mutex.h> > +#include <linux/workqueue.h> > +#include <linux/rcupdate.h> > +#include <linux/poll.h> > +#include <linux/file.h> > + > +#include <linux/net.h> > +#include <linux/if_packet.h> > +#include <linux/if_arp.h> > + > +#include <net/sock.h> > + > +#include "vhost.h" > + > +enum { > + VHOST_MEMORY_MAX_NREGIONS = 64, > +}; > + > +static struct workqueue_struct *vhost_workqueue; > + > +static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, > + poll_table *pt) > +{ > + struct vhost_poll *poll; > + poll = container_of(pt, struct vhost_poll, table); > + > + poll->wqh = wqh; > + add_wait_queue(wqh, &poll->wait); > +} > + > +static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, > + void *key) > +{ > + struct vhost_poll *poll; > + poll = container_of(wait, struct vhost_poll, wait); > + if (!((unsigned long)key & poll->mask)) > + return 0; > + > + queue_work(vhost_workqueue, &poll->work); > + return 0; > +} > + > +/* Init poll structure */ > +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, > + unsigned long mask) > +{ > + INIT_WORK(&poll->work, func); > + init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); > + init_poll_funcptr(&poll->table, vhost_poll_func); > + poll->mask = mask; > +} > + > +/* Start polling a file. We add ourselves to file's wait queue. The caller must > + * keep a reference to a file until after vhost_poll_stop is called. 
*/ > +void vhost_poll_start(struct vhost_poll *poll, struct file *file) > +{ > + unsigned long mask; > + mask = file->f_op->poll(file, &poll->table); > + if (mask) > + vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask); > +} > + > +/* Stop polling a file. After this function returns, it becomes safe to drop the > + * file reference. You must also flush afterwards. */ > +void vhost_poll_stop(struct vhost_poll *poll) > +{ > + remove_wait_queue(poll->wqh, &poll->wait); > +} > + > +/* Flush any work that has been scheduled. When calling this, don't hold any > + * locks that are also used by the callback. */ > +void vhost_poll_flush(struct vhost_poll *poll) > +{ > + flush_work(&poll->work); > +} > + > +long vhost_dev_init(struct vhost_dev *dev, > + struct vhost_virtqueue *vqs, int nvqs) > +{ > + int i; > + dev->vqs = vqs; > + dev->nvqs = nvqs; > + mutex_init(&dev->mutex); > + > + for (i = 0; i < dev->nvqs; ++i) { > + dev->vqs[i].dev = dev; > + mutex_init(&dev->vqs[i].mutex); > + if (dev->vqs[i].handle_kick) > + vhost_poll_init(&dev->vqs[i].poll, > + dev->vqs[i].handle_kick, > + POLLIN); > + } > + return 0; > +} > + > +/* Caller should have device mutex */ > +long vhost_dev_check_owner(struct vhost_dev *dev) > +{ > + /* Are you the owner? If not, I don't think you mean to do that */ > + return dev->mm == current->mm ? 0 : -EPERM; > +} > + > +/* Caller should have device mutex */ > +static long vhost_dev_set_owner(struct vhost_dev *dev) > +{ > + /* Is there an owner already? */ > + if (dev->mm) > + return -EBUSY; > + /* No owner, become one */ > + dev->mm = get_task_mm(current); > + return 0; > +} > + > +/* Caller should have device mutex */ > +long vhost_dev_reset_owner(struct vhost_dev *dev) > +{ > + struct vhost_memory *memory; > + > + /* Restore memory to default 1:1 mapping. 
*/ > + memory = kmalloc(offsetof(struct vhost_memory, regions) + > + 2 * sizeof *memory->regions, GFP_KERNEL); > + if (!memory) > + return -ENOMEM; > + > + vhost_dev_cleanup(dev); > + > + memory->nregions = 2; > + memory->regions[0].guest_phys_addr = 1; > + memory->regions[0].userspace_addr = 1; > + memory->regions[0].memory_size = ~0ULL; > + memory->regions[1].guest_phys_addr = 0; > + memory->regions[1].userspace_addr = 0; > + memory->regions[1].memory_size = 1; > + dev->memory = memory; > + return 0; > +} > + > +/* Caller should have device mutex */ > +void vhost_dev_cleanup(struct vhost_dev *dev) > +{ > + int i; > + for (i = 0; i < dev->nvqs; ++i) { > + if (dev->vqs[i].kick && dev->vqs[i].handle_kick) { > + vhost_poll_stop(&dev->vqs[i].poll); > + vhost_poll_flush(&dev->vqs[i].poll); > + } > + if (dev->vqs[i].error_ctx) > + eventfd_ctx_put(dev->vqs[i].error_ctx); > + if (dev->vqs[i].error) > + fput(dev->vqs[i].error); > + if (dev->vqs[i].kick) > + fput(dev->vqs[i].kick); > + if (dev->vqs[i].call_ctx) > + eventfd_ctx_put(dev->vqs[i].call_ctx); > + if (dev->vqs[i].call) > + fput(dev->vqs[i].call); > + dev->vqs[i].error_ctx = NULL; > + dev->vqs[i].error = NULL; > + dev->vqs[i].kick = NULL; > + dev->vqs[i].call_ctx = NULL; > + dev->vqs[i].call = NULL; > + } > + /* No one will access memory at this point */ > + kfree(dev->memory); > + dev->memory = NULL; > + if (dev->mm) > + mmput(dev->mm); > + dev->mm = NULL; > +} > + > +static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) > +{ > + struct vhost_memory mem, *newmem, *oldmem; > + unsigned long size = offsetof(struct vhost_memory, regions); > + long r; > + r = copy_from_user(&mem, m, size); > + if (r) > + return r; > + if (mem.padding) > + return -EOPNOTSUPP; > + if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS) > + return -E2BIG; > + newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL); > + if (!newmem) > + return -ENOMEM; > + > + memcpy(newmem, &mem, size); > + r = 
copy_from_user(newmem->regions, m->regions, > + mem.nregions * sizeof *m->regions); > + if (r) { > + kfree(newmem); > + return r; > + } > + oldmem = d->memory; > + rcu_assign_pointer(d->memory, newmem); > + synchronize_rcu(); > + kfree(oldmem); > + return 0; > +} > + > +static int init_used(struct vhost_virtqueue *vq) > +{ > + int r = put_user(vq->used_flags, &vq->used->flags); > + if (r) > + return r; > + return get_user(vq->last_used_idx, &vq->used->idx); > +} > + > +static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) > +{ > + struct file *eventfp, *filep = NULL, > + *pollstart = NULL, *pollstop = NULL; > + struct eventfd_ctx *ctx = NULL; > + u32 __user *idxp = argp; > + struct vhost_virtqueue *vq; > + struct vhost_vring_state s; > + struct vhost_vring_file f; > + struct vhost_vring_addr a; > + u32 idx; > + long r; > + > + r = get_user(idx, idxp); > + if (r < 0) > + return r; > + if (idx > d->nvqs) > + return -ENOBUFS; > + > + vq = d->vqs + idx; > + > + mutex_lock(&vq->mutex); > + > + switch (ioctl) { > + case VHOST_SET_VRING_NUM: > + r = copy_from_user(&s, argp, sizeof s); > + if (r < 0) > + break; > + if (s.num > 0xffff) { > + r = -EINVAL; > + break; > + } > + vq->num = s.num; > + break; > + case VHOST_SET_VRING_BASE: > + r = copy_from_user(&s, argp, sizeof s); > + if (r < 0) > + break; > + if (s.num > 0xffff) { > + r = -EINVAL; > + break; > + } > + vq->avail_idx = vq->last_avail_idx = s.num; > + break; > + case VHOST_GET_VRING_BASE: > + s.index = idx; > + s.num = vq->last_avail_idx; > + r = copy_to_user(argp, &s, sizeof s); > + break; > + case VHOST_SET_VRING_DESC: > + r = copy_from_user(&a, argp, sizeof a); > + if (r < 0) > + break; > + if (a.padding) { > + r = -EOPNOTSUPP; > + break; > + } > + if ((u64)(long)a.user_addr != a.user_addr) { > + r = -EFAULT; > + break; > + } > + vq->desc = (void __user *)(long)a.user_addr; > + break; > + case VHOST_SET_VRING_AVAIL: > + r = copy_from_user(&a, argp, sizeof a); > + if (r < 0) > + break; > 
+ if (a.padding) { > + r = -EOPNOTSUPP; > + break; > + } > + if ((u64)(long)a.user_addr != a.user_addr) { > + r = -EFAULT; > + break; > + } > + vq->avail = (void __user *)(long)a.user_addr; > + /* Forget the cached index value. */ > + vq->avail_idx = vq->last_avail_idx; > + break; > + case VHOST_SET_VRING_USED: > + r = copy_from_user(&a, argp, sizeof a); > + if (r < 0) > + break; > + if (a.padding) { > + r = -EOPNOTSUPP; > + break; > + } > + if ((u64)(long)a.user_addr != a.user_addr) { > + r = -EFAULT; > + break; > + } > + vq->used = (void __user *)(long)a.user_addr; > + r = init_used(vq); > + if (r) > + break; > + break; > + case VHOST_SET_VRING_KICK: > + r = copy_from_user(&f, argp, sizeof f); > + if (r < 0) > + break; > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); > + if (IS_ERR(eventfp)) > + return PTR_ERR(eventfp); > + if (eventfp != vq->kick) { > + pollstop = filep = vq->kick; > + pollstart = vq->kick = eventfp; > + } else > + filep = eventfp; > + break; > + case VHOST_SET_VRING_CALL: > + r = copy_from_user(&f, argp, sizeof f); > + if (r < 0) > + break; > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); > + if (IS_ERR(eventfp)) > + return PTR_ERR(eventfp); > + if (eventfp != vq->call) { > + filep = vq->call; > + ctx = vq->call_ctx; > + vq->call = eventfp; > + vq->call_ctx = eventfp ? > + eventfd_ctx_fileget(eventfp) : NULL; > + } else > + filep = eventfp; > + break; > + case VHOST_SET_VRING_ERR: > + r = copy_from_user(&f, argp, sizeof f); > + if (r < 0) > + break; > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); > + if (IS_ERR(eventfp)) > + return PTR_ERR(eventfp); > + if (eventfp != vq->error) { > + filep = vq->error; > + vq->error = eventfp; > + ctx = vq->error_ctx; > + vq->error_ctx = eventfp ? > + eventfd_ctx_fileget(eventfp) : NULL; > + } else > + filep = eventfp; > + break; I'm not sure how these eventfd's save a trip to userspace. 
AFAICT, eventfd's cannot be used to signal another part of the kernel, they can only be used to wake up userspace. In my system, when an IRQ for kick() comes in, I have an eventfd which gets signalled to notify userspace. When I want to send a call(), I have to use a special ioctl(), just like lguest does. Doesn't this mean that for call(), vhost is just going to signal an eventfd to wake up userspace, which is then going to call ioctl(), and then we're back in kernelspace. Seems like a wasted userspace round-trip. Or am I mis-reading this code? PS - you can see my current code at: http://www.mmarray.org/~iws/virtio-phys/ Thanks, Ira > + default: > + r = -ENOIOCTLCMD; > + } > + > + if (pollstop && vq->handle_kick) > + vhost_poll_stop(&vq->poll); > + > + if (ctx) > + eventfd_ctx_put(ctx); > + if (filep) > + fput(filep); > + > + if (pollstart && vq->handle_kick) > + vhost_poll_start(&vq->poll, vq->kick); > + > + mutex_unlock(&vq->mutex); > + > + if (pollstop && vq->handle_kick) > + vhost_poll_flush(&vq->poll); > + return 0; > +} > + > +long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg) > +{ > + void __user *argp = (void __user *)arg; > + long r; > + > + mutex_lock(&d->mutex); > + /* If you are not the owner, you can become one */ > + if (ioctl == VHOST_SET_OWNER) { > + r = vhost_dev_set_owner(d); > + goto done; > + } > + > + /* You must be the owner to do anything else */ > + r = vhost_dev_check_owner(d); > + if (r) > + goto done; > + > + switch (ioctl) { > + case VHOST_SET_MEM_TABLE: > + r = vhost_set_memory(d, argp); > + break; > + default: > + r = vhost_set_vring(d, ioctl, argp); > + break; > + } > +done: > + mutex_unlock(&d->mutex); > + return r; > +} > + > +static const struct vhost_memory_region *find_region(struct vhost_memory *mem, > + __u64 addr, __u32 len) > +{ > + struct vhost_memory_region *reg; > + int i; > + /* linear search is not brilliant, but we really have on the order of 6 > + * regions in practice */ > + for (i = 0; 
i < mem->nregions; ++i) { > + reg = mem->regions + i; > + if (reg->guest_phys_addr <= addr && > + reg->guest_phys_addr + reg->memory_size - 1 >= addr) > + return reg; > + } > + return NULL; > +} > + > +int translate_desc(struct vhost_dev *dev, u64 addr, u32 len, > + struct iovec iov[], int iov_size) > +{ > + const struct vhost_memory_region *reg; > + struct vhost_memory *mem; > + struct iovec *_iov; > + u64 s = 0; > + int ret = 0; > + > + rcu_read_lock(); > + > + mem = rcu_dereference(dev->memory); > + while ((u64)len > s) { > + u64 size; > + if (ret >= iov_size) { > + ret = -ENOBUFS; > + break; > + } > + reg = find_region(mem, addr, len); > + if (!reg) { > + ret = -EFAULT; > + break; > + } > + _iov = iov + ret; > + size = reg->memory_size - addr + reg->guest_phys_addr; > + _iov->iov_len = min((u64)len, size); > + _iov->iov_base = (void *) > + (reg->userspace_addr + addr - reg->guest_phys_addr); > + s += size; > + addr += size; > + ++ret; > + } > + > + rcu_read_unlock(); > + return ret; > +} > + > +/* Each buffer in the virtqueues is actually a chain of descriptors. This > + * function returns the next descriptor in the chain, or vq->vring.num if we're > + * at the end. */ > +static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc) > +{ > + unsigned int next; > + > + /* If this descriptor says it doesn't chain, we're done. */ > + if (!(desc->flags & VRING_DESC_F_NEXT)) > + return vq->num; > + > + /* Check they're not leading us off end of descriptors. */ > + next = desc->next; > + /* Make sure compiler knows to grab that: we don't want it changing! */ > + /* We will use the result as an index in an array, so most > + * architectures only need a compiler barrier here. 
*/ > + read_barrier_depends(); > + > + if (next >= vq->num) { > + vq_err(vq, "Desc next is %u > %u", next, vq->num); > + return vq->num; > + } > + > + return next; > +} > + > +/* This looks in the virtqueue and for the first available buffer, and converts > + * it to an iovec for convenient access. Since descriptors consist of some > + * number of output then some number of input descriptors, it's actually two > + * iovecs, but we pack them into one and note how many of each there were. > + * > + * This function returns the descriptor number found, or vq->num (which > + * is never a valid descriptor number) if none was found. */ > +unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, > + struct iovec iov[], > + unsigned int *out_num, unsigned int *in_num) > +{ > + struct vring_desc desc; > + unsigned int i, head; > + u16 last_avail_idx; > + int ret; > + > + /* Check it isn't doing very strange things with descriptor numbers. */ > + last_avail_idx = vq->last_avail_idx; > + if (get_user(vq->avail_idx, &vq->avail->idx)) { > + vq_err(vq, "Failed to access avail idx at %p\n", > + &vq->avail->idx); > + return vq->num; > + } > + > + if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) { > + vq_err(vq, "Guest moved used index from %u to %u", > + last_avail_idx, vq->avail_idx); > + return vq->num; > + } > + > + /* If there's nothing new since last we looked, return invalid. */ > + if (vq->avail_idx == last_avail_idx) > + return vq->num; > + > + /* Grab the next descriptor number they're advertising, and increment > + * the index we've seen. */ > + if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) { > + vq_err(vq, "Failed to read head: idx %d address %p\n", > + last_avail_idx, > + &vq->avail->ring[last_avail_idx % vq->num]); > + return vq->num; > + } > + > + /* If their number is silly, that's an error. 
*/ > + if (head >= vq->num) { > + vq_err(vq, "Guest says index %u > %u is available", > + head, vq->num); > + return vq->num; > + } > + > + vq->last_avail_idx++; > + > + /* When we start there are none of either input nor output. */ > + *out_num = *in_num = 0; > + > + i = head; > + do { > + unsigned iov_count = *in_num + *out_num; > + if (copy_from_user(&desc, vq->desc + i, sizeof desc)) { > + vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", > + i, vq->desc + i); > + return vq->num; > + } > + ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count, > + VHOST_NET_MAX_SG - iov_count); > + if (ret < 0) { > + vq_err(vq, "Translation failure %d descriptor idx %d\n", > + ret, i); > + return vq->num; > + } > + /* If this is an input descriptor, increment that count. */ > + if (desc.flags & VRING_DESC_F_WRITE) > + *in_num += ret; > + else { > + /* If it's an output descriptor, they're all supposed > + * to come before any input descriptors. */ > + if (*in_num) { > + vq_err(vq, "Descriptor has out after in: " > + "idx %d\n", i); > + return vq->num; > + } > + *out_num += ret; > + } > + } while ((i = next_desc(vq, &desc)) != vq->num); > + return head; > +} > + > +/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ > +void vhost_discard_vq_desc(struct vhost_virtqueue *vq) > +{ > + vq->last_avail_idx--; > +} > + > +/* After we've used one of their buffers, we tell them about it. We'll then > + * want to send them an interrupt, using vq->call. */ > +int vhost_add_used(struct vhost_virtqueue *vq, > + unsigned int head, int len) > +{ > + struct vring_used_elem *used; > + > + /* The virtqueue contains a ring of used buffers. Get a pointer to the > + * next entry in that used ring. 
*/ > + used = &vq->used->ring[vq->last_used_idx % vq->num]; > + if (put_user(head, &used->id)) { > + vq_err(vq, "Failed to write used id"); > + return -EFAULT; > + } > + if (put_user(len, &used->len)) { > + vq_err(vq, "Failed to write used len"); > + return -EFAULT; > + } > + /* Make sure buffer is written before we update index. */ > + wmb(); > + if (put_user(vq->last_used_idx + 1, &vq->used->idx)) { > + vq_err(vq, "Failed to increment used idx"); > + return -EFAULT; > + } > + vq->last_used_idx++; > + return 0; > +} > + > +/* This actually sends the interrupt for this virtqueue */ > +void vhost_trigger_irq(struct vhost_dev *dev, struct vhost_virtqueue *vq) > +{ > + __u16 flags = 0; > + if (get_user(flags, &vq->avail->flags)) { > + vq_err(vq, "Failed to get flags"); > + return; > + } > + > + /* If they don't want an interrupt, don't send one, unless empty. */ > + if ((flags & VRING_AVAIL_F_NO_INTERRUPT) && > + (!vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) || > + vq->avail_idx != vq->last_avail_idx)) > + return; > + > + /* Send the Guest an interrupt tell them we used something up. */ > + if (vq->call_ctx) > + eventfd_signal(vq->call_ctx, 1); > +} > + > +/* And here's the combo meal deal. Supersize me! */ > +void vhost_add_used_and_trigger(struct vhost_dev *dev, > + struct vhost_virtqueue *vq, > + unsigned int head, int len) > +{ > + vhost_add_used(vq, head, len); > + vhost_trigger_irq(dev, vq); > +} > + > +/* OK, now we need to know about added descriptors. */ > +bool vhost_notify(struct vhost_virtqueue *vq) > +{ > + int r; > + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) > + return false; > + vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; > + r = put_user(vq->used_flags, &vq->used->flags); > + if (r) > + vq_err(vq, "Failed to disable notification: %d\n", r); > + /* They could have slipped one in as we were doing that: make > + * sure it's written, tell caller it needs to check again. 
*/ > + mb(); > + return true; > +} > + > +/* We don't need to be notified again. */ > +void vhost_no_notify(struct vhost_virtqueue *vq) > +{ > + int r; > + if (vq->used_flags & VRING_USED_F_NO_NOTIFY) > + return; > + vq->used_flags |= VRING_USED_F_NO_NOTIFY; > + r = put_user(vq->used_flags, &vq->used->flags); > + if (r) > + vq_err(vq, "Failed to enable notification: %d\n", r); > +} > + > +int vhost_init(void) > +{ > + vhost_workqueue = create_workqueue("vhost"); > + if (!vhost_workqueue) > + return -ENOMEM; > + return 0; > +} > + > +void vhost_cleanup(void) > +{ > + destroy_workqueue(vhost_workqueue); > +} > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > new file mode 100644 > index 0000000..8e13d06 > --- /dev/null > +++ b/drivers/vhost/vhost.h > @@ -0,0 +1,122 @@ > +#ifndef _VHOST_H > +#define _VHOST_H > + > +#include <linux/eventfd.h> > +#include <linux/vhost.h> > +#include <linux/mm.h> > +#include <linux/mutex.h> > +#include <linux/workqueue.h> > +#include <linux/poll.h> > +#include <linux/file.h> > +#include <linux/skbuff.h> > +#include <linux/uio.h> > +#include <linux/virtio_config.h> > + > +struct vhost_device; > + > +enum { > + VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2, > +}; > + > +/* Poll a file (eventfd or socket) */ > +/* Note: there's nothing vhost specific about this structure. */ > +struct vhost_poll { > + poll_table table; > + wait_queue_head_t *wqh; > + wait_queue_t wait; > + /* struct which will handle all actual work. */ > + struct work_struct work; > + unsigned long mask; > +}; > + > +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, > + unsigned long mask); > +void vhost_poll_start(struct vhost_poll *poll, struct file *file); > +void vhost_poll_stop(struct vhost_poll *poll); > +void vhost_poll_flush(struct vhost_poll *poll); > + > +/* The virtqueue structure describes a queue attached to a device. */ > +struct vhost_virtqueue { > + struct vhost_dev *dev; > + > + /* The actual ring of buffers. 
*/ > + struct mutex mutex; > + unsigned int num; > + struct vring_desc __user *desc; > + struct vring_avail __user *avail; > + struct vring_used __user *used; > + struct file *kick; > + struct file *call; > + struct file *error; > + struct eventfd_ctx *call_ctx; > + struct eventfd_ctx *error_ctx; > + > + struct vhost_poll poll; > + > + /* The routine to call when the Guest pings us, or timeout. */ > + work_func_t handle_kick; > + > + /* Last available index we saw. */ > + u16 last_avail_idx; > + > + /* Caches available index value from user. */ > + u16 avail_idx; > + > + /* Last index we used. */ > + u16 last_used_idx; > + > + /* Used flags */ > + u16 used_flags; > + > + struct iovec iov[VHOST_NET_MAX_SG]; > + struct iovec hdr[VHOST_NET_MAX_SG]; > +}; > + > +struct vhost_dev { > + /* Readers use RCU to access memory table pointer. > + * Writers use mutex below.*/ > + struct vhost_memory *memory; > + struct mm_struct *mm; > + struct vhost_virtqueue *vqs; > + int nvqs; > + struct mutex mutex; > + unsigned acked_features; > +}; > + > +long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); > +long vhost_dev_check_owner(struct vhost_dev *); > +long vhost_dev_reset_owner(struct vhost_dev *); > +void vhost_dev_cleanup(struct vhost_dev *); > +long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg); > + > +unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *, > + struct iovec iov[], > + unsigned int *out_num, unsigned int *in_num); > +void vhost_discard_vq_desc(struct vhost_virtqueue *); > + > +int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); > +void vhost_trigger_irq(struct vhost_dev *, struct vhost_virtqueue *); > +void vhost_add_used_and_trigger(struct vhost_dev *, struct vhost_virtqueue *, > + unsigned int head, int len); > +void vhost_no_notify(struct vhost_virtqueue *); > +bool vhost_notify(struct vhost_virtqueue *); > + > +int vhost_init(void); > +void vhost_cleanup(void); > 
+ > +#define vq_err(vq, fmt, ...) do { \ > + pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ > + if ((vq)->error_ctx) \ > + eventfd_signal((vq)->error_ctx, 1);\ > + } while (0) > + > +enum { > + VHOST_FEATURES = 1 << VIRTIO_F_NOTIFY_ON_EMPTY, > +}; > + > +static inline int vhost_has_feature(struct vhost_dev *dev, int bit) > +{ > + return dev->acked_features & (1 << bit); > +} > + > +#endif > diff --git a/include/linux/Kbuild b/include/linux/Kbuild > index dec2f18..975df9a 100644 > --- a/include/linux/Kbuild > +++ b/include/linux/Kbuild > @@ -360,6 +360,7 @@ unifdef-y += uio.h > unifdef-y += unistd.h > unifdef-y += usbdevice_fs.h > unifdef-y += utsname.h > +unifdef-y += vhost.h > unifdef-y += videodev2.h > unifdef-y += videodev.h > unifdef-y += virtio_config.h > diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h > index 0521177..781a8bb 100644 > --- a/include/linux/miscdevice.h > +++ b/include/linux/miscdevice.h > @@ -30,6 +30,7 @@ > #define HPET_MINOR 228 > #define FUSE_MINOR 229 > #define KVM_MINOR 232 > +#define VHOST_NET_MINOR 233 > #define MISC_DYNAMIC_MINOR 255 > > struct device; > diff --git a/include/linux/vhost.h b/include/linux/vhost.h > new file mode 100644 > index 0000000..3f441a9 > --- /dev/null > +++ b/include/linux/vhost.h > @@ -0,0 +1,101 @@ > +#ifndef _LINUX_VHOST_H > +#define _LINUX_VHOST_H > +/* Userspace interface for in-kernel virtio accelerators. */ > + > +/* vhost is used to reduce the number of system calls involved in virtio. > + * > + * Existing virtio net code is used in the guest without modification. > + * > + * This header includes interface used by userspace hypervisor for > + * device configuration. 
> + */ > + > +#include <linux/types.h> > +#include <linux/compiler.h> > +#include <linux/ioctl.h> > +#include <linux/virtio_config.h> > +#include <linux/virtio_ring.h> > + > +struct vhost_vring_state { > + unsigned int index; > + unsigned int num; > +}; > + > +struct vhost_vring_file { > + unsigned int index; > + int fd; > +}; > + > +struct vhost_vring_addr { > + unsigned int index; > + unsigned int padding; > + __u64 user_addr; > +}; > + > +struct vhost_memory_region { > + __u64 guest_phys_addr; > + __u64 memory_size; /* bytes */ > + __u64 userspace_addr; > + __u64 padding; /* read/write protection? */ > +}; > + > +struct vhost_memory { > + __u32 nregions; > + __u32 padding; > + struct vhost_memory_region regions[0]; > +}; > + > +/* ioctls */ > + > +#define VHOST_VIRTIO 0xAF > + > +/* Features bitmask for forward compatibility. Transport bits are used for > + * vhost specific features. */ > +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) > +#define VHOST_ACK_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) > + > +/* Set current process as the (exclusive) owner of this file descriptor. This > + * must be called before any other vhost command. Further calls to > + * VHOST_SET_OWNER fail until VHOST_RESET_OWNER is called. */ > +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) > +/* Give up ownership, and reset the device to default values. > + * Allows subsequent call to VHOST_SET_OWNER to succeed. */ > +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) > + > +/* Set up/modify memory layout */ > +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) > + > +/* Ring setup. These parameters can not be modified while ring is running > + * (bound to a device).
*/ > +/* Set number of descriptors in ring */ > +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) > +/* Start of array of descriptors (virtually contiguous) */ > +#define VHOST_SET_VRING_DESC _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) > +/* Used structure address */ > +#define VHOST_SET_VRING_USED _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_addr) > +/* Available structure address */ > +#define VHOST_SET_VRING_AVAIL _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_addr) > +/* Base value where queue looks for available descriptors */ > +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) > +/* Get accessor: reads index, writes value in num */ > +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x14, struct vhost_vring_state) > + > +/* The following ioctls use eventfd file descriptors to signal and poll > + * for events. */ > + > +/* Set eventfd to poll for added buffers */ > +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) > +/* Set eventfd to signal when buffers have been used */ > +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) > +/* Set eventfd to signal an error */ > +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) > + > +/* VHOST_NET specific defines */ > + > +/* Attach virtio net device to a raw socket. The socket must be already > + * bound to an ethernet device, this device will be used for transmit. > + * Pass -1 to unbind from the socket and the transmit device. > + * This can be used to stop the device (e.g. for migration). */ > +#define VHOST_NET_SET_SOCKET _IOW(VHOST_VIRTIO, 0x30, int) > + > +#endif > -- > 1.6.2.5 > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-09-25 17:01 ` Ira W. Snyder @ 2009-09-27 7:43 ` Michael S. Tsirkin 0 siblings, 0 replies; 83+ messages in thread From: Michael S. Tsirkin @ 2009-09-27 7:43 UTC (permalink / raw) To: Ira W. Snyder Cc: netdev, virtualization, kvm, linux-kernel, mingo, linux-mm, akpm, hpa, gregory.haskins, Rusty Russell, s.hetze On Fri, Sep 25, 2009 at 10:01:58AM -0700, Ira W. Snyder wrote: > > + case VHOST_SET_VRING_KICK: > > + r = copy_from_user(&f, argp, sizeof f); > > + if (r < 0) > > + break; > > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); > > + if (IS_ERR(eventfp)) > > + return PTR_ERR(eventfp); > > + if (eventfp != vq->kick) { > > + pollstop = filep = vq->kick; > > + pollstart = vq->kick = eventfp; > > + } else > > + filep = eventfp; > > + break; > > + case VHOST_SET_VRING_CALL: > > + r = copy_from_user(&f, argp, sizeof f); > > + if (r < 0) > > + break; > > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); > > + if (IS_ERR(eventfp)) > > + return PTR_ERR(eventfp); > > + if (eventfp != vq->call) { > > + filep = vq->call; > > + ctx = vq->call_ctx; > > + vq->call = eventfp; > > + vq->call_ctx = eventfp ? > > + eventfd_ctx_fileget(eventfp) : NULL; > > + } else > > + filep = eventfp; > > + break; > > + case VHOST_SET_VRING_ERR: > > + r = copy_from_user(&f, argp, sizeof f); > > + if (r < 0) > > + break; > > + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); > > + if (IS_ERR(eventfp)) > > + return PTR_ERR(eventfp); > > + if (eventfp != vq->error) { > > + filep = vq->error; > > + vq->error = eventfp; > > + ctx = vq->error_ctx; > > + vq->error_ctx = eventfp ? > > + eventfd_ctx_fileget(eventfp) : NULL; > > + } else > > + filep = eventfp; > > + break; > > I'm not sure how these eventfd's save a trip to userspace. > > AFAICT, eventfd's cannot be used to signal another part of the kernel, > they can only be used to wake up userspace. Yes, they can. See irqfd code in virt/kvm/eventfd.c. 
> In my system, when an IRQ for kick() comes in, I have an eventfd which > gets signalled to notify userspace. When I want to send a call(), I have > to use a special ioctl(), just like lguest does. > > Doesn't this mean that for call(), vhost is just going to signal an > eventfd to wake up userspace, which is then going to call ioctl(), and > then we're back in kernelspace. Seems like a wasted userspace > round-trip. > > Or am I mis-reading this code? Yes. Kernel can poll eventfd and deliver an interrupt directly without involving userspace. > PS - you can see my current code at: > http://www.mmarray.org/~iws/virtio-phys/ > > Thanks, > Ira > > > + default: > > + r = -ENOIOCTLCMD; > > + } > > + > > + if (pollstop && vq->handle_kick) > > + vhost_poll_stop(&vq->poll); > > + > > + if (ctx) > > + eventfd_ctx_put(ctx); > > + if (filep) > > + fput(filep); > > + > > + if (pollstart && vq->handle_kick) > > + vhost_poll_start(&vq->poll, vq->kick); > > + > > + mutex_unlock(&vq->mutex); > > + > > + if (pollstop && vq->handle_kick) > > + vhost_poll_flush(&vq->poll); > > + return 0; > > +} > > + > > +long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg) > > +{ > > + void __user *argp = (void __user *)arg; > > + long r; > > + > > + mutex_lock(&d->mutex); > > + /* If you are not the owner, you can become one */ > > + if (ioctl == VHOST_SET_OWNER) { > > + r = vhost_dev_set_owner(d); > > + goto done; > > + } > > + > > + /* You must be the owner to do anything else */ > > + r = vhost_dev_check_owner(d); > > + if (r) > > + goto done; > > + > > + switch (ioctl) { > > + case VHOST_SET_MEM_TABLE: > > + r = vhost_set_memory(d, argp); > > + break; > > + default: > > + r = vhost_set_vring(d, ioctl, argp); > > + break; > > + } > > +done: > > + mutex_unlock(&d->mutex); > > + return r; > > +} > > + > > +static const struct vhost_memory_region *find_region(struct vhost_memory *mem, > > + __u64 addr, __u32 len) > > +{ > > + struct vhost_memory_region *reg; > > + int 
i; > > + /* linear search is not brilliant, but we really have on the order of 6 > > + * regions in practice */ > > + for (i = 0; i < mem->nregions; ++i) { > > + reg = mem->regions + i; > > + if (reg->guest_phys_addr <= addr && > > + reg->guest_phys_addr + reg->memory_size - 1 >= addr) > > + return reg; > > + } > > + return NULL; > > +} > > + > > +int translate_desc(struct vhost_dev *dev, u64 addr, u32 len, > > + struct iovec iov[], int iov_size) > > +{ > > + const struct vhost_memory_region *reg; > > + struct vhost_memory *mem; > > + struct iovec *_iov; > > + u64 s = 0; > > + int ret = 0; > > + > > + rcu_read_lock(); > > + > > + mem = rcu_dereference(dev->memory); > > + while ((u64)len > s) { > > + u64 size; > > + if (ret >= iov_size) { > > + ret = -ENOBUFS; > > + break; > > + } > > + reg = find_region(mem, addr, len); > > + if (!reg) { > > + ret = -EFAULT; > > + break; > > + } > > + _iov = iov + ret; > > + size = reg->memory_size - addr + reg->guest_phys_addr; > > + _iov->iov_len = min((u64)len, size); > > + _iov->iov_base = (void *) > > + (reg->userspace_addr + addr - reg->guest_phys_addr); > > + s += size; > > + addr += size; > > + ++ret; > > + } > > + > > + rcu_read_unlock(); > > + return ret; > > +} > > + > > +/* Each buffer in the virtqueues is actually a chain of descriptors. This > > + * function returns the next descriptor in the chain, or vq->vring.num if we're > > + * at the end. */ > > +static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc) > > +{ > > + unsigned int next; > > + > > + /* If this descriptor says it doesn't chain, we're done. */ > > + if (!(desc->flags & VRING_DESC_F_NEXT)) > > + return vq->num; > > + > > + /* Check they're not leading us off end of descriptors. */ > > + next = desc->next; > > + /* Make sure compiler knows to grab that: we don't want it changing! */ > > + /* We will use the result as an index in an array, so most > > + * architectures only need a compiler barrier here. 
*/ > > + read_barrier_depends(); > > + > > + if (next >= vq->num) { > > + vq_err(vq, "Desc next is %u > %u", next, vq->num); > > + return vq->num; > > + } > > + > > + return next; > > +} > > + > > +/* This looks in the virtqueue and for the first available buffer, and converts > > + * it to an iovec for convenient access. Since descriptors consist of some > > + * number of output then some number of input descriptors, it's actually two > > + * iovecs, but we pack them into one and note how many of each there were. > > + * > > + * This function returns the descriptor number found, or vq->num (which > > + * is never a valid descriptor number) if none was found. */ > > +unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, > > + struct iovec iov[], > > + unsigned int *out_num, unsigned int *in_num) > > +{ > > + struct vring_desc desc; > > + unsigned int i, head; > > + u16 last_avail_idx; > > + int ret; > > + > > + /* Check it isn't doing very strange things with descriptor numbers. */ > > + last_avail_idx = vq->last_avail_idx; > > + if (get_user(vq->avail_idx, &vq->avail->idx)) { > > + vq_err(vq, "Failed to access avail idx at %p\n", > > + &vq->avail->idx); > > + return vq->num; > > + } > > + > > + if ((u16)(vq->avail_idx - last_avail_idx) > vq->num) { > > + vq_err(vq, "Guest moved used index from %u to %u", > > + last_avail_idx, vq->avail_idx); > > + return vq->num; > > + } > > + > > + /* If there's nothing new since last we looked, return invalid. */ > > + if (vq->avail_idx == last_avail_idx) > > + return vq->num; > > + > > + /* Grab the next descriptor number they're advertising, and increment > > + * the index we've seen. */ > > + if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) { > > + vq_err(vq, "Failed to read head: idx %d address %p\n", > > + last_avail_idx, > > + &vq->avail->ring[last_avail_idx % vq->num]); > > + return vq->num; > > + } > > + > > + /* If their number is silly, that's an error. 
*/ > > + if (head >= vq->num) { > > + vq_err(vq, "Guest says index %u > %u is available", > > + head, vq->num); > > + return vq->num; > > + } > > + > > + vq->last_avail_idx++; > > + > > + /* When we start there are none of either input nor output. */ > > + *out_num = *in_num = 0; > > + > > + i = head; > > + do { > > + unsigned iov_count = *in_num + *out_num; > > + if (copy_from_user(&desc, vq->desc + i, sizeof desc)) { > > + vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", > > + i, vq->desc + i); > > + return vq->num; > > + } > > + ret = translate_desc(dev, desc.addr, desc.len, iov + iov_count, > > + VHOST_NET_MAX_SG - iov_count); > > + if (ret < 0) { > > + vq_err(vq, "Translation failure %d descriptor idx %d\n", > > + ret, i); > > + return vq->num; > > + } > > + /* If this is an input descriptor, increment that count. */ > > + if (desc.flags & VRING_DESC_F_WRITE) > > + *in_num += ret; > > + else { > > + /* If it's an output descriptor, they're all supposed > > + * to come before any input descriptors. */ > > + if (*in_num) { > > + vq_err(vq, "Descriptor has out after in: " > > + "idx %d\n", i); > > + return vq->num; > > + } > > + *out_num += ret; > > + } > > + } while ((i = next_desc(vq, &desc)) != vq->num); > > + return head; > > +} > > + > > +/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ > > +void vhost_discard_vq_desc(struct vhost_virtqueue *vq) > > +{ > > + vq->last_avail_idx--; > > +} > > + > > +/* After we've used one of their buffers, we tell them about it. We'll then > > + * want to send them an interrupt, using vq->call. */ > > +int vhost_add_used(struct vhost_virtqueue *vq, > > + unsigned int head, int len) > > +{ > > + struct vring_used_elem *used; > > + > > + /* The virtqueue contains a ring of used buffers. Get a pointer to the > > + * next entry in that used ring. 
*/ > > + used = &vq->used->ring[vq->last_used_idx % vq->num]; > > + if (put_user(head, &used->id)) { > > + vq_err(vq, "Failed to write used id"); > > + return -EFAULT; > > + } > > + if (put_user(len, &used->len)) { > > + vq_err(vq, "Failed to write used len"); > > + return -EFAULT; > > + } > > + /* Make sure buffer is written before we update index. */ > > + wmb(); > > + if (put_user(vq->last_used_idx + 1, &vq->used->idx)) { > > + vq_err(vq, "Failed to increment used idx"); > > + return -EFAULT; > > + } > > + vq->last_used_idx++; > > + return 0; > > +} > > + > > +/* This actually sends the interrupt for this virtqueue */ > > +void vhost_trigger_irq(struct vhost_dev *dev, struct vhost_virtqueue *vq) > > +{ > > + __u16 flags = 0; > > + if (get_user(flags, &vq->avail->flags)) { > > + vq_err(vq, "Failed to get flags"); > > + return; > > + } > > + > > + /* If they don't want an interrupt, don't send one, unless empty. */ > > + if ((flags & VRING_AVAIL_F_NO_INTERRUPT) && > > + (!vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) || > > + vq->avail_idx != vq->last_avail_idx)) > > + return; > > + > > + /* Send the Guest an interrupt tell them we used something up. */ > > + if (vq->call_ctx) > > + eventfd_signal(vq->call_ctx, 1); > > +} > > + > > +/* And here's the combo meal deal. Supersize me! */ > > +void vhost_add_used_and_trigger(struct vhost_dev *dev, > > + struct vhost_virtqueue *vq, > > + unsigned int head, int len) > > +{ > > + vhost_add_used(vq, head, len); > > + vhost_trigger_irq(dev, vq); > > +} > > + > > +/* OK, now we need to know about added descriptors. 
*/ > > +bool vhost_notify(struct vhost_virtqueue *vq) > > +{ > > + int r; > > + if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) > > + return false; > > + vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; > > + r = put_user(vq->used_flags, &vq->used->flags); > > + if (r) > > + vq_err(vq, "Failed to disable notification: %d\n", r); > > + /* They could have slipped one in as we were doing that: make > > + * sure it's written, tell caller it needs to check again. */ > > + mb(); > > + return true; > > +} > > + > > +/* We don't need to be notified again. */ > > +void vhost_no_notify(struct vhost_virtqueue *vq) > > +{ > > + int r; > > + if (vq->used_flags & VRING_USED_F_NO_NOTIFY) > > + return; > > + vq->used_flags |= VRING_USED_F_NO_NOTIFY; > > + r = put_user(vq->used_flags, &vq->used->flags); > > + if (r) > > + vq_err(vq, "Failed to enable notification: %d\n", r); > > +} > > + > > +int vhost_init(void) > > +{ > > + vhost_workqueue = create_workqueue("vhost"); > > + if (!vhost_workqueue) > > + return -ENOMEM; > > + return 0; > > +} > > + > > +void vhost_cleanup(void) > > +{ > > + destroy_workqueue(vhost_workqueue); > > +} > > diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h > > new file mode 100644 > > index 0000000..8e13d06 > > --- /dev/null > > +++ b/drivers/vhost/vhost.h > > @@ -0,0 +1,122 @@ > > +#ifndef _VHOST_H > > +#define _VHOST_H > > + > > +#include <linux/eventfd.h> > > +#include <linux/vhost.h> > > +#include <linux/mm.h> > > +#include <linux/mutex.h> > > +#include <linux/workqueue.h> > > +#include <linux/poll.h> > > +#include <linux/file.h> > > +#include <linux/skbuff.h> > > +#include <linux/uio.h> > > +#include <linux/virtio_config.h> > > + > > +struct vhost_device; > > + > > +enum { > > + VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2, > > +}; > > + > > +/* Poll a file (eventfd or socket) */ > > +/* Note: there's nothing vhost specific about this structure. 
*/ > > +struct vhost_poll { > > + poll_table table; > > + wait_queue_head_t *wqh; > > + wait_queue_t wait; > > + /* struct which will handle all actual work. */ > > + struct work_struct work; > > + unsigned long mask; > > +}; > > + > > +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, > > + unsigned long mask); > > +void vhost_poll_start(struct vhost_poll *poll, struct file *file); > > +void vhost_poll_stop(struct vhost_poll *poll); > > +void vhost_poll_flush(struct vhost_poll *poll); > > + > > +/* The virtqueue structure describes a queue attached to a device. */ > > +struct vhost_virtqueue { > > + struct vhost_dev *dev; > > + > > + /* The actual ring of buffers. */ > > + struct mutex mutex; > > + unsigned int num; > > + struct vring_desc __user *desc; > > + struct vring_avail __user *avail; > > + struct vring_used __user *used; > > + struct file *kick; > > + struct file *call; > > + struct file *error; > > + struct eventfd_ctx *call_ctx; > > + struct eventfd_ctx *error_ctx; > > + > > + struct vhost_poll poll; > > + > > + /* The routine to call when the Guest pings us, or timeout. */ > > + work_func_t handle_kick; > > + > > + /* Last available index we saw. */ > > + u16 last_avail_idx; > > + > > + /* Caches available index value from user. */ > > + u16 avail_idx; > > + > > + /* Last index we used. */ > > + u16 last_used_idx; > > + > > + /* Used flags */ > > + u16 used_flags; > > + > > + struct iovec iov[VHOST_NET_MAX_SG]; > > + struct iovec hdr[VHOST_NET_MAX_SG]; > > +}; > > + > > +struct vhost_dev { > > + /* Readers use RCU to access memory table pointer. 
> > + * Writers use mutex below.*/ > > + struct vhost_memory *memory; > > + struct mm_struct *mm; > > + struct vhost_virtqueue *vqs; > > + int nvqs; > > + struct mutex mutex; > > + unsigned acked_features; > > +}; > > + > > +long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); > > +long vhost_dev_check_owner(struct vhost_dev *); > > +long vhost_dev_reset_owner(struct vhost_dev *); > > +void vhost_dev_cleanup(struct vhost_dev *); > > +long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg); > > + > > +unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *, > > + struct iovec iov[], > > + unsigned int *out_num, unsigned int *in_num); > > +void vhost_discard_vq_desc(struct vhost_virtqueue *); > > + > > +int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); > > +void vhost_trigger_irq(struct vhost_dev *, struct vhost_virtqueue *); > > +void vhost_add_used_and_trigger(struct vhost_dev *, struct vhost_virtqueue *, > > + unsigned int head, int len); > > +void vhost_no_notify(struct vhost_virtqueue *); > > +bool vhost_notify(struct vhost_virtqueue *); > > + > > +int vhost_init(void); > > +void vhost_cleanup(void); > > + > > +#define vq_err(vq, fmt, ...) 
do { \ > > + pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \ > > + if ((vq)->error_ctx) \ > > + eventfd_signal((vq)->error_ctx, 1);\ > > + } while (0) > > + > > +enum { > > + VHOST_FEATURES = 1 << VIRTIO_F_NOTIFY_ON_EMPTY, > > +}; > > + > > +static inline int vhost_has_feature(struct vhost_dev *dev, int bit) > > +{ > > + return dev->acked_features & (1 << bit); > > +} > > + > > +#endif > > diff --git a/include/linux/Kbuild b/include/linux/Kbuild > > index dec2f18..975df9a 100644 > > --- a/include/linux/Kbuild > > +++ b/include/linux/Kbuild > > @@ -360,6 +360,7 @@ unifdef-y += uio.h > > unifdef-y += unistd.h > > unifdef-y += usbdevice_fs.h > > unifdef-y += utsname.h > > +unifdef-y += vhost.h > > unifdef-y += videodev2.h > > unifdef-y += videodev.h > > unifdef-y += virtio_config.h > > diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h > > index 0521177..781a8bb 100644 > > --- a/include/linux/miscdevice.h > > +++ b/include/linux/miscdevice.h > > @@ -30,6 +30,7 @@ > > #define HPET_MINOR 228 > > #define FUSE_MINOR 229 > > #define KVM_MINOR 232 > > +#define VHOST_NET_MINOR 233 > > #define MISC_DYNAMIC_MINOR 255 > > > > struct device; > > diff --git a/include/linux/vhost.h b/include/linux/vhost.h > > new file mode 100644 > > index 0000000..3f441a9 > > --- /dev/null > > +++ b/include/linux/vhost.h > > @@ -0,0 +1,101 @@ > > +#ifndef _LINUX_VHOST_H > > +#define _LINUX_VHOST_H > > +/* Userspace interface for in-kernel virtio accelerators. */ > > + > > +/* vhost is used to reduce the number of system calls involved in virtio. > > + * > > + * Existing virtio net code is used in the guest without modification. > > + * > > + * This header includes interface used by userspace hypervisor for > > + * device configuration. 
> > + */ > > + > > +#include <linux/types.h> > > +#include <linux/compiler.h> > > +#include <linux/ioctl.h> > > +#include <linux/virtio_config.h> > > +#include <linux/virtio_ring.h> > > + > > +struct vhost_vring_state { > > + unsigned int index; > > + unsigned int num; > > +}; > > + > > +struct vhost_vring_file { > > + unsigned int index; > > + int fd; > > +}; > > + > > +struct vhost_vring_addr { > > + unsigned int index; > > + unsigned int padding; > > + __u64 user_addr; > > +}; > > + > > +struct vhost_memory_region { > > + __u64 guest_phys_addr; > > + __u64 memory_size; /* bytes */ > > + __u64 userspace_addr; > > + __u64 padding; /* read/write protection? */ > > +}; > > + > > +struct vhost_memory { > > + __u32 nregions; > > + __u32 padding; > > + struct vhost_memory_region regions[0]; > > +}; > > + > > +/* ioctls */ > > + > > +#define VHOST_VIRTIO 0xAF > > + > > +/* Features bitmask for forward compatibility. Transport bits are used for > > + * vhost specific features. */ > > +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64) > > +#define VHOST_ACK_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64) > > + > > +/* Set current process as the (exclusive) owner of this file descriptor. This > > + * must be called before any other vhost command. Further calls to > > + * VHOST_OWNER_SET fail until VHOST_OWNER_RESET is called. */ > > +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) > > +/* Give up ownership, and reset the device to default values. > > + * Allows subsequent call to VHOST_OWNER_SET to succeed. */ > > +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) > > + > > +/* Set up/modify memory layout */ > > +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) > > + > > +/* Ring setup. These parameters can not be modified while ring is running > > + * (bound to a device). 
*/ > > +/* Set number of descriptors in ring */ > > +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) > > +/* Start of array of descriptors (virtually contiguous) */ > > +#define VHOST_SET_VRING_DESC _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) > > +/* Used structure address */ > > +#define VHOST_SET_VRING_USED _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_addr) > > +/* Available structure address */ > > +#define VHOST_SET_VRING_AVAIL _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_addr) > > +/* Base value where queue looks for available descriptors */ > > +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) > > +/* Get accessor: reads index, writes value in num */ > > +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x14, struct vhost_vring_state) > > + > > +/* The following ioctls use eventfd file descriptors to signal and poll > > + * for events. */ > > + > > +/* Set eventfd to poll for added buffers */ > > +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) > > +/* Set eventfd to signal when buffers have beed used */ > > +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) > > +/* Set eventfd to signal an error */ > > +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) > > + > > +/* VHOST_NET specific defines */ > > + > > +/* Attach virtio net device to a raw socket. The socket must be already > > + * bound to an ethernet device, this device will be used for transmit. > > + * Pass -1 to unbind from the socket and the transmit device. > > + * This can be used to stop the device (e.g. for migration). 
*/ > > +#define VHOST_NET_SET_SOCKET _IOW(VHOST_VIRTIO, 0x30, int) > > + > > +#endif > > -- > > 1.6.2.5 > > -- > > To unsubscribe from this list: send the line "unsubscribe netdev" in > > the body of a message to majordomo@vger.kernel.org > > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
[parent not found: <E88DD564E9DC5446A76B2B47C3BCCA150219600F9B@pdsmsx503.ccr.corp.intel.com>]
* RE: [PATCHv5 3/3] vhost_net: a kernel-level virtio server [not found] <E88DD564E9DC5446A76B2B47C3BCCA150219600F9B@pdsmsx503.ccr.corp.intel.com> @ 2009-08-31 11:42 ` Xin, Xiaohui 2009-08-31 15:23 ` Arnd Bergmann 2009-08-31 17:52 ` Avi Kivity 0 siblings, 2 replies; 83+ messages in thread From: Xin, Xiaohui @ 2009-08-31 11:42 UTC (permalink / raw) To: mst@redhat.com, netdev@vger.kernel.org, virtualization@lists.linux-foundation.org, "kvm@v Hi, Michael That's a great job. We are now working on supporting VMDq on KVM, and since the VMDq hardware presents L2 sorting based on MAC addresses and VLAN tags, our target is to implement a zero copy solution using VMDq. We started from the virtio-net architecture. What we want to propose is to use AIO combined with direct I/O: 1) Modify virtio-net Backend service in Qemu to submit aio requests composed from virtqueue. 2) Modify TUN/TAP device to support aio operations and the user space buffer directly mapping into the host kernel. 3) Let a TUN/TAP device bind to a single rx/tx queue from the NIC. 4) Modify the net_dev and skb structure to permit allocated skb to use user space directly mapped payload buffer address rather than kernel allocated. As zero copy is also your goal, we are interested in what you have in mind, and would like to collaborate with you if possible. BTW, we will send our VMDq write-up very soon. Thanks Xiaohui -----Original Message----- From: kvm-owner@vger.kernel.org [mailto:kvm-owner@vger.kernel.org] On Behalf Of Michael S. Tsirkin Sent: Wednesday, August 19, 2009 11:03 PM To: netdev@vger.kernel.org; virtualization@lists.linux-foundation.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org; mingo@elte.hu; linux-mm@kvack.org; akpm@linux-foundation.org; hpa@zytor.com; gregory.haskins@gmail.com Subject: [PATCHv4 2/2] vhost_net: a kernel-level virtio server What it is: vhost net is a character device that can be used to reduce the number of system calls involved in virtio networking. 
Existing virtio net code is used in the guest without modification. There's similarity with vringfd, with some differences and reduced scope - uses eventfd for signalling - structures can be moved around in memory at any time (good for migration) - support memory table and not just an offset (needed for kvm) common virtio related code has been put in a separate file vhost.c and can be made into a separate module if/when more backends appear. I used Rusty's lguest.c as the source for developing this part: this supplied me with witty comments I wouldn't be able to write myself. What it is not: vhost net is not a bus, and not a generic new system call. No assumptions are made on how guest performs hypercalls. Userspace hypervisors are supported as well as kvm. How it works: Basically, we connect virtio frontend (configured by userspace) to a backend. The backend could be a network device, or a tun-like device. In this version I only support raw socket as a backend, which can be bound to e.g. SR IOV, or to macvlan device. Backend is also configured by userspace, including vlan/mac etc. Status: This works for me, and I haven't seen any crashes. I have not run any benchmarks yet; compared to userspace, I expect to see improved latency (as I save up to 4 system calls per packet) but not bandwidth/CPU (as TSO and interrupt mitigation are not supported). Features that I plan to look at in the future: - TSO - interrupt mitigation - zero copy Acked-by: Arnd Bergmann <arnd@arndb.de> Signed-off-by: Michael S. 
Tsirkin <mst@redhat.com> --- MAINTAINERS | 10 + arch/x86/kvm/Kconfig | 1 + drivers/Makefile | 1 + drivers/vhost/Kconfig | 11 + drivers/vhost/Makefile | 2 + drivers/vhost/net.c | 429 ++++++++++++++++++++++++++++ drivers/vhost/vhost.c | 664 ++++++++++++++++++++++++++++++++++++++++++++ drivers/vhost/vhost.h | 108 +++++++ include/linux/Kbuild | 1 + include/linux/miscdevice.h | 1 + include/linux/vhost.h | 100 +++++++ 11 files changed, 1328 insertions(+), 0 deletions(-) create mode 100644 drivers/vhost/Kconfig create mode 100644 drivers/vhost/Makefile create mode 100644 drivers/vhost/net.c create mode 100644 drivers/vhost/vhost.c create mode 100644 drivers/vhost/vhost.h create mode 100644 include/linux/vhost.h diff --git a/MAINTAINERS b/MAINTAINERS index b1114cf..de4587f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5431,6 +5431,16 @@ S: Maintained F: Documentation/filesystems/vfat.txt F: fs/fat/ +VIRTIO HOST (VHOST) +P: Michael S. Tsirkin +M: mst@redhat.com +L: kvm@vger.kernel.org +L: virtualization@lists.osdl.org +L: netdev@vger.kernel.org +S: Maintained +F: drivers/vhost/ +F: include/linux/vhost.h + VIA RHINE NETWORK DRIVER M: Roger Luethi <rl@hellgate.ch> S: Maintained diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index b84e571..94f44d9 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -64,6 +64,7 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. 
+source drivers/vhost/Kconfig source drivers/lguest/Kconfig source drivers/virtio/Kconfig diff --git a/drivers/Makefile b/drivers/Makefile index bc4205d..1551ae1 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -105,6 +105,7 @@ obj-$(CONFIG_HID) += hid/ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ +obj-$(CONFIG_VHOST_NET) += vhost/ obj-$(CONFIG_VIRTIO) += virtio/ obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_STAGING) += staging/ diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig new file mode 100644 index 0000000..d955406 --- /dev/null +++ b/drivers/vhost/Kconfig @@ -0,0 +1,11 @@ +config VHOST_NET + tristate "Host kernel accelerator for virtio net" + depends on NET && EVENTFD + ---help--- + This kernel module can be loaded in host kernel to accelerate + guest networking with virtio_net. Not to be confused with virtio_net + module itself which needs to be loaded in guest kernel. + + To compile this driver as a module, choose M here: the module will + be called vhost_net. + diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile new file mode 100644 index 0000000..72dd020 --- /dev/null +++ b/drivers/vhost/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_VHOST_NET) += vhost_net.o +vhost_net-y := vhost.o net.o diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c new file mode 100644 index 0000000..64d0c13 --- /dev/null +++ b/drivers/vhost/net.c @@ -0,0 +1,429 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * virtio-net server in host kernel. 
+ */ + +#include <linux/compat.h> +#include <linux/eventfd.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <linux/mmu_context.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/workqueue.h> +#include <linux/rcupdate.h> +#include <linux/file.h> + +#include <linux/net.h> +#include <linux/if_packet.h> +#include <linux/if_arp.h> + +#include <net/sock.h> + +#include "vhost.h" + +enum { + VHOST_NET_VQ_RX = 0, + VHOST_NET_VQ_TX = 1, + VHOST_NET_VQ_MAX = 2, +}; + +struct vhost_net { + struct vhost_dev dev; + struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; + /* We use a kind of RCU to access sock pointer. + * All readers access it from workqueue, which makes it possible to + * flush the workqueue instead of synchronize_rcu. Therefore readers do + * not need to call rcu_read_lock/rcu_read_unlock: the beginning of + * work item execution acts instead of rcu_read_lock() and the end of + * work item execution acts instead of rcu_read_lock(). + * Writers use device mutex. */ + struct socket *sock; + struct vhost_poll poll[VHOST_NET_VQ_MAX]; +}; + +/* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. */ +static void handle_tx(struct vhost_net *net) +{ + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX]; + unsigned head, out, in; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, + .msg_controllen = 0, + .msg_iov = (struct iovec *)vq->iov + 1, + .msg_flags = MSG_DONTWAIT, + }; + size_t len; + int err; + struct socket *sock = rcu_dereference(net->sock); + if (!sock || !sock_writeable(sock->sk)) + return; + + use_mm(net->dev.mm); + mutex_lock(&vq->mutex); + for (;;) { + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, &out, &in); + /* Nothing new? Wait for eventfd to tell us they refilled. 
*/ + if (head == vq->num) + break; + if (out <= 1 || in) { + vq_err(vq, "Unexpected descriptor format for TX: " + "out %d, int %d\n", out, in); + break; + } + /* Sanity check */ + if (vq->iov->iov_len != sizeof(struct virtio_net_hdr)) { + vq_err(vq, "Unexpected header len for TX: " + "%ld expected %zd\n", vq->iov->iov_len, + sizeof(struct virtio_net_hdr)); + break; + } + /* Skip header. TODO: support TSO. */ + msg.msg_iovlen = out - 1; + len = iov_length(vq->iov + 1, out - 1); + /* TODO: Check specific error and bomb out unless ENOBUFS? */ + err = sock->ops->sendmsg(NULL, sock, &msg, len); + if (err < 0) { + vhost_discard_vq_desc(vq); + break; + } + if (err != len) + pr_err("Truncated TX packet: " + " len %d != %zd\n", err, len); + vhost_add_used_and_trigger(vq, head, + len + sizeof(struct virtio_net_hdr)); + } + + mutex_unlock(&vq->mutex); + unuse_mm(net->dev.mm); +} + +/* Expects to be always run from workqueue - which acts as + * read-size critical section for our kind of RCU. */ +static void handle_rx(struct vhost_net *net) +{ + struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX]; + unsigned head, out, in; + struct msghdr msg = { + .msg_name = NULL, + .msg_namelen = 0, + .msg_control = NULL, /* FIXME: get and handle RX aux data. 
*/ + .msg_controllen = 0, + .msg_iov = vq->iov + 1, + .msg_flags = MSG_DONTWAIT, + }; + + struct virtio_net_hdr hdr = { + .flags = 0, + .gso_type = VIRTIO_NET_HDR_GSO_NONE + }; + + size_t len; + int err; + struct socket *sock = rcu_dereference(net->sock); + if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue)) + return; + + use_mm(net->dev.mm); + mutex_lock(&vq->mutex); + + for (;;) { + head = vhost_get_vq_desc(&net->dev, vq, vq->iov, &out, &in); + if (head == vq->num) + break; + if (in <= 1 || out) { + vq_err(vq, "Unexpected descriptor format for RX: " + "out %d, int %d\n", + out, in); + break; + } + /* Sanity check */ + if (vq->iov->iov_len != sizeof(struct virtio_net_hdr)) { + vq_err(vq, "Unexpected header len for RX: " + "%ld expected %zd\n", + vq->iov->iov_len, sizeof(struct virtio_net_hdr)); + break; + } + /* Skip header. TODO: support TSO/mergeable rx buffers. */ + msg.msg_iovlen = in - 1; + len = iov_length(vq->iov + 1, in - 1); + err = sock->ops->recvmsg(NULL, sock, &msg, + len, MSG_DONTWAIT | MSG_TRUNC); + /* TODO: Check specific error and bomb out unless EAGAIN? */ + if (err < 0) { + vhost_discard_vq_desc(vq); + break; + } + /* TODO: Should check and handle checksum. 
*/ + if (err > len) { + pr_err("Discarded truncated rx packet: " + " len %d > %zd\n", err, len); + vhost_discard_vq_desc(vq); + continue; + } + len = err; + err = copy_to_user(vq->iov->iov_base, &hdr, sizeof hdr); + if (err) { + vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n", + vq->iov->iov_base, err); + break; + } + vhost_add_used_and_trigger(vq, head, len + sizeof hdr); + } + + mutex_unlock(&vq->mutex); + unuse_mm(net->dev.mm); +} + +static void handle_tx_kick(struct work_struct *work) +{ + struct vhost_virtqueue *vq; + struct vhost_net *net; + vq = container_of(work, struct vhost_virtqueue, poll.work); + net = container_of(vq->dev, struct vhost_net, dev); + handle_tx(net); +} + +static void handle_rx_kick(struct work_struct *work) +{ + struct vhost_virtqueue *vq; + struct vhost_net *net; + vq = container_of(work, struct vhost_virtqueue, poll.work); + net = container_of(vq->dev, struct vhost_net, dev); + handle_rx(net); +} + +static void handle_tx_net(struct work_struct *work) +{ + struct vhost_net *net; + net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_TX].work); + handle_tx(net); +} + +static void handle_rx_net(struct work_struct *work) +{ + struct vhost_net *net; + net = container_of(work, struct vhost_net, poll[VHOST_NET_VQ_RX].work); + handle_rx(net); +} + +static int vhost_net_open(struct inode *inode, struct file *f) +{ + struct vhost_net *n = kzalloc(sizeof *n, GFP_KERNEL); + int r; + if (!n) + return -ENOMEM; + f->private_data = n; + n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; + n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; + r = vhost_dev_init(&n->dev, n->vqs, VHOST_NET_VQ_MAX); + if (r < 0) { + kfree(n); + return r; + } + + vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT); + vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN); + return 0; +} + +static struct socket *vhost_net_stop(struct vhost_net *n) +{ + struct socket *sock = n->sock; + rcu_assign_pointer(n->sock, NULL); + if 
(sock) { + vhost_poll_flush(n->poll + VHOST_NET_VQ_TX); + vhost_poll_flush(n->poll + VHOST_NET_VQ_RX); + } + return sock; +} + +static int vhost_net_release(struct inode *inode, struct file *f) +{ + struct vhost_net *n = f->private_data; + struct socket *sock; + + sock = vhost_net_stop(n); + vhost_dev_cleanup(&n->dev); + if (sock) + fput(sock->file); + kfree(n); + return 0; +} + +static long vhost_net_set_socket(struct vhost_net *n, int fd) +{ + struct { + struct sockaddr_ll sa; + char buf[MAX_ADDR_LEN]; + } uaddr; + struct socket *sock, *oldsock = NULL; + int uaddr_len = sizeof uaddr, r; + + mutex_lock(&n->dev.mutex); + r = vhost_dev_check_owner(&n->dev); + if (r) + goto done; + + if (fd == -1) { + /* Disconnect from socket and device. */ + oldsock = vhost_net_stop(n); + goto done; + } + + sock = sockfd_lookup(fd, &r); + if (!sock) { + r = -ENOTSOCK; + goto done; + } + + /* Parameter checking */ + if (sock->sk->sk_type != SOCK_RAW) { + r = -ESOCKTNOSUPPORT; + goto done; + } + + r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, + &uaddr_len, 0); + if (r) + goto done; + + if (uaddr.sa.sll_family != AF_PACKET) { + r = -EPFNOSUPPORT; + goto done; + } + + /* start polling new socket */ + if (sock == oldsock) + goto done; + + if (oldsock) { + vhost_poll_stop(n->poll + VHOST_NET_VQ_TX); + vhost_poll_stop(n->poll + VHOST_NET_VQ_RX); + } + oldsock = n->sock; + rcu_assign_pointer(n->sock, sock); + vhost_poll_start(n->poll + VHOST_NET_VQ_TX, sock->file); + vhost_poll_start(n->poll + VHOST_NET_VQ_RX, sock->file); +done: + mutex_unlock(&n->dev.mutex); + if (oldsock) { + vhost_poll_flush(n->poll + VHOST_NET_VQ_TX); + vhost_poll_flush(n->poll + VHOST_NET_VQ_RX); + vhost_poll_flush(&n->dev.vqs[VHOST_NET_VQ_TX].poll); + vhost_poll_flush(&n->dev.vqs[VHOST_NET_VQ_RX].poll); + fput(oldsock->file); + } + return r; +} + +static long vhost_net_reset_owner(struct vhost_net *n) +{ + struct socket *sock = NULL; + long r; + mutex_lock(&n->dev.mutex); + r = 
vhost_dev_check_owner(&n->dev); + if (r) + goto done; + sock = vhost_net_stop(n); + r = vhost_dev_reset_owner(&n->dev); +done: + mutex_unlock(&n->dev.mutex); + if (sock) + fput(sock->file); + return r; +} + +static long vhost_net_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + struct vhost_net *n = f->private_data; + void __user *argp = (void __user *)arg; + u32 __user *featurep = argp; + int __user *fdp = argp; + u32 features; + int fd, r; + switch (ioctl) { + case VHOST_NET_SET_SOCKET: + r = get_user(fd, fdp); + if (r < 0) + return r; + return vhost_net_set_socket(n, fd); + case VHOST_GET_FEATURES: + /* No features for now */ + features = 0; + return put_user(features, featurep); + case VHOST_ACK_FEATURES: + r = get_user(features, featurep); + /* No features for now */ + if (r < 0) + return r; + if (features) + return -EOPNOTSUPP; + return 0; + case VHOST_RESET_OWNER: + return vhost_net_reset_owner(n); + default: + return vhost_dev_ioctl(&n->dev, ioctl, arg); + } +} + +#ifdef CONFIG_COMPAT +static long vhost_net_compat_ioctl(struct file *f, unsigned int ioctl, + unsigned long arg) +{ + return vhost_net_ioctl(f, ioctl, (unsigned long)compat_ptr(arg)); +} +#endif + +const static struct file_operations vhost_net_fops = { + .owner = THIS_MODULE, + .release = vhost_net_release, + .unlocked_ioctl = vhost_net_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = vhost_net_compat_ioctl, +#endif + .open = vhost_net_open, +}; + +static struct miscdevice vhost_net_misc = { + VHOST_NET_MINOR, + "vhost-net", + &vhost_net_fops, +}; + +int vhost_net_init(void) +{ + int r = vhost_init(); + if (r) + goto err_init; + r = misc_register(&vhost_net_misc); + if (r) + goto err_reg; + return 0; +err_reg: + vhost_cleanup(); +err_init: + return r; + +} +module_init(vhost_net_init); + +void vhost_net_exit(void) +{ + misc_deregister(&vhost_net_misc); + vhost_cleanup(); +} +module_exit(vhost_net_exit); + +MODULE_VERSION("0.0.1"); +MODULE_LICENSE("GPL v2"); 
+MODULE_AUTHOR("Michael S. Tsirkin"); +MODULE_DESCRIPTION("Host kernel accelerator for virtio net"); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c new file mode 100644 index 0000000..e14169f --- /dev/null +++ b/drivers/vhost/vhost.c @@ -0,0 +1,664 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * Copyright (C) 2006 Rusty Russell IBM Corporation + * + * Author: Michael S. Tsirkin <mst@redhat.com> + * + * Inspiration, some code, and most witty comments come from + * Documentation/lguest/lguest.c, by Rusty Russell + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Generic code for virtio server in host kernel. + */ + +#include <linux/eventfd.h> +#include <linux/vhost.h> +#include <linux/virtio_net.h> +#include <linux/mm.h> +#include <linux/miscdevice.h> +#include <linux/mutex.h> +#include <linux/workqueue.h> +#include <linux/rcupdate.h> +#include <linux/poll.h> +#include <linux/file.h> + +#include <linux/net.h> +#include <linux/if_packet.h> +#include <linux/if_arp.h> + +#include <net/sock.h> + +#include "vhost.h" + +enum { + VHOST_MEMORY_MAX_NREGIONS = 64, +}; + +static struct workqueue_struct *vhost_workqueue; + +static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, + poll_table *pt) +{ + struct vhost_poll *poll; + poll = container_of(pt, struct vhost_poll, table); + + poll->wqh = wqh; + add_wait_queue(wqh, &poll->wait); +} + +static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + struct vhost_poll *poll; + poll = container_of(wait, struct vhost_poll, wait); + if (!((unsigned long)key & poll->mask)) + return 0; + + queue_work(vhost_workqueue, &poll->work); + return 0; +} + +/* Init poll structure */ +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, + unsigned long mask) +{ + INIT_WORK(&poll->work, func); + init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup); + init_poll_funcptr(&poll->table, vhost_poll_func); + poll->mask = mask; +} + +/* Start polling a 
file. We add ourselves to file's wait queue. The user must + * keep a reference to a file until after vhost_poll_stop is called. */ +void vhost_poll_start(struct vhost_poll *poll, struct file *file) +{ + unsigned long mask; + mask = file->f_op->poll(file, &poll->table); + if (mask) + vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask); +} + +/* Stop polling a file. After this function returns, it becomes safe to drop the + * file reference. You must also flush afterwards. */ +void vhost_poll_stop(struct vhost_poll *poll) +{ + remove_wait_queue(poll->wqh, &poll->wait); +} + +/* Flush any work that has been scheduled. When calling this, don't hold any + * locks that are also used by the callback. */ +void vhost_poll_flush(struct vhost_poll *poll) +{ + flush_work(&poll->work); +} + +long vhost_dev_init(struct vhost_dev *dev, + struct vhost_virtqueue *vqs, int nvqs) +{ + int i; + dev->vqs = vqs; + dev->nvqs = nvqs; + mutex_init(&dev->mutex); + + for (i = 0; i < dev->nvqs; ++i) { + dev->vqs[i].dev = dev; + mutex_init(&dev->vqs[i].mutex); + if (dev->vqs[i].handle_kick) + vhost_poll_init(&dev->vqs[i].poll, + dev->vqs[i].handle_kick, + POLLIN); + } + return 0; +} + +/* User should have device mutex */ +long vhost_dev_check_owner(struct vhost_dev *dev) +{ + return dev->mm == current->mm ? 0 : -EPERM; +} + +/* User should have device mutex */ +static long vhost_dev_set_owner(struct vhost_dev *dev) +{ + if (dev->mm) + return -EBUSY; + dev->mm = get_task_mm(current); + return 0; +} + +/* User should have device mutex */ +long vhost_dev_reset_owner(struct vhost_dev *dev) +{ + struct vhost_memory *memory; + + /* Restore memory to default 1:1 mapping. 
*/ + memory = kmalloc(offsetof(struct vhost_memory, regions) + + 2 * sizeof *memory->regions, GFP_KERNEL); + if (!memory) + return -ENOMEM; + + vhost_dev_cleanup(dev); + + memory->nregions = 2; + memory->regions[0].guest_phys_addr = 1; + memory->regions[0].userspace_addr = 1; + memory->regions[0].memory_size = ~0ULL; + memory->regions[1].guest_phys_addr = 0; + memory->regions[1].userspace_addr = 0; + memory->regions[1].memory_size = 1; + dev->memory = memory; + return 0; +} + +/* User should have device mutex */ +void vhost_dev_cleanup(struct vhost_dev *dev) +{ + int i; + for (i = 0; i < dev->nvqs; ++i) { + if (dev->vqs[i].kick && dev->vqs[i].handle_kick) { + vhost_poll_stop(&dev->vqs[i].poll); + vhost_poll_flush(&dev->vqs[i].poll); + } + if (dev->vqs[i].error_ctx) + eventfd_ctx_put(dev->vqs[i].error_ctx); + if (dev->vqs[i].error) + fput(dev->vqs[i].error); + if (dev->vqs[i].kick) + fput(dev->vqs[i].kick); + if (dev->vqs[i].call_ctx) + eventfd_ctx_put(dev->vqs[i].call_ctx); + if (dev->vqs[i].call) + fput(dev->vqs[i].call); + dev->vqs[i].error_ctx = NULL; + dev->vqs[i].error = NULL; + dev->vqs[i].kick = NULL; + dev->vqs[i].call_ctx = NULL; + dev->vqs[i].call = NULL; + } + /* No one will access memory at this point */ + kfree(dev->memory); + dev->memory = NULL; + if (dev->mm) + mmput(dev->mm); + dev->mm = NULL; +} + +static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) +{ + struct vhost_memory mem, *newmem, *oldmem; + unsigned long size = offsetof(struct vhost_memory, regions); + long r; + r = copy_from_user(&mem, m, size); + if (r) + return r; + if (mem.padding) + return -EOPNOTSUPP; + if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS) + return -E2BIG; + newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL); + if (!newmem) + return -ENOMEM; + + memcpy(newmem, &mem, size); + r = copy_from_user(newmem->regions, m->regions, + mem.nregions * sizeof *m->regions); + if (r) { + kfree(newmem); + return r; + } + oldmem = d->memory; + 
rcu_assign_pointer(d->memory, newmem); + synchronize_rcu(); + kfree(oldmem); + return 0; +} + +static int init_used(struct vhost_virtqueue *vq) +{ + u16 flags = 0; + int r = put_user(flags, &vq->used->flags); + if (r) + return r; + return get_user(vq->last_used_idx, &vq->used->idx); +} + +static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) +{ + struct file *eventfp, *filep = NULL, + *pollstart = NULL, *pollstop = NULL; + struct eventfd_ctx *ctx = NULL; + u32 __user *idxp = argp; + struct vhost_virtqueue *vq; + struct vhost_vring_state s; + struct vhost_vring_file f; + struct vhost_vring_addr a; + u32 idx; + long r; + + r = get_user(idx, idxp); + if (r < 0) + return r; + if (idx > d->nvqs) + return -ENOBUFS; + + vq = d->vqs + idx; + + mutex_lock(&vq->mutex); + + switch (ioctl) { + case VHOST_SET_VRING_NUM: + r = copy_from_user(&s, argp, sizeof s); + if (r < 0) + break; + if (s.num > 0xffff) { + r = -EINVAL; + break; + } + vq->num = s.num; + break; + case VHOST_SET_VRING_BASE: + r = copy_from_user(&s, argp, sizeof s); + if (r < 0) + break; + if (s.num > 0xffff) { + r = -EINVAL; + break; + } + vq->last_avail_idx = s.num; + break; + case VHOST_GET_VRING_BASE: + s.index = idx; + s.num = vq->last_avail_idx; + r = copy_to_user(argp, &s, sizeof s); + break; + case VHOST_SET_VRING_DESC: + r = copy_from_user(&a, argp, sizeof a); + if (r < 0) + break; + if (a.padding) { + r = -EOPNOTSUPP; + break; + } + if ((u64)(long)a.user_addr != a.user_addr) { + r = -EFAULT; + break; + } + vq->desc = (void __user *)(long)a.user_addr; + break; + case VHOST_SET_VRING_AVAIL: + r = copy_from_user(&a, argp, sizeof a); + if (r < 0) + break; + if (a.padding) { + r = -EOPNOTSUPP; + break; + } + if ((u64)(long)a.user_addr != a.user_addr) { + r = -EFAULT; + break; + } + vq->avail = (void __user *)(long)a.user_addr; + break; + case VHOST_SET_VRING_USED: + r = copy_from_user(&a, argp, sizeof a); + if (r < 0) + break; + if (a.padding) { + r = -EOPNOTSUPP; + break; + } + if 
((u64)(long)a.user_addr != a.user_addr) { + r = -EFAULT; + break; + } + vq->used = (void __user *)(long)a.user_addr; + r = init_used(vq); + if (r) + break; + break; + case VHOST_SET_VRING_KICK: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->kick) { + pollstop = filep = vq->kick; + pollstart = vq->kick = eventfp; + } else + filep = eventfp; + break; + case VHOST_SET_VRING_CALL: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->call) { + filep = vq->call; + ctx = vq->call_ctx; + vq->call = eventfp; + vq->call_ctx = eventfp ? + eventfd_ctx_fileget(eventfp) : NULL; + } else + filep = eventfp; + break; + case VHOST_SET_VRING_ERR: + r = copy_from_user(&f, argp, sizeof f); + if (r < 0) + break; + eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); + if (IS_ERR(eventfp)) + return PTR_ERR(eventfp); + if (eventfp != vq->error) { + filep = vq->error; + vq->error = eventfp; + ctx = vq->error_ctx; + vq->error_ctx = eventfp ? 
+ eventfd_ctx_fileget(eventfp) : NULL; + } else + filep = eventfp; + break; + default: + r = -ENOIOCTLCMD; + } + + if (pollstop && vq->handle_kick) + vhost_poll_stop(&vq->poll); + + if (ctx) + eventfd_ctx_put(ctx); + if (filep) + fput(filep); + + if (pollstart && vq->handle_kick) + vhost_poll_start(&vq->poll, vq->kick); + + mutex_unlock(&vq->mutex); + + if (pollstop && vq->handle_kick) + vhost_poll_flush(&vq->poll); + return 0; +} + +long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + long r; + + mutex_lock(&d->mutex); + if (ioctl == VHOST_SET_OWNER) { + r = vhost_dev_set_owner(d); + goto done; + } + + r = vhost_dev_check_owner(d); + if (r) + goto done; + + switch (ioctl) { + case VHOST_SET_MEM_TABLE: + r = vhost_set_memory(d, argp); + break; + default: + r = vhost_set_vring(d, ioctl, argp); + break; + } +done: + mutex_unlock(&d->mutex); + return r; +} + +static const struct vhost_memory_region *find_region(struct vhost_memory *mem, + __u64 addr, __u32 len) +{ + struct vhost_memory_region *reg; + int i; + /* linear search is not brilliant, but we really have on the order of 6 + * regions in practice */ + for (i = 0; i < mem->nregions; ++i) { + reg = mem->regions + i; + if (reg->guest_phys_addr <= addr && + reg->guest_phys_addr + reg->memory_size - 1 >= addr) + return reg; + } + return NULL; +} + +/* FIXME: this does not handle a region that spans multiple + * address/len pairs */ +int translate_desc(struct vhost_dev *dev, u64 addr, u32 len, + struct iovec iov[], int iov_count, int iov_size, + unsigned *num) +{ + const struct vhost_memory_region *reg; + struct vhost_memory *mem; + struct iovec *_iov; + u64 s = 0; + int ret = 0; + + rcu_read_lock(); + + mem = rcu_dereference(dev->memory); + while ((u64)len > s) { + u64 size; + if (*num + iov_count >= iov_size) { + ret = -ENOBUFS; + break; + } + reg = find_region(mem, addr, len); + if (!reg) { + ret = -EFAULT; + break; + } + _iov = iov + 
iov_count + *num; + size = reg->memory_size - addr + reg->guest_phys_addr; + _iov->iov_len = min((u64)len, size); + _iov->iov_base = (void *) + (reg->userspace_addr + addr - reg->guest_phys_addr); + s += size; + addr += size; + ++*num; + } + + rcu_read_unlock(); + return ret; +} + +/* Each buffer in the virtqueues is actually a chain of descriptors. This + * function returns the next descriptor in the chain, or vq->num if we're + * at the end. */ +static unsigned next_desc(struct vhost_virtqueue *vq, struct vring_desc *desc) +{ + unsigned int next; + + /* If this descriptor says it doesn't chain, we're done. */ + if (!(desc->flags & VRING_DESC_F_NEXT)) + return vq->num; + + /* Check they're not leading us off end of descriptors. */ + next = desc->next; + /* Make sure compiler knows to grab that: we don't want it changing! */ + /* We will use the result as an index in an array, so most + * architectures only need a compiler barrier here. */ + read_barrier_depends(); + + if (next >= vq->num) { + vq_err(vq, "Desc next is %u > %u", next, vq->num); + return vq->num; + } + + return next; +} + +/* This looks in the virtqueue for the first available buffer, and converts + * it to an iovec for convenient access. Since descriptors consist of some + * number of output then some number of input descriptors, it's actually two + * iovecs, but we pack them into one and note how many of each there were. + * + * This function returns the descriptor number found, or vq->num (which + * is never a valid descriptor number) if none was found. */ +unsigned vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num) +{ + struct vring_desc desc; + unsigned int i, head; + u16 last_avail_idx, idx; + + /* Check it isn't doing very strange things with descriptor numbers. 
*/ + last_avail_idx = vq->last_avail_idx; + if (get_user(idx, &vq->avail->idx)) { + vq_err(vq, "Failed to access avail idx at %p\n", + &vq->avail->idx); + return vq->num; + } + + if ((u16)(idx - last_avail_idx) > vq->num) { + vq_err(vq, "Guest moved used index from %u to %u", + last_avail_idx, idx); + return vq->num; + } + + /* If there's nothing new since last we looked, return invalid. */ + if (idx == last_avail_idx) + return vq->num; + + /* Grab the next descriptor number they're advertising, and increment + * the index we've seen. */ + if (get_user(head, &vq->avail->ring[last_avail_idx % vq->num])) { + vq_err(vq, "Failed to read head: idx %d address %p\n", + idx, &vq->avail->ring[last_avail_idx % vq->num]); + return vq->num; + } + + /* If their number is silly, that's a fatal mistake. */ + if (head >= vq->num) { + vq_err(vq, "Guest says index %u > %u is available", + head, vq->num); + return vq->num; + } + + vq->last_avail_idx++; + + /* When we start there are none of either input nor output. */ + *out_num = *in_num = 0; + + i = head; + do { + unsigned *num; + unsigned iov_count; + if (copy_from_user(&desc, vq->desc + i, sizeof desc)) { + vq_err(vq, "Failed to get descriptor: idx %d addr %p\n", + i, vq->desc + i); + return vq->num; + } + /* If this is an input descriptor, increment that count. */ + if (desc.flags & VRING_DESC_F_WRITE) { + num = in_num; + iov_count = *out_num; + } else { + /* If it's an output descriptor, they're all supposed + * to come before any input descriptors. */ + if (*in_num) { + vq_err(vq, "Descriptor has out after in: " + "idx %d\n", i); + return vq->num; + } + num = out_num; + iov_count = *in_num; + } + if (translate_desc(dev, desc.addr, desc.len, iov, iov_count, + VHOST_NET_MAX_SG, num)) { + vq_err(vq, "Failed to translate descriptor: idx %d\n", + i); + return vq->num; + } + + /* If we've got too many, that implies a descriptor loop. 
*/ + if (*out_num + *in_num > vq->num) { + vq_err(vq, "Looped descriptor: idx %d\n", i); + return vq->num; + } + } while ((i = next_desc(vq, &desc)) != vq->num); + + vq->inflight++; + return head; +} + +/* Reverse the effect of vhost_get_vq_desc. Useful for error handling. */ +void vhost_discard_vq_desc(struct vhost_virtqueue *vq) +{ + vq->last_avail_idx--; + vq->inflight--; +} + +/* After we've used one of their buffers, we tell them about it. We'll then + * want to send them an interrupt, using vq->call. */ +int vhost_add_used(struct vhost_virtqueue *vq, + unsigned int head, int len) +{ + struct vring_used_elem *used; + + /* The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. */ + used = &vq->used->ring[vq->last_used_idx % vq->num]; + if (put_user(head, &used->id)) { + vq_err(vq, "Failed to write used id"); + return -EFAULT; + } + if (put_user(len, &used->len)) { + vq_err(vq, "Failed to write used len"); + return -EFAULT; + } + /* Make sure buffer is written before we update index. */ + wmb(); + if (put_user(vq->last_used_idx + 1, &vq->used->idx)) { + vq_err(vq, "Failed to increment used idx"); + return -EFAULT; + } + vq->last_used_idx++; + vq->inflight--; + return 0; +} + +/* This actually sends the interrupt for this virtqueue */ +void vhost_trigger_irq(struct vhost_virtqueue *vq) +{ + __u16 flags = 0; + if (get_user(flags, &vq->avail->flags)) { + vq_err(vq, "Failed to get flags"); + return; + } + + /* If they don't want an interrupt, don't send one, unless empty. */ + if ((flags & VRING_AVAIL_F_NO_INTERRUPT) && vq->inflight) + return; + + /* Send the Guest an interrupt tell them we used something up. */ + if (vq->call_ctx) + eventfd_signal(vq->call_ctx, 1); +} + +/* And here's the combo meal deal. Supersize me! 
*/ +void vhost_add_used_and_trigger(struct vhost_virtqueue *vq, + unsigned int head, int len) +{ + vhost_add_used(vq, head, len); + vhost_trigger_irq(vq); +} + +int vhost_init(void) +{ + vhost_workqueue = create_workqueue("vhost"); + if (!vhost_workqueue) + return -ENOMEM; + return 0; +} + +void vhost_cleanup(void) +{ + destroy_workqueue(vhost_workqueue); +} diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h new file mode 100644 index 0000000..7f7ffcd --- /dev/null +++ b/drivers/vhost/vhost.h @@ -0,0 +1,108 @@ +#ifndef _VHOST_H +#define _VHOST_H + +#include <linux/eventfd.h> +#include <linux/vhost.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/workqueue.h> +#include <linux/poll.h> +#include <linux/file.h> +#include <linux/skbuff.h> + +struct vhost_device; + +enum { + VHOST_NET_MAX_SG = MAX_SKB_FRAGS + 2, +}; + +/* Poll a file (eventfd or socket) */ +/* Note: there's nothing vhost specific about this structure. */ +struct vhost_poll { + poll_table table; + wait_queue_head_t *wqh; + wait_queue_t wait; + /* struct which will handle all actual work. */ + struct work_struct work; + unsigned long mask; +}; + +void vhost_poll_init(struct vhost_poll *poll, work_func_t func, + unsigned long mask); +void vhost_poll_start(struct vhost_poll *poll, struct file *file); +void vhost_poll_stop(struct vhost_poll *poll); +void vhost_poll_flush(struct vhost_poll *poll); + +/* The virtqueue structure describes a queue attached to a device. */ +struct vhost_virtqueue { + struct vhost_dev *dev; + + /* The actual ring of buffers. */ + struct mutex mutex; + unsigned int num; + struct vring_desc __user *desc; + struct vring_avail __user *avail; + struct vring_used __user *used; + struct file *kick; + struct file *call; + struct file *error; + struct eventfd_ctx *call_ctx; + struct eventfd_ctx *error_ctx; + + struct vhost_poll poll; + + /* The routine to call when the Guest pings us, or timeout. */ + work_func_t handle_kick; + + /* Last available index we saw. 
*/ + u16 last_avail_idx; + + /* Last index we used. */ + u16 last_used_idx; + + /* Outstanding buffers */ + unsigned int inflight; + + /* Is this blocked? */ + bool blocked; + + struct iovec iov[VHOST_NET_MAX_SG]; + +} ____cacheline_aligned; + +struct vhost_dev { + /* Readers use RCU to access memory table pointer. + * Writers use mutex below.*/ + struct vhost_memory *memory; + struct mm_struct *mm; + struct vhost_virtqueue *vqs; + int nvqs; + struct mutex mutex; +}; + +long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs); +long vhost_dev_check_owner(struct vhost_dev *); +long vhost_dev_reset_owner(struct vhost_dev *); +void vhost_dev_cleanup(struct vhost_dev *); +long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, unsigned long arg); + +unsigned vhost_get_vq_desc(struct vhost_dev *, struct vhost_virtqueue *, + struct iovec iov[], + unsigned int *out_num, unsigned int *in_num); +void vhost_discard_vq_desc(struct vhost_virtqueue *); + +int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); +void vhost_trigger_irq(struct vhost_virtqueue *); +void vhost_add_used_and_trigger(struct vhost_virtqueue *, + unsigned int head, int len); + +int vhost_init(void); +void vhost_cleanup(void); + +#define vq_err(vq, fmt, ...) 
do { \ + printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__); \ + if ((vq)->error_ctx) \ + eventfd_signal((vq)->error_ctx, 1);\ + } while (0) + +#endif diff --git a/include/linux/Kbuild b/include/linux/Kbuild index dec2f18..975df9a 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -360,6 +360,7 @@ unifdef-y += uio.h unifdef-y += unistd.h unifdef-y += usbdevice_fs.h unifdef-y += utsname.h +unifdef-y += vhost.h unifdef-y += videodev2.h unifdef-y += videodev.h unifdef-y += virtio_config.h diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 0521177..781a8bb 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -30,6 +30,7 @@ #define HPET_MINOR 228 #define FUSE_MINOR 229 #define KVM_MINOR 232 +#define VHOST_NET_MINOR 233 #define MISC_DYNAMIC_MINOR 255 struct device; diff --git a/include/linux/vhost.h b/include/linux/vhost.h new file mode 100644 index 0000000..9ec6d5f --- /dev/null +++ b/include/linux/vhost.h @@ -0,0 +1,100 @@ +#ifndef _LINUX_VHOST_H +#define _LINUX_VHOST_H +/* Userspace interface for in-kernel virtio accelerators. */ + +/* vhost is used to reduce the number of system calls involved in virtio. + * + * Existing virtio net code is used in the guest without modification. + * + * This header includes interface used by userspace hypervisor for + * device configuration. + */ + +#include <linux/types.h> +#include <linux/compiler.h> +#include <linux/ioctl.h> +#include <linux/virtio_config.h> +#include <linux/virtio_ring.h> + +struct vhost_vring_state { + unsigned int index; + unsigned int num; +}; + +struct vhost_vring_file { + unsigned int index; + int fd; +}; + +struct vhost_vring_addr { + unsigned int index; + unsigned int padding; + __u64 user_addr; +}; + +struct vhost_memory_region { + __u64 guest_phys_addr; + __u64 memory_size; /* bytes */ + __u64 userspace_addr; + __u64 padding; /* read/write protection? 
*/ +}; + +struct vhost_memory { + __u32 nregions; + __u32 padding; + struct vhost_memory_region regions[0]; +}; + +/* ioctls */ + +#define VHOST_VIRTIO 0xAF + +/* Features bitmask for forward compatibility. Transport bits must be zero. */ +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u32) +#define VHOST_ACK_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u32) + +/* Set current process as the (exclusive) owner of this file descriptor. This + * must be called before any other vhost command. Further calls to + * VHOST_SET_OWNER fail until VHOST_RESET_OWNER is called. */ +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) +/* Give up ownership, and reset the device to default values. + * Allows subsequent call to VHOST_SET_OWNER to succeed. */ +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) + +/* Set up/modify memory layout */ +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory) + +/* Ring setup. These parameters can not be modified while ring is running + * (bound to a device). */ +/* Set number of descriptors in ring */ +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state) +/* Start of array of descriptors (virtually contiguous) */ +#define VHOST_SET_VRING_DESC _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr) +/* Used structure address */ +#define VHOST_SET_VRING_USED _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_addr) +/* Available structure address */ +#define VHOST_SET_VRING_AVAIL _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_addr) +/* Base value where queue looks for available descriptors */ +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) +/* Get accessor: reads index, writes value in num */ +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x14, struct vhost_vring_state) + +/* The following ioctls use eventfd file descriptors to signal and poll + * for events. 
*/ + +/* Set eventfd to poll for added buffers */ +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file) +/* Set eventfd to signal when buffers have been used */ +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file) +/* Set eventfd to signal an error */ +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file) + +/* VHOST_NET specific defines */ + +/* Attach virtio net device to a raw socket. The socket must be already + * bound to an ethernet device, this device will be used for transmit. + * Pass -1 to unbind from the socket and the transmit device. + * This can be used to stop the device (e.g. for migration). */ +#define VHOST_NET_SET_SOCKET _IOW(VHOST_VIRTIO, 0x30, int) + +#endif -- 1.6.2.5 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply related [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-08-31 11:42 ` Xin, Xiaohui @ 2009-08-31 15:23 ` Arnd Bergmann 2009-09-01 14:58 ` Xin, Xiaohui 2009-08-31 17:52 ` Avi Kivity 1 sibling, 1 reply; 83+ messages in thread From: Arnd Bergmann @ 2009-08-31 15:23 UTC (permalink / raw) To: Xin, Xiaohui Cc: mst@redhat.com, netdev@vger.kernel.org, virtualization@lists.linux-foundation.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org, akpm@linux-foundation.org, hpa@zytor.com, gregory.haskins@gmail.com On Monday 31 August 2009, Xin, Xiaohui wrote: > > Hi, Michael > That's a great job. We are now working on support VMDq on KVM, and since the VMDq hardware presents L2 sorting > based on MAC addresses and VLAN tags, our target is to implement a zero copy solution using VMDq. I'm also interested in helping there, please include me in the discussions. > We stared > from the virtio-net architecture. What we want to proposal is to use AIO combined with direct I/O: > 1) Modify virtio-net Backend service in Qemu to submit aio requests composed from virtqueue. right, that sounds useful. > 2) Modify TUN/TAP device to support aio operations and the user space buffer directly mapping into the host kernel. > 3) Let a TUN/TAP device binds to single rx/tx queue from the NIC. I don't think we should do that with the tun/tap driver. By design, tun/tap is a way to interact with the networking stack as if coming from a device. The only way this connects to an external adapter is through a bridge or through IP routing, which means that it does not correspond to a specific NIC. I have worked on a driver I called 'macvtap' in lack of a better name, to add a new tap frontend to the 'macvlan' driver. Since macvlan lets you add slaves to a single NIC device, this gives you a direct connection between one or multiple tap devices to an external NIC, which works a lot better than when you have a bridge inbetween. 
There is also work underway to add a bridging capability to macvlan, so you can communicate directly between guests like you can do with a bridge. Michael's vhost_net can plug into the same macvlan infrastructure, so the work is complementary. > 4) Modify the net_dev and skb structure to permit allocated skb to use user space directly mapped payload > buffer address rather then kernel allocated. yes. > As zero copy is also your goal, we are interested in what's in your mind, and would like to collaborate with you if possible. > BTW, we will send our VMDq write-up very soon. Ok, cool. Arnd <>< -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* RE: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-08-31 15:23 ` Arnd Bergmann @ 2009-09-01 14:58 ` Xin, Xiaohui 0 siblings, 0 replies; 83+ messages in thread From: Xin, Xiaohui @ 2009-09-01 14:58 UTC (permalink / raw) To: Arnd Bergmann Cc: mst@redhat.com, netdev@vger.kernel.org, virtualization@lists.linux-foundation.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org, akpm@linux-foundation.org, hpa@zytor.com, gregory.haskins@gmail.com >I don't think we should do that with the tun/tap driver. By design, tun/tap is a way to interact >with the >networking stack as if coming from a device. The only way this connects to an external >adapter is through >a bridge or through IP routing, which means that it does not correspond to a specific NIC. >I have worked on a driver I called 'macvtap' in lack of a better name, to add a new tap >frontend to >the 'macvlan' driver. Since macvlan lets you add slaves to a single NIC device, this gives you >a direct >connection between one or multiple tap devices to an external NIC, which works a lot better >than when >you have a bridge inbetween. There is also work underway to add a bridging capability to >macvlan, so >you can communicate directly between guests like you can do with a bridge. >Michael's vhost_net can plug into the same macvlan infrastructure, so the work is >complementary. We use TUN/TAP device to implement the prototype, and agree that it's not the only choice here. We'd compare the two if possible. And what we cares more about is the modification in the kernel like the net_dev and skb structures' modifications, thanks. 
Thanks Xiaohui -----Original Message----- From: Arnd Bergmann [mailto:arnd@arndb.de] Sent: Monday, August 31, 2009 11:24 PM To: Xin, Xiaohui Cc: mst@redhat.com; netdev@vger.kernel.org; virtualization@lists.linux-foundation.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org; mingo@elte.hu; linux-mm@kvack.org; akpm@linux-foundation.org; hpa@zytor.com; gregory.haskins@gmail.com Subject: Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server On Monday 31 August 2009, Xin, Xiaohui wrote: > > Hi, Michael > That's a great job. We are now working on support VMDq on KVM, and since the VMDq hardware presents L2 sorting > based on MAC addresses and VLAN tags, our target is to implement a zero copy solution using VMDq. I'm also interested in helping there, please include me in the discussions. > We stared > from the virtio-net architecture. What we want to proposal is to use AIO combined with direct I/O: > 1) Modify virtio-net Backend service in Qemu to submit aio requests composed from virtqueue. right, that sounds useful. > 2) Modify TUN/TAP device to support aio operations and the user space buffer directly mapping into the host kernel. > 3) Let a TUN/TAP device binds to single rx/tx queue from the NIC. I don't think we should do that with the tun/tap driver. By design, tun/tap is a way to interact with the networking stack as if coming from a device. The only way this connects to an external adapter is through a bridge or through IP routing, which means that it does not correspond to a specific NIC. I have worked on a driver I called 'macvtap' in lack of a better name, to add a new tap frontend to the 'macvlan' driver. Since macvlan lets you add slaves to a single NIC device, this gives you a direct connection between one or multiple tap devices to an external NIC, which works a lot better than when you have a bridge inbetween. 
There is also work underway to add a bridging capability to macvlan, so you can communicate directly between guests like you can do with a bridge. Michael's vhost_net can plug into the same macvlan infrastructure, so the work is complementary. > 4) Modify the net_dev and skb structure to permit allocated skb to use user space directly mapped payload > buffer address rather then kernel allocated. yes. > As zero copy is also your goal, we are interested in what's in your mind, and would like to collaborate with you if possible. > BTW, we will send our VMDq write-up very soon. Ok, cool. Arnd <>< -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-08-31 11:42 ` Xin, Xiaohui 2009-08-31 15:23 ` Arnd Bergmann @ 2009-08-31 17:52 ` Avi Kivity 2009-08-31 21:56 ` Anthony Liguori 2009-09-01 5:04 ` Xin, Xiaohui 1 sibling, 2 replies; 83+ messages in thread From: Avi Kivity @ 2009-08-31 17:52 UTC (permalink / raw) To: Xin, Xiaohui Cc: mst@redhat.com, netdev@vger.kernel.org, virtualization@lists.linux-foundation.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org, akpm@linux-foundation.org, hpa@zytor.com, gregory.haskins@gmail.com On 08/31/2009 02:42 PM, Xin, Xiaohui wrote: > Hi, Michael > That's a great job. We are now working on support VMDq on KVM, and since the VMDq hardware presents L2 sorting based on MAC addresses and VLAN tags, our target is to implement a zero copy solution using VMDq. We stared from the virtio-net architecture. What we want to proposal is to use AIO combined with direct I/O: > 1) Modify virtio-net Backend service in Qemu to submit aio requests composed from virtqueue. > 2) Modify TUN/TAP device to support aio operations and the user space buffer directly mapping into the host kernel. > 3) Let a TUN/TAP device binds to single rx/tx queue from the NIC. > 4) Modify the net_dev and skb structure to permit allocated skb to use user space directly mapped payload buffer address rather then kernel allocated. > > As zero copy is also your goal, we are interested in what's in your mind, and would like to collaborate with you if possible. > One way to share the effort is to make vmdq queues available as normal kernel interfaces. It would take quite a bit of work, but the end result is that no other components need to be change, and it makes vmdq useful outside kvm. It also greatly reduces the amount of integration work needed throughout the stack (kvm/qemu/libvirt). -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. 
-- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-08-31 17:52 ` Avi Kivity @ 2009-08-31 21:56 ` Anthony Liguori 2009-09-01 15:37 ` Xin, Xiaohui 2009-09-01 5:04 ` Xin, Xiaohui 1 sibling, 1 reply; 83+ messages in thread From: Anthony Liguori @ 2009-08-31 21:56 UTC (permalink / raw) To: Avi Kivity Cc: Xin, Xiaohui, mst@redhat.com, netdev@vger.kernel.org, virtualization@lists.linux-foundation.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org, akpm@linux-foundation.org, hpa@zytor.com, gregory.haskins@gmail.com Avi Kivity wrote: > On 08/31/2009 02:42 PM, Xin, Xiaohui wrote: >> Hi, Michael >> That's a great job. We are now working on support VMDq on KVM, and >> since the VMDq hardware presents L2 sorting based on MAC addresses >> and VLAN tags, our target is to implement a zero copy solution using >> VMDq. We stared from the virtio-net architecture. What we want to >> proposal is to use AIO combined with direct I/O: >> 1) Modify virtio-net Backend service in Qemu to submit aio requests >> composed from virtqueue. >> 2) Modify TUN/TAP device to support aio operations and the user space >> buffer directly mapping into the host kernel. >> 3) Let a TUN/TAP device binds to single rx/tx queue from the NIC. >> 4) Modify the net_dev and skb structure to permit allocated skb to >> use user space directly mapped payload buffer address rather then >> kernel allocated. >> >> As zero copy is also your goal, we are interested in what's in your >> mind, and would like to collaborate with you if possible. >> > > One way to share the effort is to make vmdq queues available as normal > kernel interfaces. It may be possible to make vmdq appear like an sr-iov capable device from userspace. sr-iov provides the userspace interfaces to allocate interfaces and assign mac addresses. To make it useful, you would have to handle tx multiplexing in the driver but that would be much easier to consume for kvm. 
Regards, Anthony Liguori -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* RE: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-08-31 21:56 ` Anthony Liguori @ 2009-09-01 15:37 ` Xin, Xiaohui 0 siblings, 0 replies; 83+ messages in thread From: Xin, Xiaohui @ 2009-09-01 15:37 UTC (permalink / raw) To: Anthony Liguori, Avi Kivity Cc: mst@redhat.com, netdev@vger.kernel.org, virtualization@lists.linux-foundation.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org, akpm@linux-foundation.org, hpa@zytor.com, gregory.haskins@gmail.com >It may be possible to make vmdq appear like an sr-iov capable device >from userspace. sr-iov provides the userspace interfaces to allocate >interfaces and assign mac addresses. To make it useful, you would have >to handle tx multiplexing in the driver but that would be much easier to >consume for kvm What we have thought is to support multiple net_dev structures according to multiple queue pairs of one vmdq adapter and presents multiple mac address in user space and each one mac can be used by a guest. What does the tx multiplexing in the driver exactly mean? Thanks Xiaohui -----Original Message----- From: Anthony Liguori [mailto:anthony@codemonkey.ws] Sent: Tuesday, September 01, 2009 5:57 AM To: Avi Kivity Cc: Xin, Xiaohui; mst@redhat.com; netdev@vger.kernel.org; virtualization@lists.linux-foundation.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org; mingo@elte.hu; linux-mm@kvack.org; akpm@linux-foundation.org; hpa@zytor.com; gregory.haskins@gmail.com Subject: Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server Avi Kivity wrote: > On 08/31/2009 02:42 PM, Xin, Xiaohui wrote: >> Hi, Michael >> That's a great job. We are now working on support VMDq on KVM, and >> since the VMDq hardware presents L2 sorting based on MAC addresses >> and VLAN tags, our target is to implement a zero copy solution using >> VMDq. We stared from the virtio-net architecture. 
What we want to >> proposal is to use AIO combined with direct I/O: >> 1) Modify virtio-net Backend service in Qemu to submit aio requests >> composed from virtqueue. >> 2) Modify TUN/TAP device to support aio operations and the user space >> buffer directly mapping into the host kernel. >> 3) Let a TUN/TAP device binds to single rx/tx queue from the NIC. >> 4) Modify the net_dev and skb structure to permit allocated skb to >> use user space directly mapped payload buffer address rather then >> kernel allocated. >> >> As zero copy is also your goal, we are interested in what's in your >> mind, and would like to collaborate with you if possible. >> > > One way to share the effort is to make vmdq queues available as normal > kernel interfaces. It may be possible to make vmdq appear like an sr-iov capable device from userspace. sr-iov provides the userspace interfaces to allocate interfaces and assign mac addresses. To make it useful, you would have to handle tx multiplexing in the driver but that would be much easier to consume for kvm. Regards, Anthony Liguori -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
* RE: [PATCHv5 3/3] vhost_net: a kernel-level virtio server 2009-08-31 17:52 ` Avi Kivity 2009-08-31 21:56 ` Anthony Liguori @ 2009-09-01 5:04 ` Xin, Xiaohui 1 sibling, 0 replies; 83+ messages in thread From: Xin, Xiaohui @ 2009-09-01 5:04 UTC (permalink / raw) To: Avi Kivity Cc: mst@redhat.com, netdev@vger.kernel.org, virtualization@lists.linux-foundation.org, kvm@vger.kernel.org, linux-kernel@vger.kernel.org, mingo@elte.hu, linux-mm@kvack.org, akpm@linux-foundation.org, hpa@zytor.com, gregory.haskins@gmail.com > One way to share the effort is to make vmdq queues available as normal kernel interfaces. It would take quite a bit of work, but the end result is that no other components need to be change, and it makes vmdq useful outside kvm. It also greatly reduces the amount of integration work needed throughout the stack (kvm/qemu/libvirt). Yes. The common queue pair interface which we want to present will also apply to normal hardware, and try to leave other components unknown. Thanks Xiaohui -----Original Message----- From: Avi Kivity [mailto:avi@redhat.com] Sent: Tuesday, September 01, 2009 1:52 AM To: Xin, Xiaohui Cc: mst@redhat.com; netdev@vger.kernel.org; virtualization@lists.linux-foundation.org; kvm@vger.kernel.org; linux-kernel@vger.kernel.org; mingo@elte.hu; linux-mm@kvack.org; akpm@linux-foundation.org; hpa@zytor.com; gregory.haskins@gmail.com Subject: Re: [PATCHv5 3/3] vhost_net: a kernel-level virtio server On 08/31/2009 02:42 PM, Xin, Xiaohui wrote: > Hi, Michael > That's a great job. We are now working on support VMDq on KVM, and since the VMDq hardware presents L2 sorting based on MAC addresses and VLAN tags, our target is to implement a zero copy solution using VMDq. We stared from the virtio-net architecture. What we want to proposal is to use AIO combined with direct I/O: > 1) Modify virtio-net Backend service in Qemu to submit aio requests composed from virtqueue. 
> 2) Modify TUN/TAP device to support aio operations and the user space buffer directly mapping into the host kernel. > 3) Let a TUN/TAP device bind to single rx/tx queue from the NIC. > 4) Modify the net_dev and skb structure to permit allocated skb to use user space directly mapped payload buffer address rather than kernel allocated. > > As zero copy is also your goal, we are interested in what's in your mind, and would like to collaborate with you if possible. > One way to share the effort is to make vmdq queues available as normal kernel interfaces. It would take quite a bit of work, but the end result is that no other components need to be changed, and it makes vmdq useful outside kvm. It also greatly reduces the amount of integration work needed throughout the stack (kvm/qemu/libvirt). -- I have a truly marvellous patch that fixes the bug which this signature is too narrow to contain. -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@kvack.org. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a> ^ permalink raw reply [flat|nested] 83+ messages in thread
end of thread, other threads:[~2009-10-03 10:00 UTC | newest]
Thread overview: 83+ messages (download: mbox.gz, follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <cover.1251388414.git.mst@redhat.com>
2009-08-27 16:06 ` [PATCHv5 1/3] mm: export use_mm/unuse_mm to modules Michael S. Tsirkin
2009-08-28 15:31   ` Gregory Haskins
2009-08-27 16:07 ` [PATCHv5 2/3] mm: reduce atomic use on use_mm fast path Michael S. Tsirkin
2009-08-27 16:07 ` [PATCHv5 3/3] vhost_net: a kernel-level virtio server Michael S. Tsirkin
2009-09-03 18:39   ` Ira W. Snyder
2009-09-07 10:15     ` Michael S. Tsirkin
2009-09-08 17:20       ` Ira W. Snyder
2009-09-08 20:14         ` Michael S. Tsirkin
2009-09-11 15:17           ` Xin, Xiaohui
2009-09-13  5:46             ` Michael S. Tsirkin
2009-09-14  5:57               ` Xin, Xiaohui
2009-09-14  7:05                 ` Michael S. Tsirkin
2009-09-11 16:00         ` Gregory Haskins
2009-09-11 16:14           ` Gregory Haskins
2009-09-13 12:01           ` Michael S. Tsirkin
2009-09-14 16:08             ` Gregory Haskins
2009-09-14 16:47               ` Michael S. Tsirkin
2009-09-14 19:14                 ` Gregory Haskins
2009-09-15 12:35                   ` Avi Kivity
2009-09-15 13:03                     ` Gregory Haskins
2009-09-15 13:25                       ` Avi Kivity
2009-09-15 13:50                         ` Gregory Haskins
2009-09-15 14:28                           ` Michael S. Tsirkin
2009-09-15 15:03                           ` Avi Kivity
2009-09-15 20:08                             ` Gregory Haskins
2009-09-15 20:40                               ` Michael S. Tsirkin
2009-09-15 20:43                                 ` Gregory Haskins
2009-09-15 21:25                                   ` Michael S. Tsirkin
2009-09-15 21:39                                     ` Gregory Haskins
2009-09-15 21:38                                       ` Michael S. Tsirkin
2009-09-15 21:55                                         ` Gregory Haskins
2009-09-16 14:57                                     ` Arnd Bergmann
2009-09-16 15:13                                       ` Michael S. Tsirkin
2009-09-16 15:22                                         ` Arnd Bergmann
2009-09-16 16:08                                           ` Michael S. Tsirkin
2009-09-16  8:23                               ` Avi Kivity
2009-09-16 11:44                                 ` Gregory Haskins
2009-09-16 13:05                                   ` Avi Kivity
2009-09-16 14:10                                     ` Gregory Haskins
2009-09-16 15:59                                       ` Avi Kivity
2009-09-16 19:22                                         ` Gregory Haskins
2009-09-16 21:00                                           ` Avi Kivity
2009-09-17  3:11                                             ` Gregory Haskins
2009-09-17  7:49                                               ` Avi Kivity
2009-09-17 14:16                                               ` Javier Guerra
2009-09-21 21:43                                               ` Ira W. Snyder
2009-09-22  9:43                                                 ` Avi Kivity
2009-09-22 15:25                                                   ` Ira W. Snyder
2009-09-22 15:56                                                     ` Avi Kivity
2009-09-23 14:26                                                   ` Gregory Haskins
2009-09-23 14:37                                                     ` Avi Kivity
2009-09-23 15:10                                                       ` Gregory Haskins
2009-09-23 17:58                                                         ` Gregory Haskins
2009-09-23 19:37                                                           ` Avi Kivity
2009-09-23 21:15                                                             ` Gregory Haskins
2009-09-24  7:18                                                               ` Avi Kivity
2009-09-24 18:03                                                                 ` Gregory Haskins
2009-09-25  8:22                                                                   ` Avi Kivity
2009-09-25 21:32                                                                     ` Gregory Haskins
2009-09-27  9:43                                                                       ` Avi Kivity
2009-09-30 20:04                                                                         ` Gregory Haskins
2009-10-01  8:34                                                                           ` Avi Kivity
2009-10-01  9:28                                                                             ` Michael S. Tsirkin
2009-10-01 19:24                                                                             ` Gregory Haskins
2009-10-03 10:00                                                                               ` Avi Kivity
2009-09-24 19:27                                                                 ` Ira W. Snyder
2009-09-25  7:43                                                                   ` Avi Kivity
2009-09-24  8:03                                                             ` Avi Kivity
2009-09-24 18:04                                                               ` Gregory Haskins
2009-09-17  3:57                                       ` Michael S. Tsirkin
2009-09-17  4:13                                         ` Gregory Haskins
2009-09-15 12:32                 ` Avi Kivity
2009-09-14 16:53               ` Michael S. Tsirkin
2009-09-14 19:28                 ` Gregory Haskins
2009-09-25 17:01   ` Ira W. Snyder
2009-09-27  7:43     ` Michael S. Tsirkin
     [not found] <E88DD564E9DC5446A76B2B47C3BCCA150219600F9B@pdsmsx503.ccr.corp.intel.com>
2009-08-31 11:42 ` Xin, Xiaohui
2009-08-31 15:23   ` Arnd Bergmann
2009-09-01 14:58     ` Xin, Xiaohui
2009-08-31 17:52   ` Avi Kivity
2009-08-31 21:56     ` Anthony Liguori
2009-09-01 15:37       ` Xin, Xiaohui
2009-09-01  5:04     ` Xin, Xiaohui
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).