[PATCH RFC 1/5] vringfd syscall

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH RFC 1/5] vringfd syscall
@ 2008-04-05 12:02 Rusty Russell
  2008-04-05 12:04 ` [PATCH RFC 2/5] vringfd base/offset Rusty Russell
                   ` (5 more replies)
  0 siblings, 6 replies; 29+ messages in thread
From: Rusty Russell @ 2008-04-05 12:02 UTC (permalink / raw)
  To: linux-kernel; +Cc: netdev, virtualization, Max Krasnyansky

For virtualization, we've developed virtio_ring for efficient communication.
This would also work well for userspace-kernel communication, particularly
for things like the tun device.  By using the same ABI, we can join guests
to the host kernel trivially.

These patches are fairly alpha; I've seen some network stalls I have to
track down and there are some fixmes.

Comments welcome!
Rusty.

diff -r 99132ad16999 Documentation/test_vring.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Documentation/test_vring.c	Sat Apr 05 21:31:40 2008 +1100
@@ -0,0 +1,47 @@
+#include <unistd.h>
+#include <linux/virtio_ring.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <err.h>
+#include <poll.h>
+
+#ifndef __NR_vringfd
+#define __NR_vringfd		327
+#endif
+
+int main()
+{
+	int fd, r;
+	struct vring vr;
+	uint16_t used = 0;
+	struct pollfd pfd;
+	void *buf = calloc(vring_size(256, getpagesize()), 0);
+
+	vring_init(&vr, 256, buf, getpagesize());
+
+	fd = syscall(__NR_vringfd, buf, 256, &used);
+	if (fd < 0)
+		err(1, "vringfd gave %i", fd);
+
+	pfd.fd = fd;
+	pfd.events = POLLIN;
+	r = poll(&pfd, 1, 0);
+	
+	if (r != 0)
+		err(1, "poll gave %i", r);
+
+	vr.used->idx++;
+	r = poll(&pfd, 1, 0);
+	
+	if (r != 1)
+		err(1, "poll after buf used gave %i", r);
+
+	used++;
+	r = poll(&pfd, 1, 0);
+	
+	if (r != 0)
+		err(1, "poll after used incremented gave %i", r);
+
+	close(fd);
+	return 0;
+}
diff -r 99132ad16999 arch/x86/kernel/syscall_table_32.S
--- a/arch/x86/kernel/syscall_table_32.S	Sat Apr 05 21:20:32 2008 +1100
+++ b/arch/x86/kernel/syscall_table_32.S	Sat Apr 05 21:31:40 2008 +1100
@@ -326,3 +326,4 @@ ENTRY(sys_call_table)
 	.long sys_fallocate
 	.long sys_timerfd_settime	/* 325 */
 	.long sys_timerfd_gettime
+	.long sys_vringfd
diff -r 99132ad16999 fs/Kconfig
--- a/fs/Kconfig	Sat Apr 05 21:20:32 2008 +1100
+++ b/fs/Kconfig	Sat Apr 05 21:31:40 2008 +1100
@@ -2135,4 +2135,14 @@ source "fs/nls/Kconfig"
 source "fs/nls/Kconfig"
 source "fs/dlm/Kconfig"
 
+config VRINGFD
+       bool "vring fd support (EXPERIMENTAL)"
+       depends on EXPERIMENTAL
+       help
+         vring is a ringbuffer implementation for efficient I/O.  It is
+	 currently used by virtualization hosts (lguest, kvm) for efficient
+	 networking using the tun driver.
+
+	 If unsure, say N.
+
 endmenu
diff -r 99132ad16999 fs/Makefile
--- a/fs/Makefile	Sat Apr 05 21:20:32 2008 +1100
+++ b/fs/Makefile	Sat Apr 05 21:31:40 2008 +1100
@@ -119,3 +119,4 @@ obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
+obj-$(CONFIG_VRINGFD)		+= vring.o
diff -r 99132ad16999 fs/vring.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fs/vring.c	Sat Apr 05 21:31:40 2008 +1100
@@ -0,0 +1,376 @@
+/* Ring-buffer file descriptor implementation.
+ *
+ *  Copyright 2008 Rusty Russell IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/virtio_ring.h>
+#include <linux/vring.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/wait.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/highmem.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/magic.h>
+#include <linux/module.h>
+
+static struct vfsmount *vring_mnt;
+static DEFINE_MUTEX(vring_lock);
+
+struct vring_info
+{
+	struct vring ring;
+	u16 mask;
+	u16 __user *last_used;
+	u16 last_avail;
+
+	const struct vring_ops *ops;
+	void *ops_data;
+
+	/* Waitqueue for poll() */
+	wait_queue_head_t poll_wait;
+
+	/* The mapped used ring. */
+	struct vring_used *used;
+	struct page *used_page;
+};
+
+static unsigned int vring_poll(struct file *filp,
+			       struct poll_table_struct *poll)
+{
+	struct vring_info *vr = filp->private_data;
+	int err;
+	unsigned int mask;
+	u16 used, last_used;
+
+	/* Some uses of vrings require updating in user context.  This
+	 * is best done close to the caller, ie. here. */
+	if (vr->ops && vr->ops->pull) {
+		err = vr->ops->pull(vr->ops_data);
+		if (unlikely(err < 0))
+			return err;
+
+		if (err > 0) {
+			/* Buffers have been used, no need to check indices */
+			mask = POLLIN | POLLRDNORM;
+			goto poll_wait;
+		}
+	}
+
+	err = get_user(used, &vr->ring.used->idx);
+	if (unlikely(err))
+		return err;
+
+	err = get_user(last_used, vr->last_used);
+	if (unlikely(err))
+		return err;
+
+	/* More buffers have been used?  It's 'readable'. */
+	if (used != last_used)
+		mask = POLLIN | POLLRDNORM;
+	else
+		mask = 0;
+
+poll_wait:
+	poll_wait(filp, &vr->poll_wait, poll);
+
+	return mask;
+}
+
+static ssize_t vring_write(struct file *filp, const char __user *buf,
+			   size_t size, loff_t *off)
+{
+	struct vring_info *vr = filp->private_data;
+
+	if (vr->ops && vr->ops->push)
+		return vr->ops->push(vr->ops_data);
+
+	return -EINVAL;
+}
+
+static int vring_release(struct inode *inode, struct file *filp)
+{
+	struct vring_info *vr = filp->private_data;
+
+	/* Callback for other end. */
+	if (vr->ops && vr->ops->destroy)
+		vr->ops->destroy(vr->ops_data);
+
+	if (vr->used) {
+		kunmap(vr->used_page);
+		put_page(vr->used_page);
+	}
+
+	kfree(vr);
+	return 0;
+}
+
+static const struct file_operations vring_fops = {
+	.release	= vring_release,
+	.write		= vring_write,
+	.poll		= vring_poll,
+};
+
+asmlinkage long sys_vringfd(void __user *addr,
+			    unsigned num_descs,
+			    u16 __user *last_used)
+{
+	int fd, err;
+	struct file *filp;
+	struct vring_info *vr;
+
+	/* Must be a power of two, and representable by u16 */
+	if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	fd = get_unused_fd();
+	if (fd < 0) {
+		err = fd;
+		goto out;
+	}
+
+	filp = alloc_file(vring_mnt, dget(vring_mnt->mnt_root), FMODE_WRITE,
+			  &vring_fops);
+	if (!filp) {
+		err = -ENFILE;
+		goto put_fd;
+	}
+
+	filp->private_data = vr = kmalloc(sizeof(*vr), GFP_KERNEL);
+	if (!vr) {
+		err = -ENOMEM;
+		goto put_filp;
+	}
+
+	/* Set up pointers into ring. */
+	vring_init(&vr->ring, num_descs, addr, PAGE_SIZE);
+	init_waitqueue_head(&vr->poll_wait);
+	vr->last_used = last_used;
+	vr->mask = num_descs - 1;
+	vr->ops = NULL;
+	vr->used = NULL;
+
+	err = get_user(vr->last_avail, &vr->ring.avail->idx);
+	if (err)
+		goto free_vr;
+	
+	fd_install(fd, filp);
+	return fd;
+
+free_vr:
+	kfree(vr);
+put_filp:
+	put_filp(filp);
+put_fd:
+	put_unused_fd(fd);
+out:
+	return err;
+}
+
+/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer() */
+int vring_get_buffer(struct vring_info *vr,
+		     struct iovec *in_iov,
+		     unsigned int *num_in, unsigned long *in_len,
+		     struct iovec *out_iov,
+		     unsigned int *num_out, unsigned long *out_len)
+{
+	unsigned int i, in = 0, out = 0;
+	unsigned long dummy;
+	u16 head;
+	struct vring_desc d;
+
+	if (unlikely(get_user(head, &vr->ring.avail->idx) != 0))
+		return -EFAULT;
+
+	if (vr->last_avail == head)
+		return 0;
+
+	if (!in_len)
+		in_len = &dummy;
+	if (!out_len)
+		out_len = &dummy;
+
+	*in_len = *out_len = 0;
+	
+	if (unlikely(get_user(head, &vr->ring.avail->ring[head]) != 0))
+		return -EFAULT;
+
+	i = head;
+	do {
+		if (unlikely(i >= vr->ring.num)) {
+			pr_debug("vring: bad index: %u\n", i);
+			return -EINVAL;
+		}
+
+		if (copy_from_user(&d, &vr->ring.desc[i], sizeof(d)) != 0)
+			return -EFAULT;
+
+		if (d.flags & VRING_DESC_F_WRITE) {
+			/* Check for length and iovec overflows */
+			if (!num_in)
+				return -EINVAL;
+			if (in == *num_in || *in_len + d.len < *in_len)
+				return -E2BIG;
+			in_iov[in].iov_len = d.len;
+			*in_len += d.len;
+			in_iov[in].iov_base = (void __user*)(long)d.addr;
+			in++;
+		} else {
+			if (!num_out)
+				return -EINVAL;
+			if (out == *num_out || *out_len + d.len < *out_len)
+				return -E2BIG;
+			out_iov[out].iov_len = d.len;
+			*out_len += d.len;
+			out_iov[out].iov_base = (void __user*)(long)d.addr;
+			out++;
+		}
+
+		i = d.next;
+	} while (d.flags & VRING_DESC_F_NEXT);
+
+	if (num_in)
+		*num_in = in;
+	if (num_out)
+		*num_out = out;
+
+	/* 0 is a valid head, so add one. */
+	vr->last_avail++;
+	return head + 1;
+}
+EXPORT_SYMBOL_GPL(vring_get_buffer);
+
+void vring_used_buffer(struct vring_info *vr, int id, u32 len)
+{
+	struct vring_used_elem used;
+	u16 used_idx;
+
+	BUG_ON(id <= 0 || id > vr->ring.num);
+
+	used.id = id - 1;
+	used.len = len;
+	if (get_user(used_idx, &vr->ring.used->idx) != 0)
+		return;
+
+	copy_to_user(&vr->ring.used->ring[used_idx & vr->mask], &used,
+		     sizeof(used));
+	wmb();
+	used_idx++;
+	put_user(used_idx, &vr->ring.used->idx);
+}
+EXPORT_SYMBOL_GPL(vring_used_buffer);
+
+void vring_used_buffer_atomic(struct vring_info *vr, int id, u32 len)
+{
+	struct vring_used_elem *used;
+
+	BUG_ON(id <= 0 || id > vr->ring.num);
+	BUG_ON(!vr->used);
+
+	used = &vr->used->ring[vr->used->idx & vr->mask];
+	used->id = id - 1;
+	used->len = len;
+	/* Make sure buffer is written before we update index. */
+	wmb();
+	vr->used->idx++;
+}
+EXPORT_SYMBOL_GPL(vring_used_buffer_atomic);
+
+void vring_wake(struct vring_info *vr)
+{
+	wake_up(&vr->poll_wait);
+}
+EXPORT_SYMBOL_GPL(vring_wake);
+
+struct vring_info *vring_attach(int fd, const struct vring_ops *ops,
+				void *data, bool atomic_use)
+{
+	struct file *filp;
+	struct vring_info *vr;
+
+	/* Must be a valid fd, and must be one of ours. */
+	filp = fget(fd);
+	if (!filp) {
+		vr = ERR_PTR(-EBADF);
+		goto out;
+	}
+
+	if (filp->f_op != &vring_fops) {
+		vr = ERR_PTR(-EBADF);
+		goto fput;
+	}
+
+	/* Mutex simply protects against parallel vring_attach. */
+	mutex_lock(&vring_lock);
+	vr = filp->private_data;
+	if (vr->ops) {
+		vr = ERR_PTR(-EBUSY);
+		goto unlock;
+	}
+
+	/* If they want to use atomically, we have to map the page. */
+	if (atomic_use) {
+		if (get_user_pages(current, current->mm,
+				   (unsigned long)vr->ring.used, 1, 1, 1,
+				   &vr->used_page, NULL) != 1) {
+			vr = ERR_PTR(-EFAULT);
+			goto unlock;
+		}
+		vr->used = kmap(vr->used_page);
+		if (!vr->used) {
+			put_page(vr->used_page);
+			vr = ERR_PTR(-ENOMEM);
+			goto unlock;
+		}
+	}
+
+	vr->ops = ops;
+	vr->ops_data = data;
+
+unlock:
+	mutex_unlock(&vring_lock);
+fput:
+	fput(filp);
+out:
+	return vr;
+}
+EXPORT_SYMBOL_GPL(vring_attach);
+
+static int vringfs_get_sb(struct file_system_type *fs_type,
+			  int flags, const char *dev_name, void *data,
+			  struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "vring", NULL, VRINGFS_SUPER_MAGIC, mnt);
+}
+
+static struct file_system_type vring_fs_type = {
+	.name		= "vringfs",
+	.get_sb		= vringfs_get_sb,
+	.kill_sb	= kill_anon_super,
+};
+
+static int init(void)
+{
+	register_filesystem(&vring_fs_type);
+	vring_mnt = kern_mount(&vring_fs_type);
+	return 0;
+}
+
+module_init(init);
diff -r 99132ad16999 include/asm-x86/unistd_32.h
--- a/include/asm-x86/unistd_32.h	Sat Apr 05 21:20:32 2008 +1100
+++ b/include/asm-x86/unistd_32.h	Sat Apr 05 21:31:40 2008 +1100
@@ -332,6 +332,7 @@
 #define __NR_fallocate		324
 #define __NR_timerfd_settime	325
 #define __NR_timerfd_gettime	326
+#define __NR_vringfd		327
 
 #ifdef __KERNEL__
 
diff -r 99132ad16999 include/linux/magic.h
--- a/include/linux/magic.h	Sat Apr 05 21:20:32 2008 +1100
+++ b/include/linux/magic.h	Sat Apr 05 21:31:40 2008 +1100
@@ -41,5 +41,6 @@
 
 #define FUTEXFS_SUPER_MAGIC	0xBAD1DEA
 #define INOTIFYFS_SUPER_MAGIC	0x2BAD1DEA
+#define VRINGFS_SUPER_MAGIC	0xB1BBAD
 
 #endif /* __LINUX_MAGIC_H__ */
diff -r 99132ad16999 include/linux/syscalls.h
--- a/include/linux/syscalls.h	Sat Apr 05 21:20:32 2008 +1100
+++ b/include/linux/syscalls.h	Sat Apr 05 21:31:40 2008 +1100
@@ -614,6 +614,7 @@ asmlinkage long sys_timerfd_gettime(int 
 asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
 asmlinkage long sys_eventfd(unsigned int count);
 asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
+asmlinkage long sys_vringfd(void __user *, unsigned num, u16 __user *);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
diff -r 99132ad16999 include/linux/vring.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/linux/vring.h	Sat Apr 05 21:31:40 2008 +1100
@@ -0,0 +1,54 @@
+/* Ring-buffer file descriptor implementation.
+ *
+ *  Copyright 2008 Rusty Russell IBM Corporation
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef _LINUX_VRING_H
+#define _LINUX_VRING_H
+
+/* All members are optional */
+struct vring_ops
+{
+	/* Cleanup */
+	void (*destroy)(void *);
+
+	/* Returns number of used buffers, or negative errno. */
+	int (*pull)(void *);
+
+	/* Returns 0 or negative errno. */
+	int (*push)(void *);
+};
+
+/* If they want to call vring_used_buffer_atomic(), set atomic_use.
+ * This currently means that the userspace used buffer must fit in a page. */
+struct vring_info *vring_attach(int fd, const struct vring_ops *ops,
+				void *data, bool atomic_use);
+
+struct iovec;
+
+/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer() */
+int vring_get_buffer(struct vring_info *vr,
+		     struct iovec *in_iov,
+		     unsigned int *num_in, unsigned long *in_len,
+		     struct iovec *out_iov,
+		     unsigned int *num_out, unsigned long *out_len);
+
+void vring_used_buffer(struct vring_info *vr, int id, u32 len);
+
+void vring_used_buffer_atomic(struct vring_info *vr, int id, u32 len);
+
+void vring_wake(struct vring_info *vr);
+#endif /* _LINUX_VRING_H */
diff -r 99132ad16999 kernel/sys_ni.c
--- a/kernel/sys_ni.c	Sat Apr 05 21:20:32 2008 +1100
+++ b/kernel/sys_ni.c	Sat Apr 05 21:31:40 2008 +1100
@@ -161,3 +161,4 @@ cond_syscall(compat_sys_timerfd_settime)
 cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
+cond_syscall(sys_vringfd);

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH RFC 2/5] vringfd base/offset
  2008-04-05 12:02 [PATCH RFC 1/5] vringfd syscall Rusty Russell
@ 2008-04-05 12:04 ` Rusty Russell
  2008-04-05 12:05   ` [PATCH RFC 3/5] tun: vringfd receive support Rusty Russell
                     ` (3 more replies)
       [not found] ` <200804052204.28518.rusty__10896.9346424148$1207397431$gmane$org@rustcorp.com.au>
                   ` (4 subsequent siblings)
  5 siblings, 4 replies; 29+ messages in thread
From: Rusty Russell @ 2008-04-05 12:04 UTC (permalink / raw)
  To: linux-kernel; +Cc: netdev, virtualization, Max Krasnyansky

It turns out the lguest (and possibly kvm) want the addresses in the
ring buffer to only cover a certain part of memory, and be offset.

It makes sense that this be an ioctl.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 08fb00b8acab Documentation/ioctl-number.txt
--- a/Documentation/ioctl-number.txt	Sat Apr 05 21:31:40 2008 +1100
+++ b/Documentation/ioctl-number.txt	Sat Apr 05 22:00:10 2008 +1100
@@ -183,6 +183,7 @@ 0xAC	00-1F	linux/raw.h
 0xAC	00-1F	linux/raw.h
 0xAD	00	Netfilter device	in development:
 					<mailto:rusty@rustcorp.com.au>	
+0xAE	00-01	linux/vring.h
 0xB0	all	RATIO devices		in development:
 					<mailto:vgo@ratio.de>
 0xB1	00-1F	PPPoX			<mailto:mostrows@styx.uwaterloo.ca>
diff -r 08fb00b8acab fs/vring.c
--- a/fs/vring.c	Sat Apr 05 21:31:40 2008 +1100
+++ b/fs/vring.c	Sat Apr 05 22:00:10 2008 +1100
@@ -38,6 +38,8 @@ struct vring_info
 	u16 mask;
 	u16 __user *last_used;
 	u16 last_avail;
+
+	unsigned long base, limit;
 
 	const struct vring_ops *ops;
 	void *ops_data;
@@ -120,10 +122,30 @@ static int vring_release(struct inode *i
 	return 0;
 }
 
+static int vring_ioctl(struct inode *in, struct file *filp,
+		       unsigned int cmd, unsigned long arg)
+{
+	struct vring_info *vr = filp->private_data;
+
+	switch (cmd) {
+	case VRINGSETBASE:
+		vr->base = arg;
+		break;
+	case VRINGSETLIMIT:
+		vr->limit = arg;
+		break;
+	default:
+		return -ENOTTY;
+	}
+
+	return 0;
+}
+
 static const struct file_operations vring_fops = {
 	.release	= vring_release,
 	.write		= vring_write,
 	.poll		= vring_poll,
+	.ioctl		= vring_ioctl,
 };
 
 asmlinkage long sys_vringfd(void __user *addr,
@@ -166,6 +188,8 @@ asmlinkage long sys_vringfd(void __user 
 	vr->mask = num_descs - 1;
 	vr->ops = NULL;
 	vr->used = NULL;
+	vr->limit = -1UL;
+	vr->base = 0;
 
 	err = get_user(vr->last_avail, &vr->ring.avail->idx);
 	if (err)
@@ -208,12 +232,15 @@ int vring_get_buffer(struct vring_info *
 		out_len = &dummy;
 
 	*in_len = *out_len = 0;
-	
-	if (unlikely(get_user(head, &vr->ring.avail->ring[head]) != 0))
+
+	if (unlikely(get_user(head, &vr->ring.avail->ring[vr->last_avail
+							  % vr->ring.num])))
 		return -EFAULT;
 
 	i = head;
 	do {
+		void __user *base;
+
 		if (unlikely(i >= vr->ring.num)) {
 			pr_debug("vring: bad index: %u\n", i);
 			return -EINVAL;
@@ -222,24 +249,38 @@ int vring_get_buffer(struct vring_info *
 		if (copy_from_user(&d, &vr->ring.desc[i], sizeof(d)) != 0)
 			return -EFAULT;
 
+		if (d.addr + d.len > vr->limit || (d.addr + d.len < d.addr)) {
+			pr_debug("vring: bad addr/len: %u@%p\n", 
+				 d.len, (void *)(unsigned long)d.addr);
+			return -EINVAL;
+		}
+
+		base = (void __user *)(unsigned long)d.addr + vr->base;
+
 		if (d.flags & VRING_DESC_F_WRITE) {
 			/* Check for length and iovec overflows */
-			if (!num_in)
+			if (!num_in) {
+				pr_debug("vring: writable desc %u in ring %p\n",
+				         i, vr->ring.desc);
 				return -EINVAL;
+			}
 			if (in == *num_in || *in_len + d.len < *in_len)
 				return -E2BIG;
 			in_iov[in].iov_len = d.len;
 			*in_len += d.len;
-			in_iov[in].iov_base = (void __user*)(long)d.addr;
+			in_iov[in].iov_base = base;
 			in++;
 		} else {
-			if (!num_out)
+			if (!num_out) {
+				pr_debug("vring: readable desc %u in ring %p\n",
+				         i, vr->ring.desc);
 				return -EINVAL;
+			}
 			if (out == *num_out || *out_len + d.len < *out_len)
 				return -E2BIG;
 			out_iov[out].iov_len = d.len;
 			*out_len += d.len;
-			out_iov[out].iov_base = (void __user*)(long)d.addr;
+			out_iov[out].iov_base = base;
 			out++;
 		}
 
diff -r 08fb00b8acab include/linux/vring.h
--- a/include/linux/vring.h	Sat Apr 05 21:31:40 2008 +1100
+++ b/include/linux/vring.h	Sat Apr 05 22:00:10 2008 +1100
@@ -18,7 +18,13 @@
  */
 #ifndef _LINUX_VRING_H
 #define _LINUX_VRING_H
+#include <linux/types.h>
 
+/* Ioctl defines, as in "ioctls are AEgly". */
+#define VRINGSETBASE	_IO(0xAE, 0)
+#define VRINGSETLIMIT	_IO(0xAE, 1)
+
+#ifdef __KERNEL__
 /* All members are optional */
 struct vring_ops
 {
@@ -51,4 +57,6 @@ void vring_used_buffer_atomic(struct vri
 void vring_used_buffer_atomic(struct vring_info *vr, int id, u32 len);
 
 void vring_wake(struct vring_info *vr);
+#endif /* __KERNEL__ */
+
 #endif /* _LINUX_VRING_H */

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH RFC 3/5] tun: vringfd receive support.
  2008-04-05 12:04 ` [PATCH RFC 2/5] vringfd base/offset Rusty Russell
@ 2008-04-05 12:05   ` Rusty Russell
  2008-04-05 12:06     ` [PATCH RFC 4/5] tun: vringfd xmit support Rusty Russell
  2008-04-08 19:49     ` [PATCH RFC 3/5] tun: vringfd receive support Max Krasnyansky
  2008-04-05 12:44   ` [PATCH RFC 2/5] vringfd base/offset Avi Kivity
                     ` (2 subsequent siblings)
  3 siblings, 2 replies; 29+ messages in thread
From: Rusty Russell @ 2008-04-05 12:05 UTC (permalink / raw)
  To: linux-kernel; +Cc: netdev, virtualization, Max Krasnyansky

This patch modifies tun to allow a vringfd to specify the receive
buffer.  Because we can't copy to userspace in bh context, we queue
like normal then use the "pull" hook to actually do the copy.

More thought needs to be put into the possible races with ring
registration and a simultaneous close, for example (see FIXME).

We use struct virtio_net_hdr prepended to packets in the ring to allow
userspace to receive GSO packets in future (at the moment, the tun
driver doesn't tell the stack it can handle them, so these cases are
never taken).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 285c3112b26c Documentation/test_vring.c
--- a/Documentation/test_vring.c	Sat Apr 05 22:00:10 2008 +1100
+++ b/Documentation/test_vring.c	Sat Apr 05 22:15:56 2008 +1100
@@ -1,21 +1,62 @@
 #include <unistd.h>
 #include <linux/virtio_ring.h>
+#include <linux/ioctl.h>
+#include <linux/if_tun.h>
 #include <stdio.h>
 #include <stdint.h>
+#include <string.h>
 #include <err.h>
 #include <poll.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <linux/sockios.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/types.h>
 
 #ifndef __NR_vringfd
 #define __NR_vringfd		327
 #endif
 
+/* This sets up the Host end of the network device with an IP address, brings
+ * it up so packets will flow, the copies the MAC address into the hwaddr
+ * pointer. */
+static void configure_device(int fd, const char *devname, uint32_t ipaddr,
+			     unsigned char hwaddr[6])
+{
+	struct ifreq ifr;
+	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+
+	/* Don't read these incantations.  Just cut & paste them like I did! */
+	memset(&ifr, 0, sizeof(ifr));
+	strcpy(ifr.ifr_name, devname);
+	sin->sin_family = AF_INET;
+	sin->sin_addr.s_addr = htonl(ipaddr);
+	if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
+		err(1, "Setting %s interface address", devname);
+	ifr.ifr_flags = IFF_UP;
+	if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
+		err(1, "Bringing interface %s up", devname);
+
+	/* SIOC stands for Socket I/O Control.  G means Get (vs S for Set
+	 * above).  IF means Interface, and HWADDR is hardware address.
+	 * Simple! */
+	if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
+		err(1, "getting hw address for %s", devname);
+	memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
+}
+
+
+
 int main()
 {
-	int fd, r;
+	int fd, tunfd, r;
 	struct vring vr;
 	uint16_t used = 0;
 	struct pollfd pfd;
+	struct ifreq ifr;
 	void *buf = calloc(vring_size(256, getpagesize()), 0);
+	char pkt[65535];
 
 	vring_init(&vr, 256, buf, getpagesize());
 
@@ -23,25 +64,57 @@ int main()
 	if (fd < 0)
 		err(1, "vringfd gave %i", fd);
 
+	tunfd = open("/dev/net/tun", O_RDWR);
+	if (tunfd < 0)
+		err(1, "Opening /dev/net/tun");
+	memset(&ifr, 0, sizeof(ifr));
+	ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+	strcpy(ifr.ifr_name, "tap%d");
+	if (ioctl(tunfd, TUNSETIFF, &ifr) != 0)
+		err(1, "configuring /dev/net/tun");
+
+	printf("Interface is %s\n", ifr.ifr_name);
+
+	if (ioctl(tunfd, TUNSETRECVVRING, fd) != 0)
+		err(1, "Setting receive ring");
+
+	/* Add a buffer.  Split it nicely between protocol parts. */
+	vr.desc[0].addr = (unsigned long)pkt;
+	vr.desc[0].len = 14;
+	vr.desc[0].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+	vr.desc[0].next = 1;
+	vr.desc[1].addr = (unsigned long)pkt + 14;
+	vr.desc[1].len = 20;
+	vr.desc[1].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+	vr.desc[1].next = 2;
+	vr.desc[2].addr = (unsigned long)pkt + 34;
+	vr.desc[2].len = 8;
+	vr.desc[2].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+	vr.desc[2].next = 3;
+	vr.desc[3].addr = (unsigned long)pkt + 42;
+	vr.desc[3].len = 100;
+	vr.desc[3].flags = VRING_DESC_F_WRITE;
+
+	/* Here's our buffer. */
+	vr.avail->ring[0] = 0;
+	vr.avail->idx++;
+
+	printf("Waiting for packet...\n");
+	
 	pfd.fd = fd;
 	pfd.events = POLLIN;
-	r = poll(&pfd, 1, 0);
+	r = poll(&pfd, 1, -1);
 	
-	if (r != 0)
+	if (r != 1)
 		err(1, "poll gave %i", r);
 
-	vr.used->idx++;
-	r = poll(&pfd, 1, 0);
-	
-	if (r != 1)
-		err(1, "poll after buf used gave %i", r);
+	/* OK, should have used a buffer. */
+	if (vr.used->idx != 1)
+		errx(1, "vr.used->idx = %u", vr.used->idx);
 
-	used++;
-	r = poll(&pfd, 1, 0);
-	
-	if (r != 0)
-		err(1, "poll after used incremented gave %i", r);
+	if (vr.used->ring[0].id != 0)
+		errx(1, "vr.used->ring[0] = %u", vr.used->ring[0].id);
 
-	close(fd);
+	printf("Total length used = %u\n", vr.used->ring[0].len);
 	return 0;
 }
diff -r 285c3112b26c drivers/net/tun.c
--- a/drivers/net/tun.c	Sat Apr 05 22:00:10 2008 +1100
+++ b/drivers/net/tun.c	Sat Apr 05 22:15:56 2008 +1100
@@ -62,6 +62,8 @@
 #include <linux/if_ether.h>
 #include <linux/if_tun.h>
 #include <linux/crc32.h>
+#include <linux/vring.h>
+#include <linux/virtio_net.h>
 #include <net/net_namespace.h>
 
 #include <asm/system.h>
@@ -98,6 +100,8 @@ struct tun_struct {
 	u8 dev_addr[ETH_ALEN];
 	u32 chr_filter[2];
 	u32 net_filter[2];
+
+	struct vring_info	*inring;
 
 #ifdef TUN_DEBUG	
 	int debug;
@@ -158,6 +162,10 @@ static int tun_net_xmit(struct sk_buff *
 	/* Notify and wake up reader process */
 	if (tun->flags & TUN_FASYNC)
 		kill_fasync(&tun->fasync, SIGIO, POLL_IN);
+
+	if (tun->inring)
+		vring_wake(tun->inring);
+
 	wake_up_interruptible(&tun->read_wait);
 	return 0;
 
@@ -249,6 +257,117 @@ static void tun_net_init(struct net_devi
 		break;
 	}
 }
+
+#ifdef CONFIG_VRINGFD
+static void unset_recv(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+
+	tun->inring = NULL;
+}
+
+/* Returns number of used buffers, or negative errno. */
+static int pull_recv_skbs(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+	int err = 0, num_copied = 0;
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&tun->readq)) != NULL) {
+		struct iovec iov[1+MAX_SKB_FRAGS];
+		struct virtio_net_hdr gso = { 0 }; /* no info leak */
+		unsigned int iovnum = ARRAY_SIZE(iov);
+		unsigned long len;
+		int id;
+
+		id = vring_get_buffer(tun->inring, iov, &iovnum, &len,
+				      NULL, NULL, NULL);
+		if (id <= 0) {
+			err = id;
+			break;
+		}
+
+		/* FIXME: we could stash this descriptor and go looking for a
+		 * better-sized one.  That would allow them to mix different
+		 * buffer sizes for efficiency. */
+		if (unlikely(len < sizeof(gso) + skb->len)) {
+			tun->dev->stats.tx_aborted_errors++;
+			err = -ENOBUFS; /* PS. You suck! */
+			break;
+		}
+
+		if (skb_is_gso(skb)) {
+			struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+			/* This is a hint as to how much should be linear. */
+			gso.hdr_len = skb_headlen(skb);
+			gso.gso_size = sinfo->gso_size;
+			if (sinfo->gso_type & SKB_GSO_TCPV4)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+			else if (sinfo->gso_type & SKB_GSO_TCPV6)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+			else if (sinfo->gso_type & SKB_GSO_UDP)
+				gso.gso_type = VIRTIO_NET_HDR_GSO_UDP;
+			else
+				BUG();
+			if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+				gso.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+		} else
+			gso.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL) {
+			gso.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+			gso.csum_start = skb->csum_start - skb_headroom(skb);
+			gso.csum_offset = skb->csum_offset;
+		} /* else everything is zero */
+
+		err = memcpy_toiovec(iov, (void *)&gso, sizeof(gso));
+		if (unlikely(err)) {
+			tun->dev->stats.tx_fifo_errors++;
+			break;
+		}
+
+		err = skb_copy_datagram_iovec(skb, 0, iov, skb->len);
+		if (unlikely(err)) {
+			tun->dev->stats.tx_fifo_errors++;
+			break;
+		}
+
+		vring_used_buffer(tun->inring, id, sizeof(gso) + skb->len);
+		num_copied++;
+	}
+
+	if (skb)
+		skb_queue_head(&tun->readq, skb);
+
+	if (num_copied)
+		netif_wake_queue(tun->dev);
+
+	return err ?: num_copied;
+}
+
+static struct vring_ops recvops = {
+	.destroy = unset_recv,
+	.pull = pull_recv_skbs,
+};
+
+static int set_recv_vring(struct tun_struct *tun, int fd)
+{
+	struct vring_info *vi;
+
+	/* FIXME: Racy vs unset_recv or even pull_recv_skbs. */
+	vi = vring_attach(fd, &recvops, tun, false);
+	if (IS_ERR(vi))
+		return PTR_ERR(vi);
+	tun->inring = vi;
+	return 0;
+}
+#else /* ... !CONFIG_VRINGFD */
+static int set_recv_vring(struct tun_struct *tun, int fd)
+{
+	return -ENOTTY;
+}
+#endif
 
 /* Character device part */
 
@@ -462,6 +581,7 @@ static void tun_setup(struct net_device 
 
 	tun->owner = -1;
 	tun->group = -1;
+	tun->inring = NULL;
 
 	dev->open = tun_net_open;
 	dev->hard_start_xmit = tun_net_xmit;
@@ -670,6 +790,9 @@ static int tun_chr_ioctl(struct inode *i
 		tun->debug = arg;
 		break;
 #endif
+
+	case TUNSETRECVVRING:
+		return set_recv_vring(tun, arg);		
 
 	case SIOCGIFFLAGS:
 		ifr.ifr_flags = tun->if_flags;
diff -r 285c3112b26c include/linux/if_tun.h
--- a/include/linux/if_tun.h	Sat Apr 05 22:00:10 2008 +1100
+++ b/include/linux/if_tun.h	Sat Apr 05 22:15:56 2008 +1100
@@ -42,6 +42,7 @@
 #define TUNSETOWNER   _IOW('T', 204, int)
 #define TUNSETLINK    _IOW('T', 205, int)
 #define TUNSETGROUP   _IOW('T', 206, int)
+#define TUNSETRECVVRING _IOW('T', 207, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH RFC 4/5] tun: vringfd xmit support.
  2008-04-05 12:05   ` [PATCH RFC 3/5] tun: vringfd receive support Rusty Russell
@ 2008-04-05 12:06     ` Rusty Russell
  2008-04-05 12:09       ` [PATCH RFC 5/5] lguest support Rusty Russell
  2008-04-07  5:13       ` [PATCH RFC 4/5] tun: vringfd xmit support Herbert Xu
  2008-04-08 19:49     ` [PATCH RFC 3/5] tun: vringfd receive support Max Krasnyansky
  1 sibling, 2 replies; 29+ messages in thread
From: Rusty Russell @ 2008-04-05 12:06 UTC (permalink / raw)
  To: linux-kernel; +Cc: netdev, virtualization, Max Krasnyansky

This patch modifies tun to allow a vringfd to specify the send
buffer.  The user does a write to push out packets from the buffer.

Again, more thought needs to be put into the possible races with ring
registration.

Again we use the 'struct virtio_net_hdr' to allow userspace to send
GSO packets.  In this case, it can hint how much to copy, and the
other pages will be made into skb fragments.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 8270b5fdf03f drivers/net/tun.c
--- a/drivers/net/tun.c	Sat Apr 05 22:49:10 2008 +1100
+++ b/drivers/net/tun.c	Sat Apr 05 22:51:10 2008 +1100
@@ -101,7 +101,7 @@ struct tun_struct {
 	u32 chr_filter[2];
 	u32 net_filter[2];
 
-	struct vring_info	*inring;
+	struct vring_info	*inring, *outring;
 
 #ifdef TUN_DEBUG	
 	int debug;
@@ -258,6 +258,162 @@ static void tun_net_init(struct net_devi
 	}
 }
 
+/* We don't consolidate consecutive iovecs, so huge iovecs can break here.
+ * Users will learn not to do that. */
+static int get_user_skb_frags(const struct iovec *iv, size_t len,
+			      struct skb_frag_struct *f)
+{
+	unsigned int i, j, num_pg = 0;
+	int err;
+	struct page *pages[MAX_SKB_FRAGS];
+
+	down_read(&current->mm->mmap_sem);
+	while (len) {
+		int n, npages;
+		unsigned long base, len;
+		base = (unsigned long)iv->iov_base;
+		len = (unsigned long)iv->iov_len;
+
+		if (len == 0) {
+			iv++;
+			continue;
+		}
+
+		/* How many pages will this take? */
+		npages = 1 + (base + len - 1)/PAGE_SIZE - base/PAGE_SIZE;
+		if (unlikely(num_pg + npages > MAX_SKB_FRAGS)) {
+			err = -ENOSPC;
+			goto fail;
+		}
+		n = get_user_pages(current, current->mm, base, npages,
+				   0, 0, pages, NULL);
+		if (unlikely(n < 0)) {
+			err = n;
+			goto fail;
+		}
+
+		/* Transfer pages to the frag array */
+		for (j = 0; j < n; j++) {
+			f[num_pg].page = pages[j];
+			if (j == 0) {
+				f[num_pg].page_offset = offset_in_page(base);
+				f[num_pg].size = min(len, PAGE_SIZE -
+						     f[num_pg].page_offset);
+			} else {
+				f[num_pg].page_offset = 0;
+				f[num_pg].size = min(len, PAGE_SIZE);
+			}
+			len -= f[num_pg].size;
+			base += f[num_pg].size;
+			num_pg++;
+		}
+
+		if (unlikely(n != npages)) {
+			err = -EFAULT;
+			goto fail;
+		}
+	}
+	up_read(&current->mm->mmap_sem);
+	return num_pg;
+
+fail:
+	for (i = 0; i < num_pg; i++)
+		put_page(f[i].page);
+	up_read(&current->mm->mmap_sem);
+	return err;
+}
+
+/* Get packet from user space buffer.  copylen is a hint as to how
+ * much to copy (rest is pinned).  */
+static struct sk_buff *get_user_skb(struct tun_struct *tun, struct iovec *iv,
+				    size_t copylen, size_t len, int extra)
+{
+	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
+	struct sk_buff *skb;
+	size_t align = 0;
+	int err;
+
+	/* You can't have user fragments without room for destruction info. */
+	BUG_ON(!extra && copylen != len);
+
+	if (!(tun->flags & TUN_NO_PI)) {
+		if (len < sizeof(pi)) {
+			err = -EINVAL;
+			goto fail;
+		}
+		len -= sizeof(pi);
+
+		if (memcpy_fromiovec((void *)&pi, iv, sizeof(pi))) {
+			err = -EFAULT;
+			goto fail;
+		}
+		if (copylen > len)
+			copylen = len;		
+	}
+
+	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
+		align = NET_IP_ALIGN;
+		if (unlikely(copylen < ETH_HLEN)) {
+			if (len < ETH_HLEN) {
+				err = -EINVAL;
+				goto fail;
+			}
+			copylen = ETH_HLEN;
+		}
+	}
+
+	/* We don't need a destructor if we don't have fragments. */
+	if (extra && copylen == len)
+		extra = 0;
+
+	if (!(skb = __alloc_skb(copylen + align, GFP_KERNEL, 0, extra, -1))) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	if (align)
+		skb_reserve(skb, align);
+	if (memcpy_fromiovec(skb_put(skb, copylen), iv, copylen)) {
+		err = -EFAULT;
+		goto free_skb;
+	}
+
+	switch (tun->flags & TUN_TYPE_MASK) {
+	case TUN_TUN_DEV:
+		skb_reset_mac_header(skb);
+		skb->protocol = pi.proto;
+		skb->dev = tun->dev;
+		break;
+	case TUN_TAP_DEV:
+		skb->protocol = eth_type_trans(skb, tun->dev);
+		break;
+	};
+
+	if (tun->flags & TUN_NOCHECKSUM)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	/* Anything left gets put into frags. */
+	if (extra) {
+		struct skb_shared_info *sinfo = skb_shinfo(skb);
+		int err = get_user_skb_frags(iv, len - copylen, sinfo->frags);
+		if (err < 0)
+			goto free_skb;
+		sinfo->nr_frags = err;
+	}
+	tun->dev->last_rx = jiffies;
+
+	tun->dev->stats.rx_packets++;
+	tun->dev->stats.rx_bytes += len;
+
+	return skb;
+
+free_skb:
+	kfree_skb(skb);
+fail:
+	tun->dev->stats.rx_dropped++;
+	return ERR_PTR(err);
+}
+
 #ifdef CONFIG_VRINGFD
 static void unset_recv(void *_tun)
 {
@@ -362,8 +518,118 @@ static int set_recv_vring(struct tun_str
 	tun->inring = vi;
 	return 0;
 }
+
+static void unset_xmit(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+
+	tun->outring = NULL;
+}
+
+struct skb_shinfo_tun {
+	struct tun_struct *tun;
+
+	unsigned int id;
+	unsigned int len;
+};
+
+/* We are done with this skb: put it in the used pile. */
+static void skb_finished(struct skb_shared_info *sinfo)
+{
+	struct skb_shinfo_tun *sht = (void *)(sinfo + 1);
+
+	/* FIXME: Race prevention */
+	vring_used_buffer_atomic(sht->tun->outring, sht->id, sht->len);
+	vring_wake(sht->tun->outring);
+
+	/* Release device. */
+	dev_put(sht->tun->dev);
+}
+
+static int xmit_packets(void *_tun)
+{
+	struct tun_struct *tun = _tun;
+	struct iovec iov[1+MAX_SKB_FRAGS];
+	unsigned int iovnum = ARRAY_SIZE(iov);
+	int id, err, wake = 0;
+	unsigned long len;
+
+	while ((id = vring_get_buffer(tun->outring, NULL, NULL, NULL,
+				      iov, &iovnum, &len)) > 0) {
+		struct virtio_net_hdr h;
+		struct sk_buff *skb;
+		struct skb_shared_info *shinfo;
+		struct skb_shinfo_tun *sht;
+
+		if (unlikely(len < sizeof(h)))
+			return -EINVAL;
+
+		err = memcpy_fromiovec((void *)&h, iov, sizeof(h));
+		if (unlikely(err))
+			return -EFAULT;
+
+		len -= sizeof(h);
+		if (h.hdr_len > len)
+			return -EINVAL;
+
+		/* Without GSO, we copy entire packet. */
+		if (h.gso_type == VIRTIO_NET_HDR_GSO_NONE)
+			h.hdr_len = len;
+
+		skb = get_user_skb(tun, iov, h.hdr_len, len, sizeof(*sht));
+		if (IS_ERR(skb))
+			return PTR_ERR(skb);
+
+		if ((h.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+		    !skb_partial_csum_set(skb, h.csum_start, h.csum_offset)) {
+			kfree_skb(skb);
+			return -EINVAL;
+		}
+
+		shinfo = skb_shinfo(skb);
+		/* If it has fragments, set up destructor for later. */
+		if (shinfo->nr_frags) {
+			sht = (void *)(shinfo + 1);
+			shinfo->destructor = skb_finished;
+			sht->id = id;
+			sht->len = sizeof(h) + skb->len;
+		} else {
+			vring_used_buffer(tun->outring, id, sizeof(h)+skb->len);
+			wake = 1;
+		}
+		netif_rx_ni(skb);
+	}
+
+	if (wake)
+		vring_wake(tun->outring);
+
+	/* 0 or error. */
+	return id;
+}
+
+static struct vring_ops xmitops = {
+	.destroy = unset_xmit,
+	.push = xmit_packets,
+};
+
+static int set_xmit_vring(struct tun_struct *tun, int fd)
+{
+	struct vring_info *vi;
+
+	/* FIXME: Racy. */
+	vi = vring_attach(fd, &xmitops, tun, false);
+	if (IS_ERR(vi))
+		return PTR_ERR(vi);
+	tun->outring = vi;
+	return 0;
+}
 #else /* ... !CONFIG_VRINGFD */
 static int set_recv_vring(struct tun_struct *tun, int fd)
+{
+	return -ENOTTY;
+}
+
+static int set_xmit_vring(struct tun_struct *tun, int fd)
 {
 	return -ENOTTY;
 }
@@ -390,74 +656,26 @@ static unsigned int tun_chr_poll(struct 
 	return mask;
 }
 
-/* Get packet from user space buffer */
-static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
-{
-	struct tun_pi pi = { 0, __constant_htons(ETH_P_IP) };
-	struct sk_buff *skb;
-	size_t len = count, align = 0;
-
-	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) > count)
-			return -EINVAL;
-
-		if(memcpy_fromiovec((void *)&pi, iv, sizeof(pi)))
-			return -EFAULT;
-	}
-
-	if ((tun->flags & TUN_TYPE_MASK) == TUN_TAP_DEV) {
-		align = NET_IP_ALIGN;
-		if (unlikely(len < ETH_HLEN))
-			return -EINVAL;
-	}
-
-	if (!(skb = alloc_skb(len + align, GFP_KERNEL))) {
-		tun->dev->stats.rx_dropped++;
-		return -ENOMEM;
-	}
-
-	if (align)
-		skb_reserve(skb, align);
-	if (memcpy_fromiovec(skb_put(skb, len), iv, len)) {
-		tun->dev->stats.rx_dropped++;
-		kfree_skb(skb);
-		return -EFAULT;
-	}
-
-	switch (tun->flags & TUN_TYPE_MASK) {
-	case TUN_TUN_DEV:
-		skb_reset_mac_header(skb);
-		skb->protocol = pi.proto;
-		skb->dev = tun->dev;
-		break;
-	case TUN_TAP_DEV:
-		skb->protocol = eth_type_trans(skb, tun->dev);
-		break;
-	};
-
-	if (tun->flags & TUN_NOCHECKSUM)
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-	netif_rx_ni(skb);
-	tun->dev->last_rx = jiffies;
-
-	tun->dev->stats.rx_packets++;
-	tun->dev->stats.rx_bytes += len;
-
-	return count;
-}
-
 static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 			      unsigned long count, loff_t pos)
 {
 	struct tun_struct *tun = iocb->ki_filp->private_data;
+	size_t len;
+	struct sk_buff *skb;
 
 	if (!tun)
 		return -EBADFD;
 
 	DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
 
-	return tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
+	len = iov_length(iv, count);
+
+	skb = get_user_skb(tun, (struct iovec *)iv, len, len, 0);
+	if (IS_ERR(skb))
+		return PTR_ERR(skb);
+
+	netif_rx_ni(skb);
+	return len;
 }
 
 /* Put packet to the user space buffer */
@@ -795,7 +1013,10 @@ static int tun_chr_ioctl(struct inode *i
 #endif
 
 	case TUNSETRECVVRING:
-		return set_recv_vring(tun, arg);		
+		return set_recv_vring(tun, arg);
+
+	case TUNSETXMITVRING:
+		return set_xmit_vring(tun, arg);
 
 	case SIOCGIFFLAGS:
 		ifr.ifr_flags = tun->if_flags;
diff -r 8270b5fdf03f include/linux/if_tun.h
--- a/include/linux/if_tun.h	Sat Apr 05 22:49:10 2008 +1100
+++ b/include/linux/if_tun.h	Sat Apr 05 22:51:10 2008 +1100
@@ -43,6 +43,7 @@
 #define TUNSETLINK    _IOW('T', 205, int)
 #define TUNSETGROUP   _IOW('T', 206, int)
 #define TUNSETRECVVRING _IOW('T', 207, int)
+#define TUNSETXMITVRING _IOW('T', 208, int)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH RFC 5/5] lguest support
  2008-04-05 12:06     ` [PATCH RFC 4/5] tun: vringfd xmit support Rusty Russell
@ 2008-04-05 12:09       ` Rusty Russell
  2008-04-07  5:13       ` [PATCH RFC 4/5] tun: vringfd xmit support Herbert Xu
  1 sibling, 0 replies; 29+ messages in thread
From: Rusty Russell @ 2008-04-05 12:09 UTC (permalink / raw)
  To: linux-kernel; +Cc: netdev, virtualization, Max Krasnyansky

This is how lguest uses the vringfd tun support.  It needs more cleanup,
but it seems to basically work.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

diff -r 6979348a6ece Documentation/lguest/lguest.c
--- a/Documentation/lguest/lguest.c	Sat Apr 05 22:02:28 2008 +1100
+++ b/Documentation/lguest/lguest.c	Sat Apr 05 22:12:25 2008 +1100
@@ -43,6 +43,7 @@
 #include "linux/virtio_console.h"
 #include "linux/virtio_rng.h"
 #include "linux/virtio_ring.h"
+#include "linux/vring.h"
 #include "asm-x86/bootparam.h"
 /*L:110 We can ignore the 39 include files we need for this program, but I do
  * want to draw attention to the use of kernel-style types.
@@ -56,6 +57,10 @@ typedef uint16_t u16;
 typedef uint16_t u16;
 typedef uint8_t u8;
 /*:*/
+
+#ifndef __NR_vringfd
+#define __NR_vringfd		327
+#endif
 
 #define PAGE_PRESENT 0x7 	/* Present, RW, Execute */
 #define NET_PEERNUM 1
@@ -101,6 +106,9 @@ struct device_list
 
 	/* The descriptor page for the devices. */
 	u8 *descpage;
+
+	/* Pointer to last used in descpage */
+	u8 *nextdesc;
 
 	/* A single linked list of devices. */
 	struct device *dev;
@@ -853,6 +861,13 @@ static void handle_console_output(int fd
  * and write them (ignoring the first element) to this device's file descriptor
  * (/dev/net/tun).
  */
+struct virtio_net_info
+{
+	struct virtqueue *xmit_vq, *recv_vq;
+	u16 xmit_used, recv_used;
+	int xmitfd;
+};
+
 static void handle_net_output(int fd, struct virtqueue *vq)
 {
 	unsigned int head, out, in;
@@ -870,6 +885,15 @@ static void handle_net_output(int fd, st
 		len = writev(vq->dev->fd, iov+1, out-1);
 		add_used_and_trigger(fd, vq, head, len);
 	}
+}
+
+static void handle_netring_output(int fd, struct virtqueue *vq)
+{
+	struct virtio_net_info *ni = vq->dev->priv;
+
+	/* We have output, kick the kernel. */
+	if (write(ni->xmitfd, "", 0) != 0)
+		err(1, "Writing to xmitfd");
 }
 
 /* This is where we handle a packet coming in from the tun device to our
@@ -1054,18 +1078,13 @@ static struct lguest_device_desc *new_de
 static struct lguest_device_desc *new_dev_desc(u16 type)
 {
 	struct lguest_device_desc d = { .type = type };
-	void *p;
-
-	/* Figure out where the next device config is, based on the last one. */
-	if (devices.lastdev)
-		p = device_config(devices.lastdev)
-			+ devices.lastdev->desc->config_len;
-	else
-		p = devices.descpage;
+	void *p = devices.nextdesc;
 
 	/* We only have one page for all the descriptors. */
 	if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
 		errx(1, "Too many devices");
+
+	devices.nextdesc += sizeof(d);
 
 	/* p might not be aligned, so we memcpy in. */
 	return memcpy(p, &d, sizeof(d));
@@ -1104,6 +1123,7 @@ static void add_virtqueue(struct device 
 	 * yet, otherwise we'd be overwriting them. */
 	assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
 	memcpy(device_config(dev), &vq->config, sizeof(vq->config));
+	devices.nextdesc += sizeof(vq->config);
 	dev->desc->num_vq++;
 
 	verbose("Virtqueue page %#lx\n", to_guest_phys(p));
@@ -1133,6 +1153,7 @@ static void add_feature(struct device *d
 	if (dev->desc->feature_len <= bit / CHAR_BIT) {
 		assert(dev->desc->config_len == 0);
 		dev->desc->feature_len = (bit / CHAR_BIT) + 1;
+		devices.nextdesc = features + dev->desc->feature_len * 2;
 	}
 
 	features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
@@ -1147,8 +1168,10 @@ static void set_config(struct device *de
 	if (device_config(dev) + len > devices.descpage + getpagesize())
 		errx(1, "Too many devices");
 
+	assert(device_config(dev) == devices.nextdesc);
 	/* Copy in the config information, and store the length. */
 	memcpy(device_config(dev), conf, len);
+	devices.nextdesc += len;
 	dev->desc->config_len = len;
 }
 
@@ -1167,7 +1190,8 @@ static struct device *new_device(const c
 	 * to the device_list's fdset and maxfd. */
 	if (handle_input)
 		add_device_fd(dev->fd);
-	dev->desc = new_dev_desc(type);
+	if (type)
+		dev->desc = new_dev_desc(type);
 	dev->handle_input = handle_input;
 	dev->name = name;
 	dev->vq = NULL;
@@ -1295,11 +1319,30 @@ static void configure_device(int fd, con
 	memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
 }
 
+static bool xmitfd_used(int fd, struct device *dev)
+{
+	struct virtio_net_info *ni = dev->priv;
+
+	ni->xmit_used = ni->xmit_vq->vring.used->idx;
+	trigger_irq(fd, ni->xmit_vq);
+
+	return true;
+}
+
+static bool recvfd_used(int fd, struct device *dev)
+{
+	struct virtio_net_info *ni = dev->priv;
+
+	ni->recv_used = ni->recv_vq->vring.used->idx;
+	trigger_irq(fd, ni->recv_vq);
+	return true;
+}
+
 /*L:195 Our network is a Host<->Guest network.  This can either use bridging or
  * routing, but the principle is the same: it uses the "tun" device to inject
  * packets into the Host as if they came in from a normal network card.  We
  * just shunt packets between the Guest and the tun device. */
-static void setup_tun_net(const char *arg)
+static void setup_tun_net(const char *arg, bool rings)
 {
 	struct device *dev;
 	struct ifreq ifr;
@@ -1307,6 +1350,7 @@ static void setup_tun_net(const char *ar
 	u32 ip;
 	const char *br_name = NULL;
 	struct virtio_net_config conf;
+	struct virtio_net_info *ni;
 
 	/* We open the /dev/net/tun device and tell it we want a tap device.  A
 	 * tap device is like a tun device, only somehow different.  To tell
@@ -1318,17 +1362,63 @@ static void setup_tun_net(const char *ar
 	strcpy(ifr.ifr_name, "tap%d");
 	if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
 		err(1, "configuring /dev/net/tun");
-	/* We don't need checksums calculated for packets coming in this
-	 * device: trust us! */
-	ioctl(netfd, TUNSETNOCSUM, 1);
 
-	/* First we create a new network device. */
-	dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
+	if (rings) {
+		/* First we create a new network device. */
+		dev = new_device("net", VIRTIO_ID_NET, netfd, NULL);
+		add_virtqueue(dev, VIRTQUEUE_NUM, NULL);
+		add_virtqueue(dev, VIRTQUEUE_NUM, handle_netring_output);
+	} else {	
+		/* We don't need checksums calculated for packets coming in this
+		 * device: trust us! */
+		ioctl(netfd, TUNSETNOCSUM, 1);
 
-	/* Network devices need a receive and a send queue, just like
-	 * console. */
-	add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
-	add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
+		/* First we create a new network device. */
+		dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input);
+		/* When they add more receive buffers, try re-enabling input */
+		add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd);
+		add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output);
+	}
+
+	dev->priv = ni = malloc(sizeof(*ni));
+
+	ni->recv_vq = dev->vq;
+	ni->xmit_vq = dev->vq->next;
+	ni->recv_used = 0;
+	ni->xmit_used = 0;
+
+	if (rings) {
+		int xmitfd, recvfd;
+
+		/* Now we create the receive and xmit ringfds. */
+		recvfd = syscall(__NR_vringfd, dev->vq->vring.desc,
+				 VIRTQUEUE_NUM, &ni->recv_used);
+		if (recvfd < 0)
+			err(1, "Creating recv vringfd");
+
+		xmitfd = syscall(__NR_vringfd, dev->vq->next->vring.desc,
+				 VIRTQUEUE_NUM, &ni->xmit_used);
+		if (xmitfd < 0)
+			err(1, "Creating xmit vringfd");
+
+		/* Set offset & limit. */
+		if (ioctl(xmitfd, VRINGSETBASE, guest_base) != 0
+		    || ioctl(recvfd, VRINGSETBASE, guest_base) != 0
+		    || ioctl(xmitfd, VRINGSETLIMIT, guest_limit) != 0
+		    || ioctl(recvfd, VRINGSETLIMIT, guest_limit) != 0)
+			err(1, "Setting vring offset and limit");
+
+		/* Tell the tunnet to use them. */
+		if (ioctl(netfd, TUNSETRECVVRING, recvfd) != 0)
+			err(1, "Setting receive ring");
+		if (ioctl(netfd, TUNSETXMITVRING, xmitfd) != 0)
+			err(1, "Setting xmit ring");
+
+		/* Now we need to respond when they become readable. */
+		new_device("net", 0, recvfd, recvfd_used)->priv = ni;
+		new_device("net", 0, xmitfd, xmitfd_used)->priv = ni;
+		ni->xmitfd = xmitfd;
+	}
 
 	/* We need a socket to perform the magic network ioctls to bring up the
 	 * tap interface, connect to the bridge etc.  Any socket will do! */
@@ -1716,6 +1806,7 @@ static struct option opts[] = {
 static struct option opts[] = {
 	{ "verbose", 0, NULL, 'v' },
 	{ "tunnet", 1, NULL, 't' },
+	{ "tunring", 1, NULL, 'R' },
 	{ "block", 1, NULL, 'b' },
 	{ "rng", 0, NULL, 'r' },
 	{ "initrd", 1, NULL, 'i' },
@@ -1775,7 +1866,7 @@ int main(int argc, char *argv[])
 						      + DEVICE_PAGES);
 			guest_limit = mem;
 			guest_max = mem + DEVICE_PAGES*getpagesize();
-			devices.descpage = get_pages(1);
+			devices.descpage = devices.nextdesc = get_pages(1);
 			break;
 		}
 	}
@@ -1787,7 +1878,10 @@ int main(int argc, char *argv[])
 			verbose = true;
 			break;
 		case 't':
-			setup_tun_net(optarg);
+			setup_tun_net(optarg, false);
+			break;
+		case 'R':
+			setup_tun_net(optarg, true);
 			break;
 		case 'b':
 			setup_block_file(optarg);

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 2/5] vringfd base/offset
  2008-04-05 12:04 ` [PATCH RFC 2/5] vringfd base/offset Rusty Russell
  2008-04-05 12:05   ` [PATCH RFC 3/5] tun: vringfd receive support Rusty Russell
@ 2008-04-05 12:44   ` Avi Kivity
  2008-04-06  2:54     ` Rusty Russell
       [not found]   ` <200804052205.43824.rusty__2650.41595926068$1207397436$gmane$org@rustcorp.com.au>
  2008-04-08  5:14   ` [PATCH RFC 2/5] vringfd base/offset Arnd Bergmann
  3 siblings, 1 reply; 29+ messages in thread
From: Avi Kivity @ 2008-04-05 12:44 UTC (permalink / raw)
  To: Rusty Russell; +Cc: linux-kernel, netdev, Max Krasnyansky, virtualization

Rusty Russell wrote:
> It turns out the lguest (and possibly kvm) want the addresses in the
> ring buffer to only cover a certain part of memory, and be offset.
>
>  					<mailto:rusty@rustcorp.com.au>	
> +0xAE	00-01	linux/vring.h
>   

Another virtualization subsystem already uses this range.


-- 
Any sufficiently difficult bug is indistinguishable from a feature.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/5] vringfd syscall
       [not found] <200804052202.09157.rusty__7324.67876882783$1207397085$gmane$org@rustcorp.com.au>
@ 2008-04-05 17:13 ` Anthony Liguori
  2008-04-06  3:03   ` Rusty Russell
  0 siblings, 1 reply; 29+ messages in thread
From: Anthony Liguori @ 2008-04-05 17:13 UTC (permalink / raw)
  To: Rusty Russell; +Cc: linux-kernel, netdev, Max Krasnyansky, virtualization

Rusty Russell wrote:
> For virtualization, we've developed virtio_ring for efficient communication.
> This would also work well for userspace-kernel communication, particularly
> for things like the tun device.  By using the same ABI, we can join guests
> to the host kernel trivially.
> 
> These patches are fairly alpha; I've seen some network stalls I have to
> track down and there are some fixmes.
> 
> Comments welcome!
> Rusty.
> 
> diff -r 99132ad16999 Documentation/test_vring.c
> --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
> +++ b/Documentation/test_vring.c	Sat Apr 05 21:31:40 2008 +1100
> @@ -0,0 +1,47 @@
> +#include <unistd.h>
> +#include <linux/virtio_ring.h>
> +#include <stdio.h>
> +#include <stdint.h>
> +#include <err.h>
> +#include <poll.h>
> +
> +#ifndef __NR_vringfd
> +#define __NR_vringfd		327
> +#endif
> +
> +int main()
> +{
> +	int fd, r;
> +	struct vring vr;
> +	uint16_t used = 0;
> +	struct pollfd pfd;
> +	void *buf = calloc(vring_size(256, getpagesize()), 0);

Shouldn't this be calloc(1, vring_size(256, getpagesize()));?

> +	vring_init(&vr, 256, buf, getpagesize());
> +
> +	fd = syscall(__NR_vringfd, buf, 256, &used);
> +	if (fd < 0)
> +		err(1, "vringfd gave %i", fd);
> +
> +	pfd.fd = fd;
> +	pfd.events = POLLIN;
> +	r = poll(&pfd, 1, 0);
> +	
> +	if (r != 0)
> +		err(1, "poll gave %i", r);
> +
> +	vr.used->idx++;
> +	r = poll(&pfd, 1, 0);
> +	
> +	if (r != 1)
> +		err(1, "poll after buf used gave %i", r);
> +
> +	used++;
> +	r = poll(&pfd, 1, 0);
> +	
> +	if (r != 0)
> +		err(1, "poll after used incremented gave %i", r);

I have a tough time seeing what you're demonstrating here.  Perhaps some 
comments?

> +	close(fd);
> +	return 0;
> +}
> diff -r 99132ad16999 arch/x86/kernel/syscall_table_32.S
> --- a/arch/x86/kernel/syscall_table_32.S	Sat Apr 05 21:20:32 2008 +1100
> +++ b/arch/x86/kernel/syscall_table_32.S	Sat Apr 05 21:31:40 2008 +1100
> @@ -326,3 +326,4 @@ ENTRY(sys_call_table)
>  	.long sys_fallocate
>  	.long sys_timerfd_settime	/* 325 */
>  	.long sys_timerfd_gettime
> +	.long sys_vringfd
> diff -r 99132ad16999 fs/Kconfig
> --- a/fs/Kconfig	Sat Apr 05 21:20:32 2008 +1100
> +++ b/fs/Kconfig	Sat Apr 05 21:31:40 2008 +1100
> @@ -2135,4 +2135,14 @@ source "fs/nls/Kconfig"
>  source "fs/nls/Kconfig"
>  source "fs/dlm/Kconfig"
>  
> +config VRINGFD
> +       bool "vring fd support (EXPERIMENTAL)"
> +       depends on EXPERIMENTAL
> +       help
> +         vring is a ringbuffer implementation for efficient I/O.  It is
> +	 currently used by virtualization hosts (lguest, kvm) for efficient
> +	 networking using the tun driver.
> +
> +	 If unsure, say N.
> +

Should select VIRTIO && VIRTIO_RING

<snip>

> +/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer() */
> +int vring_get_buffer(struct vring_info *vr,
> +		     struct iovec *in_iov,
> +		     unsigned int *num_in, unsigned long *in_len,
> +		     struct iovec *out_iov,
> +		     unsigned int *num_out, unsigned long *out_len)
> +{

It seems unlikely that a caller could place both in_iov/out_iov on the 
stack since to do it safely, you would have to use vring.num which is 
determined by userspace.  Since you have to kmalloc() the buffers 
anyway, why not just allocate a single data structure within this 
function and return it.

> +void vring_used_buffer_atomic(struct vring_info *vr, int id, u32 len)
> +{
> +	struct vring_used_elem *used;
> +
> +	BUG_ON(id <= 0 || id > vr->ring.num);
> +	BUG_ON(!vr->used);
> +
> +	used = &vr->used->ring[vr->used->idx & vr->mask];
> +	used->id = id - 1;
> +	used->len = len;
> +	/* Make sure buffer is written before we update index. */
> +	wmb();
> +	vr->used->idx++;
> +}
> +EXPORT_SYMBOL_GPL(vring_used_buffer_atomic);

When is this actually safe to use?

> +
> +void vring_wake(struct vring_info *vr)
> +{
> +	wake_up(&vr->poll_wait);
> +}
> +EXPORT_SYMBOL_GPL(vring_wake);
> +
> +struct vring_info *vring_attach(int fd, const struct vring_ops *ops,
> +				void *data, bool atomic_use)
> +{
> +	struct file *filp;
> +	struct vring_info *vr;
> +
> +	/* Must be a valid fd, and must be one of ours. */
> +	filp = fget(fd);
> +	if (!filp) {
> +		vr = ERR_PTR(-EBADF);
> +		goto out;
> +	}
> +
> +	if (filp->f_op != &vring_fops) {
> +		vr = ERR_PTR(-EBADF);
> +		goto fput;
> +	}
> +
> +	/* Mutex simply protects against parallel vring_attach. */
> +	mutex_lock(&vring_lock);
> +	vr = filp->private_data;
> +	if (vr->ops) {
> +		vr = ERR_PTR(-EBUSY);
> +		goto unlock;
> +	}
> +
> +	/* If they want to use atomically, we have to map the page. */
> +	if (atomic_use) {
> +		if (get_user_pages(current, current->mm,
> +				   (unsigned long)vr->ring.used, 1, 1, 1,
> +				   &vr->used_page, NULL) != 1) {
> +			vr = ERR_PTR(-EFAULT);
> +			goto unlock;
> +		}

Oh, this is when it's safe to use.  You don't seem to be acquiring 
current->mm->mmap_sem here.  Also, this assumes the used ring fits 
within a single page which isn't true with a ring > 512 elements.

A consequence of this is that devices that interact with a ring queue 
atomically now have an additional capability: pinning an arbitrary 
amount of physical memory.

I think this means that it's no longer safe to give a tap fd to an 
unprivileged process regardless of how you're routing the traffic.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 2/5] vringfd base/offset
       [not found] ` <200804052204.28518.rusty__10896.9346424148$1207397431$gmane$org@rustcorp.com.au>
@ 2008-04-05 17:18   ` Anthony Liguori
  2008-04-06  3:23     ` Rusty Russell
  0 siblings, 1 reply; 29+ messages in thread
From: Anthony Liguori @ 2008-04-05 17:18 UTC (permalink / raw)
  To: Rusty Russell; +Cc: linux-kernel, netdev, Max Krasnyansky, virtualization

Rusty Russell wrote:
> It turns out the lguest (and possibly kvm) want the addresses in the
> ring buffer to only cover a certain part of memory, and be offset.
> 
> It makes sense that this be an ioctl.
> 
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
>

<snip>

> @@ -208,12 +232,15 @@ int vring_get_buffer(struct vring_info *
>  		out_len = &dummy;
>  
>  	*in_len = *out_len = 0;
> -	
> -	if (unlikely(get_user(head, &vr->ring.avail->ring[head]) != 0))
> +
> +	if (unlikely(get_user(head, &vr->ring.avail->ring[vr->last_avail
> +							  % vr->ring.num])))

Why not & with vr->mask for the sake of consistency with the rest of the 
code.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 3/5] tun: vringfd receive support.
       [not found]   ` <200804052205.43824.rusty__2650.41595926068$1207397436$gmane$org@rustcorp.com.au>
@ 2008-04-05 17:26     ` Anthony Liguori
  0 siblings, 0 replies; 29+ messages in thread
From: Anthony Liguori @ 2008-04-05 17:26 UTC (permalink / raw)
  To: Rusty Russell; +Cc: linux-kernel, netdev, Max Krasnyansky, virtualization

Rusty Russell wrote:
> This patch modifies tun to allow a vringfd to specify the receive
> buffer.  Because we can't copy to userspace in bh context, we queue
> like normal then use the "pull" hook to actually do the copy.
> 
> More thought needs to be put into the possible races with ring
> registration and a simultaneous close, for example (see FIXME).
> 
> We use struct virtio_net_hdr prepended to packets in the ring to allow
> userspace to receive GSO packets in future (at the moment, the tun
> driver doesn't tell the stack it can handle them, so these cases are
> never taken).
> 
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

<snip>

> +
> +#ifdef CONFIG_VRINGFD

I think the rest of the code needs these for it to actually work without 
VRINGFD.

> +static void unset_recv(void *_tun)
> +{
> +	struct tun_struct *tun = _tun;
> +
> +	tun->inring = NULL;
> +}
> +
> +/* Returns number of used buffers, or negative errno. */
> +static int pull_recv_skbs(void *_tun)
> +{
> +	struct tun_struct *tun = _tun;
> +	int err = 0, num_copied = 0;
> +	struct sk_buff *skb;
> +
> +	while ((skb = skb_dequeue(&tun->readq)) != NULL) {
> +		struct iovec iov[1+MAX_SKB_FRAGS];
> +		struct virtio_net_hdr gso = { 0 }; /* no info leak */
> +		unsigned int iovnum = ARRAY_SIZE(iov);
> +		unsigned long len;
> +		int id;
> +
> +		id = vring_get_buffer(tun->inring, iov, &iovnum, &len,
> +				      NULL, NULL, NULL);
> +		if (id <= 0) {
> +			err = id;
> +			break;
> +		}

Ah, I see now why you're passing something from the stack.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 2/5] vringfd base/offset
  2008-04-05 12:44   ` [PATCH RFC 2/5] vringfd base/offset Avi Kivity
@ 2008-04-06  2:54     ` Rusty Russell
  0 siblings, 0 replies; 29+ messages in thread
From: Rusty Russell @ 2008-04-06  2:54 UTC (permalink / raw)
  To: Avi Kivity; +Cc: linux-kernel, netdev, Max Krasnyansky, virtualization

On Saturday 05 April 2008 22:44:33 Avi Kivity wrote:
> Rusty Russell wrote:
> > It turns out the lguest (and possibly kvm) want the addresses in the
> > ring buffer to only cover a certain part of memory, and be offset.
> >
> >  					<mailto:rusty@rustcorp.com.au>
> > +0xAE	00-01	linux/vring.h
>
> Another virtualization subsystem already uses this range.

Nasty.  Not listed in the ioctl list, and didn't find it with grep 8(.  I 
don't think I ever used 0xAD which was assigned to me, so I can change to 
that.

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/5] vringfd syscall
  2008-04-05 17:13 ` Anthony Liguori
@ 2008-04-06  3:03   ` Rusty Russell
  0 siblings, 0 replies; 29+ messages in thread
From: Rusty Russell @ 2008-04-06  3:03 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: linux-kernel, netdev, Max Krasnyansky, virtualization

On Sunday 06 April 2008 03:13:41 Anthony Liguori wrote:
> > +	void *buf = calloc(vring_size(256, getpagesize()), 0);
>
> Shouldn't this be calloc(1, vring_size(256, getpagesize()));?

Heh, yes... spot the last minute change from malloc to calloc.

> > +	if (r != 0)
> > +		err(1, "poll after used incremented gave %i", r);
>
> I have a tough time seeing what you're demonstrating here.  Perhaps some
> comments?

Well, not I have lguest working, I can just blow away the test program.  It
just tests poll.

> > +config VRINGFD
> > +       bool "vring fd support (EXPERIMENTAL)"
> > +       depends on EXPERIMENTAL
> > +       help
> > +         vring is a ringbuffer implementation for efficient I/O.  It is
> > +	 currently used by virtualization hosts (lguest, kvm) for efficient
> > +	 networking using the tun driver.
> > +
> > +	 If unsure, say N.
> > +
>
> Should select VIRTIO && VIRTIO_RING

I don't think so.  It doesn't depend on either.

> > +/* Returns an error, or 0 (no buffers), or an id for vring_used_buffer()
> > */ +int vring_get_buffer(struct vring_info *vr,
> > +		     struct iovec *in_iov,
> > +		     unsigned int *num_in, unsigned long *in_len,
> > +		     struct iovec *out_iov,
> > +		     unsigned int *num_out, unsigned long *out_len)
> > +{
>
> It seems unlikely that a caller could place both in_iov/out_iov on the
> stack since to do it safely, you would have to use vring.num which is
> determined by userspace.  Since you have to kmalloc() the buffers
> anyway, why not just allocate a single data structure within this
> function and return it.

This needs a comment. num_out and num_in are in parameters specifying the 
maximum of each.

> > +	/* If they want to use atomically, we have to map the page. */
> > +	if (atomic_use) {
> > +		if (get_user_pages(current, current->mm,
> > +				   (unsigned long)vr->ring.used, 1, 1, 1,
> > +				   &vr->used_page, NULL) != 1) {
> > +			vr = ERR_PTR(-EFAULT);
> > +			goto unlock;
> > +		}
>
> Oh, this is when it's safe to use.  You don't seem to be acquiring
> current->mm->mmap_sem here.  Also, this assumes the used ring fits
> within a single page which isn't true with a ring > 512 elements.

Yes, this is a hack.  It actually means ring <= 256 for PAGE_SIZE 4096.  I'm 
not entirely comfortable with it.

> A consequence of this is that devices that interact with a ring queue
> atomically now have an additional capability: pinning an arbitrary
> amount of physical memory.

Erk, the size check that was supposed to be here got lost in the reshuffle :(.

One option is to use a sliding window, but better is to do best effort and 
have the tun driver fall back (this is actually possible with a slight 
change).

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 2/5] vringfd base/offset
  2008-04-05 17:18   ` Anthony Liguori
@ 2008-04-06  3:23     ` Rusty Russell
  0 siblings, 0 replies; 29+ messages in thread
From: Rusty Russell @ 2008-04-06  3:23 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: linux-kernel, netdev, Max Krasnyansky, virtualization

On Sunday 06 April 2008 03:18:59 Anthony Liguori wrote:
> Rusty Russell wrote:
> > -	if (unlikely(get_user(head, &vr->ring.avail->ring[head]) != 0))
> > +
> > +	if (unlikely(get_user(head, &vr->ring.avail->ring[vr->last_avail
> > +							  % vr->ring.num])))
>
> Why not & with vr->mask for the sake of consistency with the rest of the
> code.

Thanks, fixed.

Rusty.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 4/5] tun: vringfd xmit support.
  2008-04-05 12:06     ` [PATCH RFC 4/5] tun: vringfd xmit support Rusty Russell
  2008-04-05 12:09       ` [PATCH RFC 5/5] lguest support Rusty Russell
@ 2008-04-07  5:13       ` Herbert Xu
  2008-04-07  7:24         ` Rusty Russell
  1 sibling, 1 reply; 29+ messages in thread
From: Herbert Xu @ 2008-04-07  5:13 UTC (permalink / raw)
  To: Rusty Russell; +Cc: linux-kernel, netdev, virtualization, maxk

Rusty Russell <rusty@rustcorp.com.au> wrote:
>
> +/* We are done with this skb: put it in the used pile. */
> +static void skb_finished(struct skb_shared_info *sinfo)
> +{
> +       struct skb_shinfo_tun *sht = (void *)(sinfo + 1);
> +
> +       /* FIXME: Race prevention */
> +       vring_used_buffer_atomic(sht->tun->outring, sht->id, sht->len);
> +       vring_wake(sht->tun->outring);
> +
> +       /* Release device. */
> +       dev_put(sht->tun->dev);
> +}

On second thought, this is not going to work.  The network stack
can clone individual pages out of this skb and put it into a new
skb.  Therefore whatever scheme we come up with will either need
to be page-based, or add a flag to tell the network stack that it
can't clone those pages.

Cheers,
-- 
Visit Openswan at http://www.openswan.org/
Email: Herbert Xu ~{PmV>HI~} <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 4/5] tun: vringfd xmit support.
  2008-04-07  5:13       ` [PATCH RFC 4/5] tun: vringfd xmit support Herbert Xu
@ 2008-04-07  7:24         ` Rusty Russell
  2008-04-07  7:35           ` David Miller
  0 siblings, 1 reply; 29+ messages in thread
From: Rusty Russell @ 2008-04-07  7:24 UTC (permalink / raw)
  To: Herbert Xu; +Cc: linux-kernel, netdev, virtualization, maxk

On Monday 07 April 2008 15:13:44 Herbert Xu wrote:
> Rusty Russell <rusty@rustcorp.com.au> wrote:
> > +/* We are done with this skb: put it in the used pile. */
> > +static void skb_finished(struct skb_shared_info *sinfo)
> > +{
> > +       struct skb_shinfo_tun *sht = (void *)(sinfo + 1);
> > +
> > +       /* FIXME: Race prevention */
> > +       vring_used_buffer_atomic(sht->tun->outring, sht->id, sht->len);
> > +       vring_wake(sht->tun->outring);
> > +
> > +       /* Release device. */
> > +       dev_put(sht->tun->dev);
> > +}
>
> On second thought, this is not going to work.  The network stack
> can clone individual pages out of this skb and put it into a new
> skb.  Therefore whatever scheme we come up with will either need
> to be page-based, or add a flag to tell the network stack that it
> can't clone those pages.

Erk... I'll put in the latter for now.   A page-level solution is not really 
an option: if userspace hands us mmaped pages for example.

Thanks,
Rusty.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 4/5] tun: vringfd xmit support.
  2008-04-07  7:24         ` Rusty Russell
@ 2008-04-07  7:35           ` David Miller
  2008-04-08  1:51             ` Rusty Russell
  0 siblings, 1 reply; 29+ messages in thread
From: David Miller @ 2008-04-07  7:35 UTC (permalink / raw)
  To: rusty; +Cc: herbert, linux-kernel, netdev, virtualization, maxk

From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 7 Apr 2008 17:24:51 +1000

> On Monday 07 April 2008 15:13:44 Herbert Xu wrote:
> > On second thought, this is not going to work.  The network stack
> > can clone individual pages out of this skb and put it into a new
> > skb.  Therefore whatever scheme we come up with will either need
> > to be page-based, or add a flag to tell the network stack that it
> > can't clone those pages.
> 
> Erk... I'll put in the latter for now.   A page-level solution is not really 
> an option: if userspace hands us mmaped pages for example.

Keep in mind that the core of the TCP stack really depends
upon being able to slice and dice paged SKBs as is pleases
in order to send packets out.

In fact, it also does such splitting during SACK processing.

It really is a base requirement for efficient TSO support.
Otherwise the above operations would be so incredibly
expensive we might as well rip all of the TSO support out.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/5] vringfd syscall
  2008-04-05 12:02 [PATCH RFC 1/5] vringfd syscall Rusty Russell
  2008-04-05 12:04 ` [PATCH RFC 2/5] vringfd base/offset Rusty Russell
       [not found] ` <200804052204.28518.rusty__10896.9346424148$1207397431$gmane$org@rustcorp.com.au>
@ 2008-04-07 17:54 ` Jonathan Corbet
  2008-04-07 22:34   ` Rusty Russell
  2008-04-08  2:35 ` Arnd Bergmann
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 29+ messages in thread
From: Jonathan Corbet @ 2008-04-07 17:54 UTC (permalink / raw)
  To: Rusty Russell; +Cc: netdev, virtualization, Max Krasnyansky, linux-kernel

Hey, Rusty,

> For virtualization, we've developed virtio_ring for efficient communication.
> This would also work well for userspace-kernel communication, particularly
> for things like the tun device.  By using the same ABI, we can join guests
> to the host kernel trivially.

I'm *sure* you meant to document that somewhat non-trivial proposed new
kernel API as soon as you got a moment.  But, since I went to the
trouble of reverse engineering it, I decided not to wait and to make a
little unreliable guide of my own.  For those who are curious about how
vringfd() is meant to work (or about how badly I misunderstood it), the
info is at:

	http://lwn.net/SubscriberLink/276856/8c927025c53ad7ce/

Hopefully it will be helpful,

jon

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/5] vringfd syscall
  2008-04-07 17:54 ` [PATCH RFC 1/5] vringfd syscall Jonathan Corbet
@ 2008-04-07 22:34   ` Rusty Russell
  0 siblings, 0 replies; 29+ messages in thread
From: Rusty Russell @ 2008-04-07 22:34 UTC (permalink / raw)
  To: Jonathan Corbet; +Cc: netdev, virtualization, Max Krasnyansky, linux-kernel

On Tuesday 08 April 2008 03:54:34 Jonathan Corbet wrote:
> Hey, Rusty,
>
> > For virtualization, we've developed virtio_ring for efficient
> > communication. This would also work well for userspace-kernel
> > communication, particularly for things like the tun device.  By using the
> > same ABI, we can join guests to the host kernel trivially.
>
> I'm *sure* you meant to document that somewhat non-trivial proposed new
> kernel API as soon as you got a moment.

Actually, yes.  But I wanted to get it out there before I start the treck 
across to the virtualization summit.

A few points:
'The page alignment for the used array is important - that array might be 
mapped separately into kernel space.'
   Well, the used array is written by one side only, so it's possible to split 
the ring here and make each part r/o to the other side.  More importantly, a 
page boundary is almost certainly a cacheline boundary, and we already have a 
userspace interface for it.

'Note that the flags fields in the vring_avail and vring_used structures 
appear to be unused.'
   virtio uses these for wakeup/interrupt suppression.  It's a cheap way to 
avoid hypercalls, and we can use them the same way to avoid system calls (you 
set the suppression bit while you're actually looking at the ring).

The need for the kmap (and hence the atomic horror) has now been alleviated: I 
changed the shinfo destructor code to allow the destructor to hold onto the 
skb data so it can queue it and free it later.

BTW, the only place currently where both output and input buffers are used is 
the virtio_blk driver doing a read, where the header describes the operation, 
and the other buffers are overwritten with the data.

Thanks!
Rusty.

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 4/5] tun: vringfd xmit support.
  2008-04-07  7:35           ` David Miller
@ 2008-04-08  1:51             ` Rusty Russell
  0 siblings, 0 replies; 29+ messages in thread
From: Rusty Russell @ 2008-04-08  1:51 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, linux-kernel, netdev, virtualization, maxk

On Monday 07 April 2008 17:35:28 David Miller wrote:
> From: Rusty Russell <rusty@rustcorp.com.au>
> Date: Mon, 7 Apr 2008 17:24:51 +1000
>
> > On Monday 07 April 2008 15:13:44 Herbert Xu wrote:
> > > On second thought, this is not going to work.  The network stack
> > > can clone individual pages out of this skb and put it into a new
> > > skb.  Therefore whatever scheme we come up with will either need
> > > to be page-based, or add a flag to tell the network stack that it
> > > can't clone those pages.
> >
> > Erk... I'll put in the latter for now.   A page-level solution is not
> > really an option: if userspace hands us mmaped pages for example.
>
> Keep in mind that the core of the TCP stack really depends
> upon being able to slice and dice paged SKBs as is pleases
> in order to send packets out.
>
> In fact, it also does such splitting during SACK processing.
>
> It really is a base requirement for efficient TSO support.
> Otherwise the above operations would be so incredibly
> expensive we might as well rip all of the TSO support out.

In this case that's OK; these are only for packets coming from 
userspace/guest, so the only interaction with the tcp code is possibly as a 
recipient.

If tcp wanted to do zero-copy aio xmit, I think we'd need something else.

Cheers,
Rusty.





^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/5] vringfd syscall
  2008-04-05 12:02 [PATCH RFC 1/5] vringfd syscall Rusty Russell
                   ` (2 preceding siblings ...)
  2008-04-07 17:54 ` [PATCH RFC 1/5] vringfd syscall Jonathan Corbet
@ 2008-04-08  2:35 ` Arnd Bergmann
  2008-04-09 19:28 ` Jeremy Fitzhardinge
  2008-04-12 17:18 ` Marcelo Tosatti
  5 siblings, 0 replies; 29+ messages in thread
From: Arnd Bergmann @ 2008-04-08  2:35 UTC (permalink / raw)
  To: virtualization; +Cc: Rusty Russell, linux-kernel, netdev, Max Krasnyansky

On Saturday 05 April 2008, Rusty Russell wrote:
> +asmlinkage long sys_vringfd(void __user *addr,
> +                           unsigned num_descs,
> +                           u16 __user *last_used)
> +{
> +       int fd, err;
> +       struct file *filp;
> +       struct vring_info *vr;
> +
> +       /* Must be a power of two, and representable by u16 */
> +       if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536) {
> +               err = -EINVAL;
> +               goto out;
> +       }
> +
> +       fd = get_unused_fd();
> +       if (fd < 0) {
> +               err = fd;
> +               goto out;
> +       }
> +
> +       filp = alloc_file(vring_mnt, dget(vring_mnt->mnt_root), FMODE_WRITE,
> +                         &vring_fops);
> +       if (!filp) {
> +               err = -ENFILE;
> +               goto put_fd;
> +       }

This looks like a candidate for anon_inode_getfd(), which would let you
get rid of the code for registering your own file system.

	Arnd <><

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 2/5] vringfd base/offset
  2008-04-05 12:04 ` [PATCH RFC 2/5] vringfd base/offset Rusty Russell
                     ` (2 preceding siblings ...)
       [not found]   ` <200804052205.43824.rusty__2650.41595926068$1207397436$gmane$org@rustcorp.com.au>
@ 2008-04-08  5:14   ` Arnd Bergmann
  3 siblings, 0 replies; 29+ messages in thread
From: Arnd Bergmann @ 2008-04-08  5:14 UTC (permalink / raw)
  To: virtualization; +Cc: Rusty Russell, linux-kernel, netdev, Max Krasnyansky

On Saturday 05 April 2008, Rusty Russell wrote:
>  static const struct file_operations vring_fops = {
>         .release        = vring_release,
>         .write          = vring_write,
>         .poll           = vring_poll,
> +       .ioctl          = vring_ioctl,
>  };

This should also set the .compat_ioctl pointer. If the argument 
is defined as an actual memory location instead of a pointer,
you also don't need the compat_ptr() conversion for s390x, so
both can point to the same vring_ioctl function.

	Arnd <><

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 3/5] tun: vringfd receive support.
  2008-04-05 12:05   ` [PATCH RFC 3/5] tun: vringfd receive support Rusty Russell
  2008-04-05 12:06     ` [PATCH RFC 4/5] tun: vringfd xmit support Rusty Russell
@ 2008-04-08 19:49     ` Max Krasnyansky
  2008-04-09 12:46       ` Dor Laor
  2008-04-10  5:44       ` Rusty Russell
  1 sibling, 2 replies; 29+ messages in thread
From: Max Krasnyansky @ 2008-04-08 19:49 UTC (permalink / raw)
  To: Rusty Russell; +Cc: linux-kernel, netdev, virtualization

Rusty Russell wrote:
> This patch modifies tun to allow a vringfd to specify the receive
> buffer.  Because we can't copy to userspace in bh context, we queue
> like normal then use the "pull" hook to actually do the copy.
> 
> More thought needs to be put into the possible races with ring
> registration and a simultaneous close, for example (see FIXME).
> 
> We use struct virtio_net_hdr prepended to packets in the ring to allow
> userspace to receive GSO packets in future (at the moment, the tun
> driver doesn't tell the stack it can handle them, so these cases are
> never taken).

In general the code looks good. The only thing I could not convince myself in
is whether having generic ring buffer makes sense or not.
At least the TUN driver would be more efficient if it had its own simple ring
 implementation. Less indirection, fewer callbacks, fewer if()s, etc. TUN
already has the file descriptor and having two additional fds for rx and tx
ring is a waste (think of a VPN server that has to have a bunch of TUN fds).
Also as I mentioned before Jamal and I wanted to expose some of the SKB fields
through TUN device. With the rx/tx rings the natural way of doing that would
be the ring descriptor itself. It can of course be done the same way we copy
proto info (PI) and GSO stuff before the packet but that means more
copy_to_user() calls and yet more checks.

So. What am I missing ? Why do we need generic ring for the TUN ? I looked at
the lguest code a bit and it seems that we need a bunch of network specific
code anyway. The cool thing is that you can now mmap the rings into the guest
 directly but the same thing can be done with TUN specific rings.

Max

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 3/5] tun: vringfd receive support.
  2008-04-08 19:49     ` [PATCH RFC 3/5] tun: vringfd receive support Max Krasnyansky
@ 2008-04-09 12:46       ` Dor Laor
  2008-04-10 17:02         ` Max Krasnyanskiy
  2008-04-10  5:44       ` Rusty Russell
  1 sibling, 1 reply; 29+ messages in thread
From: Dor Laor @ 2008-04-09 12:46 UTC (permalink / raw)
  To: Max Krasnyansky; +Cc: Rusty Russell, linux-kernel, netdev, virtualization


On Tue, 2008-04-08 at 12:49 -0700, Max Krasnyansky wrote:
> Rusty Russell wrote:
> > This patch modifies tun to allow a vringfd to specify the receive
> > buffer.  Because we can't copy to userspace in bh context, we queue
> > like normal then use the "pull" hook to actually do the copy.
> > 
> > More thought needs to be put into the possible races with ring
> > registration and a simultaneous close, for example (see FIXME).
> > 
> > We use struct virtio_net_hdr prepended to packets in the ring to allow
> > userspace to receive GSO packets in future (at the moment, the tun
> > driver doesn't tell the stack it can handle them, so these cases are
> > never taken).
> 
> In general the code looks good. The only thing I could not convince myself in
> is whether having generic ring buffer makes sense or not.
> At least the TUN driver would be more efficient if it had its own simple ring
>  implementation. Less indirection, fewer callbacks, fewer if()s, etc. TUN
> already has the file descriptor and having two additional fds for rx and tx
> ring is a waste (think of a VPN server that has to have a bunch of TUN fds).
> Also as I mentioned before Jamal and I wanted to expose some of the SKB fields
> through TUN device. With the rx/tx rings the natural way of doing that would
> be the ring descriptor itself. It can of course be done the same way we copy
> proto info (PI) and GSO stuff before the packet but that means more
> copy_to_user() calls and yet more checks.
> 
> So. What am I missing ? Why do we need generic ring for the TUN ? I looked at
> the lguest code a bit and it seems that we need a bunch of network specific
> code anyway. The cool thing is that you can now mmap the rings into the guest
>  directly but the same thing can be done with TUN specific rings.
> 

The idea was to use the same virtio ring that the guests use.
The thing with TUN specific ring is that the guests are the one
allocating the rings within their memory space and the opposite makes
life very complex.
Cheers,
Dor

> Max
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/5] vringfd syscall
  2008-04-05 12:02 [PATCH RFC 1/5] vringfd syscall Rusty Russell
                   ` (3 preceding siblings ...)
  2008-04-08  2:35 ` Arnd Bergmann
@ 2008-04-09 19:28 ` Jeremy Fitzhardinge
  2008-04-12 17:18 ` Marcelo Tosatti
  5 siblings, 0 replies; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2008-04-09 19:28 UTC (permalink / raw)
  To: Rusty Russell; +Cc: linux-kernel, netdev, Max Krasnyansky, virtualization

Rusty Russell wrote:
> +asmlinkage long sys_vringfd(void __user *addr,
> +			    unsigned num_descs,
> +			    u16 __user *last_used)
> +{
> +	int fd, err;
> +	struct file *filp;
> +	struct vring_info *vr;
> +
> +	/* Must be a power of two, and representable by u16 */
> +	if (!num_descs || (num_descs & (num_descs-1)) || num_descs > 65536) {
>   

65536 doesn't fit in u16.

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 3/5] tun: vringfd receive support.
  2008-04-08 19:49     ` [PATCH RFC 3/5] tun: vringfd receive support Max Krasnyansky
  2008-04-09 12:46       ` Dor Laor
@ 2008-04-10  5:44       ` Rusty Russell
  2008-04-10 17:18         ` Max Krasnyanskiy
  1 sibling, 1 reply; 29+ messages in thread
From: Rusty Russell @ 2008-04-10  5:44 UTC (permalink / raw)
  To: Max Krasnyansky; +Cc: linux-kernel, netdev, virtualization

On Wednesday 09 April 2008 05:49:15 Max Krasnyansky wrote:
> Rusty Russell wrote:
> > This patch modifies tun to allow a vringfd to specify the receive
> > buffer.  Because we can't copy to userspace in bh context, we queue
> > like normal then use the "pull" hook to actually do the copy.
> >
> > More thought needs to be put into the possible races with ring
> > registration and a simultaneous close, for example (see FIXME).
> >
> > We use struct virtio_net_hdr prepended to packets in the ring to allow
> > userspace to receive GSO packets in future (at the moment, the tun
> > driver doesn't tell the stack it can handle them, so these cases are
> > never taken).
>
> In general the code looks good. The only thing I could not convince myself
> in is whether having generic ring buffer makes sense or not.
> At least the TUN driver would be more efficient if it had its own simple
> ring implementation. Less indirection, fewer callbacks, fewer if()s, etc.
> TUN already has the file descriptor and having two additional fds for rx
> and tx ring is a waste (think of a VPN server that has to have a bunch of
> TUN fds). Also as I mentioned before Jamal and I wanted to expose some of
> the SKB fields through TUN device. With the rx/tx rings the natural way of
> doing that would be the ring descriptor itself. It can of course be done
> the same way we copy proto info (PI) and GSO stuff before the packet but
> that means more copy_to_user() calls and yet more checks.
>
> So. What am I missing ? Why do we need generic ring for the TUN ? I looked
> at the lguest code a bit and it seems that we need a bunch of network
> specific code anyway. The cool thing is that you can now mmap the rings
> into the guest directly but the same thing can be done with TUN specific
> rings.

I started modifying tun to do this directly, but it ended up with a whole heap 
of code just for the rings, and a lot of current code (eg. read, write, poll) 
ended up inside an 'if (tun->rings) ... else {'.  Having a natural poll() 
interface for the rings made more sense, so being their own fds fell out 
naturally.

I decided to float this version because it does minimal damage to tun, and I 
know that other people have wanted rings before: I'd like to know if this is 
likely to be generic enough for them.

Thanks!
Rusty

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 3/5] tun: vringfd receive support.
  2008-04-09 12:46       ` Dor Laor
@ 2008-04-10 17:02         ` Max Krasnyanskiy
  0 siblings, 0 replies; 29+ messages in thread
From: Max Krasnyanskiy @ 2008-04-10 17:02 UTC (permalink / raw)
  To: dor.laor; +Cc: Rusty Russell, linux-kernel, netdev, virtualization

Dor Laor wrote:
> On Tue, 2008-04-08 at 12:49 -0700, Max Krasnyansky wrote:
>> Rusty Russell wrote:
>>> This patch modifies tun to allow a vringfd to specify the receive
>>> buffer.  Because we can't copy to userspace in bh context, we queue
>>> like normal then use the "pull" hook to actually do the copy.
>>>
>>> More thought needs to be put into the possible races with ring
>>> registration and a simultaneous close, for example (see FIXME).
>>>
>>> We use struct virtio_net_hdr prepended to packets in the ring to allow
>>> userspace to receive GSO packets in future (at the moment, the tun
>>> driver doesn't tell the stack it can handle them, so these cases are
>>> never taken).
>> In general the code looks good. The only thing I could not convince myself in
>> is whether having generic ring buffer makes sense or not.
>> At least the TUN driver would be more efficient if it had its own simple ring
>>  implementation. Less indirection, fewer callbacks, fewer if()s, etc. TUN
>> already has the file descriptor and having two additional fds for rx and tx
>> ring is a waste (think of a VPN server that has to have a bunch of TUN fds).
>> Also as I mentioned before Jamal and I wanted to expose some of the SKB fields
>> through TUN device. With the rx/tx rings the natural way of doing that would
>> be the ring descriptor itself. It can of course be done the same way we copy
>> proto info (PI) and GSO stuff before the packet but that means more
>> copy_to_user() calls and yet more checks.
>>
>> So. What am I missing ? Why do we need generic ring for the TUN ? I looked at
>> the lguest code a bit and it seems that we need a bunch of network specific
>> code anyway. The cool thing is that you can now mmap the rings into the guest
>>  directly but the same thing can be done with TUN specific rings.
>>
> 
> The idea was to use the same virtio ring that the guests use.
> The thing with TUN specific ring is that the guests are the one
> allocating the rings within their memory space and the opposite makes
> life very complex.

We can do the same thing with TUN rings. I mean have them allocated in the 
guest space. With that we'd still have all of the advantages that I listed 
above. ie We'd have ring descriptors that carry packet info, less indirection, 
etc.

Max



^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 3/5] tun: vringfd receive support.
  2008-04-10  5:44       ` Rusty Russell
@ 2008-04-10 17:18         ` Max Krasnyanskiy
  0 siblings, 0 replies; 29+ messages in thread
From: Max Krasnyanskiy @ 2008-04-10 17:18 UTC (permalink / raw)
  To: Rusty Russell; +Cc: linux-kernel, netdev, virtualization

Rusty Russell wrote:
> On Wednesday 09 April 2008 05:49:15 Max Krasnyansky wrote:
>> Rusty Russell wrote:
>>> This patch modifies tun to allow a vringfd to specify the receive
>>> buffer.  Because we can't copy to userspace in bh context, we queue
>>> like normal then use the "pull" hook to actually do the copy.
>>>
>>> More thought needs to be put into the possible races with ring
>>> registration and a simultaneous close, for example (see FIXME).
>>>
>>> We use struct virtio_net_hdr prepended to packets in the ring to allow
>>> userspace to receive GSO packets in future (at the moment, the tun
>>> driver doesn't tell the stack it can handle them, so these cases are
>>> never taken).
>> In general the code looks good. The only thing I could not convince myself
>> in is whether having generic ring buffer makes sense or not.
>> At least the TUN driver would be more efficient if it had its own simple
>> ring implementation. Less indirection, fewer callbacks, fewer if()s, etc.
>> TUN already has the file descriptor and having two additional fds for rx
>> and tx ring is a waste (think of a VPN server that has to have a bunch of
>> TUN fds). Also as I mentioned before Jamal and I wanted to expose some of
>> the SKB fields through TUN device. With the rx/tx rings the natural way of
>> doing that would be the ring descriptor itself. It can of course be done
>> the same way we copy proto info (PI) and GSO stuff before the packet but
>> that means more copy_to_user() calls and yet more checks.
>>
>> So. What am I missing ? Why do we need generic ring for the TUN ? I looked
>> at the lguest code a bit and it seems that we need a bunch of network
>> specific code anyway. The cool thing is that you can now mmap the rings
>> into the guest directly but the same thing can be done with TUN specific
>> rings.
> 
> I started modifying tun to do this directly, but it ended up with a whole heap 
> of code just for the rings, and a lot of current code (eg. read, write, poll) 
> ended up inside an 'if (tun->rings) ... else {'.  Having a natural poll() 
> interface for the rings made more sense, so being their own fds fell out 
> naturally.
Hmm, the version that I sent you awhile ago (remember I sent you an attachment 
with prototype of the new tun driver and user space code) was not that bad in 
that area. It mean it did not touch existing read()/write() path. The 
difference was that it allocated the rings and the data buffer in the kernel 
and mapped into the user-space. Which is not what you guys need but that's a 
separate thing.

The fd thing could be an issue. As I mentioned the example would be a VPN 
server (OpenVPN, etc) with a bunch of client connection (typically tun per 
connection).

> I decided to float this version because it does minimal damage to tun, and I
> know that other people have wanted rings before: I'd like to know if this is
> likely to be generic enough for them.
I see.

I'll try to spend some time on this in a near future and take a crack at the 
version with the TUN specific rings. Although I said that many times now and 
it may not happen in the near enough future :). In the mean time if your 
current version helps you guys a lot I do not mind us putting it in. We can 
always add another mode or something that uses internal rings and gradually 
obsolete old read()/write() and generic rings.

Max

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/5] vringfd syscall
  2008-04-05 12:02 [PATCH RFC 1/5] vringfd syscall Rusty Russell
                   ` (4 preceding siblings ...)
  2008-04-09 19:28 ` Jeremy Fitzhardinge
@ 2008-04-12 17:18 ` Marcelo Tosatti
  2008-04-12 17:39   ` Marcelo Tosatti
  2008-04-12 18:19   ` Rusty Russell
  5 siblings, 2 replies; 29+ messages in thread
From: Marcelo Tosatti @ 2008-04-12 17:18 UTC (permalink / raw)
  To: Rusty Russell; +Cc: linux-kernel, netdev, Max Krasnyansky, virtualization

Hi Rusty,

A couple comments below.

On Sat, Apr 05, 2008 at 10:02:08PM +1000, Rusty Russell wrote:

> +static unsigned int vring_poll(struct file *filp,
> +			       struct poll_table_struct *poll)
> +{
> +	struct vring_info *vr = filp->private_data;
> +	int err;
> +	unsigned int mask;
> +	u16 used, last_used;
> +
> +	/* Some uses of vrings require updating in user context.  This
> +	 * is best done close to the caller, ie. here. */
> +	if (vr->ops && vr->ops->pull) {
> +		err = vr->ops->pull(vr->ops_data);
> +		if (unlikely(err < 0))
> +			return err;
> +
> +		if (err > 0) {
> +			/* Buffers have been used, no need to check indices */
> +			mask = POLLIN | POLLRDNORM;
> +			goto poll_wait;
> +		}
> +	}
> +
> +	err = get_user(used, &vr->ring.used->idx);
> +	if (unlikely(err))
> +		return err;
> +
> +	err = get_user(last_used, vr->last_used);
> +	if (unlikely(err))
> +		return err;
> +
> +	/* More buffers have been used?  It's 'readable'. */
> +	if (used != last_used)
> +		mask = POLLIN | POLLRDNORM;
> +	else
> +		mask = 0;
> +
> +poll_wait:
> +	poll_wait(filp, &vr->poll_wait, poll);
> +
> +	return mask;
> +}

I suppose you are doing data copy in ->poll instead of ->read to save
a system call? This is weird, not conformant to what the interface is
supposed to do.

This way select/poll syscalls might block in userspace datacopy. The
application might have a higher priority fd in the fdset to be informed
of, for example.

So why not change this to the common arrangement, with vring_poll adding
the waitqueue with poll_wait() and vring_read doing the actual data copy ?

> +struct vring_info *vring_attach(int fd, const struct vring_ops *ops,
> +				void *data, bool atomic_use)
> +{
> +	struct file *filp;
> +	struct vring_info *vr;
> +
> +	/* Must be a valid fd, and must be one of ours. */
> +	filp = fget(fd);
> +	if (!filp) {
> +		vr = ERR_PTR(-EBADF);
> +		goto out;
> +	}
> +
> +	if (filp->f_op != &vring_fops) {
> +		vr = ERR_PTR(-EBADF);
> +		goto fput;
> +	}
> +
> +	/* Mutex simply protects against parallel vring_attach. */
> +	mutex_lock(&vring_lock);
> +	vr = filp->private_data;
> +	if (vr->ops) {
> +		vr = ERR_PTR(-EBUSY);
> +		goto unlock;
> +	}
> +
> +	/* If they want to use atomically, we have to map the page. */
> +	if (atomic_use) {
> +		if (get_user_pages(current, current->mm,
> +				   (unsigned long)vr->ring.used, 1, 1, 1,
> +				   &vr->used_page, NULL) != 1) {

Can't the same be achieved by the app mlocking the vring pages, which
then goes through standard rlimit checking ?

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/5] vringfd syscall
  2008-04-12 17:18 ` Marcelo Tosatti
@ 2008-04-12 17:39   ` Marcelo Tosatti
  2008-04-12 18:19   ` Rusty Russell
  1 sibling, 0 replies; 29+ messages in thread
From: Marcelo Tosatti @ 2008-04-12 17:39 UTC (permalink / raw)
  To: Rusty Russell; +Cc: linux-kernel, netdev, Max Krasnyansky, virtualization

On Sat, Apr 12, 2008 at 02:18:20PM -0300, Marcelo Tosatti wrote:

> > +		mask = POLLIN | POLLRDNORM;
> > +	else
> > +		mask = 0;
> > +
> > +poll_wait:
> > +	poll_wait(filp, &vr->poll_wait, poll);
> > +
> > +	return mask;
> > +}
> 
> I suppose you are doing data copy in ->poll instead of ->read to save
> a system call? This is weird, not conformant to what the interface is
> supposed to do.
> 
> This way select/poll syscalls might block in userspace datacopy. The
> application might have a higher priority fd in the fdset to be informed
> of, for example.
> 
> So why not change this to the common arrangement, with vring_poll adding
> the waitqueue with poll_wait() and vring_read doing the actual data copy ?
> 
> > +struct vring_info *vring_attach(int fd, const struct vring_ops *ops,
> > +				void *data, bool atomic_use)
> > +{
> > +	struct file *filp;
> > +	struct vring_info *vr;
> > +
> > +	/* Must be a valid fd, and must be one of ours. */
> > +	filp = fget(fd);
> > +	if (!filp) {
> > +		vr = ERR_PTR(-EBADF);
> > +		goto out;
> > +	}
> > +
> > +	if (filp->f_op != &vring_fops) {
> > +		vr = ERR_PTR(-EBADF);
> > +		goto fput;
> > +	}
> > +
> > +	/* Mutex simply protects against parallel vring_attach. */
> > +	mutex_lock(&vring_lock);
> > +	vr = filp->private_data;
> > +	if (vr->ops) {
> > +		vr = ERR_PTR(-EBUSY);
> > +		goto unlock;
> > +	}
> > +
> > +	/* If they want to use atomically, we have to map the page. */
> > +	if (atomic_use) {
> > +		if (get_user_pages(current, current->mm,
> > +				   (unsigned long)vr->ring.used, 1, 1, 1,
> > +				   &vr->used_page, NULL) != 1) {
> 
> Can't the same be achieved by the app mlocking the vring pages, which
> then goes through standard rlimit checking ?

Oh, this is a driver API to allow the copy to take place in atomic
contexes. You might want to add some sort of limit enforcement.

Also forgot mmap_sem there.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH RFC 1/5] vringfd syscall
  2008-04-12 17:18 ` Marcelo Tosatti
  2008-04-12 17:39   ` Marcelo Tosatti
@ 2008-04-12 18:19   ` Rusty Russell
  1 sibling, 0 replies; 29+ messages in thread
From: Rusty Russell @ 2008-04-12 18:19 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: linux-kernel, netdev, Max Krasnyansky, virtualization

On Sunday 13 April 2008 03:18:20 Marcelo Tosatti wrote:
> I suppose you are doing data copy in ->poll instead of ->read to save
> a system call? This is weird, not conformant to what the interface is
> supposed to do.

Hi Marcelo,

   Thanks for reading this.

   Yeah, I dislike it too.  The main motivation was that not all rings need to 
be "pulled" in this way, but it turns out that userspace can detect this 
(poll, check ring, if nothing, try read, check ring).  So next version will 
unscrew the poll callback.

> > +	/* If they want to use atomically, we have to map the page. */
> > +	if (atomic_use) {
> > +		if (get_user_pages(current, current->mm,
> > +				   (unsigned long)vr->ring.used, 1, 1, 1,
> > +				   &vr->used_page, NULL) != 1) {
>
> Can't the same be achieved by the app mlocking the vring pages, which
> then goes through standard rlimit checking ?

Tun/tap writes to this from any context.  Fortunately, this too is changing, 
as I came up with a sane way of queueing to userspace context for this case.

Expect updated patches Monday.

Thanks!
Rusty.

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2008-04-12 18:20 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-04-05 12:02 [PATCH RFC 1/5] vringfd syscall Rusty Russell
2008-04-05 12:04 ` [PATCH RFC 2/5] vringfd base/offset Rusty Russell
2008-04-05 12:05   ` [PATCH RFC 3/5] tun: vringfd receive support Rusty Russell
2008-04-05 12:06     ` [PATCH RFC 4/5] tun: vringfd xmit support Rusty Russell
2008-04-05 12:09       ` [PATCH RFC 5/5] lguest support Rusty Russell
2008-04-07  5:13       ` [PATCH RFC 4/5] tun: vringfd xmit support Herbert Xu
2008-04-07  7:24         ` Rusty Russell
2008-04-07  7:35           ` David Miller
2008-04-08  1:51             ` Rusty Russell
2008-04-08 19:49     ` [PATCH RFC 3/5] tun: vringfd receive support Max Krasnyansky
2008-04-09 12:46       ` Dor Laor
2008-04-10 17:02         ` Max Krasnyanskiy
2008-04-10  5:44       ` Rusty Russell
2008-04-10 17:18         ` Max Krasnyanskiy
2008-04-05 12:44   ` [PATCH RFC 2/5] vringfd base/offset Avi Kivity
2008-04-06  2:54     ` Rusty Russell
     [not found]   ` <200804052205.43824.rusty__2650.41595926068$1207397436$gmane$org@rustcorp.com.au>
2008-04-05 17:26     ` [PATCH RFC 3/5] tun: vringfd receive support Anthony Liguori
2008-04-08  5:14   ` [PATCH RFC 2/5] vringfd base/offset Arnd Bergmann
     [not found] ` <200804052204.28518.rusty__10896.9346424148$1207397431$gmane$org@rustcorp.com.au>
2008-04-05 17:18   ` Anthony Liguori
2008-04-06  3:23     ` Rusty Russell
2008-04-07 17:54 ` [PATCH RFC 1/5] vringfd syscall Jonathan Corbet
2008-04-07 22:34   ` Rusty Russell
2008-04-08  2:35 ` Arnd Bergmann
2008-04-09 19:28 ` Jeremy Fitzhardinge
2008-04-12 17:18 ` Marcelo Tosatti
2008-04-12 17:39   ` Marcelo Tosatti
2008-04-12 18:19   ` Rusty Russell
     [not found] <200804052202.09157.rusty__7324.67876882783$1207397085$gmane$org@rustcorp.com.au>
2008-04-05 17:13 ` Anthony Liguori
2008-04-06  3:03   ` Rusty Russell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).