[uml-devel] [PATCH 01/10] Epoll based interrupt controller

All of lore.kernel.org
 help / color / mirror / Atom feed

* [uml-devel] [PATCH 01/10] Epoll based interrupt controller
@ 2014-08-29  7:05 anton.ivanov
  2014-08-29  7:05 ` [uml-devel] [PATCH 02/10] Remove unnecessary 'reactivate' statements anton.ivanov
                   ` (7 more replies)
  0 siblings, 8 replies; 10+ messages in thread
From: anton.ivanov @ 2014-08-29  7:05 UTC (permalink / raw)
  To: user-mode-linux-devel

From: Anton Ivanov <antivano@cisco.com>

1. Minimum kernel 2.5.99
2. No "walk the list" lookups for received IRQs - immediate identification
of the correct handler to invoke
3. Full set of IRQ semantics - edge, level, read, write
    3.1. Write is now a *REAL* write - so if you (ab)use the
    write to signify NONE (as in line.c) you will hang!!!
    3.2. Read is fully backward compatible
4. Otherwise mostly compatible with original poll() based controller
5. Provides significant performance improvement (up to 10x times for
large device numbers) ands lays the groundwork for the network and
timer improvements to follow

Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
 arch/um/drivers/line.c            |    3 +-
 arch/um/include/shared/irq_user.h |   19 +-
 arch/um/include/shared/os.h       |   13 +-
 arch/um/kernel/irq.c              |  456 +++++++++++++++++++++++++------------
 arch/um/os-Linux/irq.c            |  145 +++++-------
 5 files changed, 392 insertions(+), 244 deletions(-)

diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 8035145..6c4511f 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
@@ -283,7 +284,7 @@ int line_setup_irq(int fd, int input, int output, struct line *line, void *data)
 	if (err)
 		return err;
 	if (output)
-		err = um_request_irq(driver->write_irq, fd, IRQ_WRITE,
+		err = um_request_irq(driver->write_irq, fd, IRQ_NONE,
 				     line_write_interrupt, IRQF_SHARED,
 				     driver->write_irq_name, data);
 	return err;
diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h
index df56330..8d6eaff 100644
--- a/arch/um/include/shared/irq_user.h
+++ b/arch/um/include/shared/irq_user.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
@@ -9,16 +10,18 @@
 #include <sysdep/ptrace.h>
 
 struct irq_fd {
-	struct irq_fd *next;
-	void *id;
-	int fd;
-	int type;
-	int irq;
-	int events;
-	int current_events;
+        struct irq_fd *next;
+        struct irq_fd *leaf;
+        void *id;
+        int fd;
+        int type;
+        int irq;
+        int events;
 };
 
-enum { IRQ_READ, IRQ_WRITE };
+#define IRQ_NONE 0
+#define IRQ_READ  1
+#define IRQ_WRITE 2 
 
 struct siginfo;
 extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 021104d..17b4e9f 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
@@ -276,15 +277,17 @@ extern void halt_skas(void);
 extern void reboot_skas(void);
 
 /* irq.c */
-extern int os_waiting_for_events(struct irq_fd *active_fds);
-extern int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds);
+
+extern int os_setup_epoll(int maxevents);
+extern int os_waiting_for_events_epoll(void *kernel_events, int maxevents);
+extern int os_add_epoll_fd (int events, int fd, void * data);
+extern int os_mod_epoll_fd (int events, int fd, void * data);
+extern int os_del_epoll_fd (int fd);
+
 extern void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
 		struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2);
 extern void os_free_irq_later(struct irq_fd *active_fds,
 		int irq, void *dev_id);
-extern int os_get_pollfd(int i);
-extern void os_set_pollfd(int i, int fd);
-extern void os_set_ioignore(void);
 
 /* sigio.c */
 extern int add_sigio_fd(int fd);
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 1d8505b..5d7ee49e 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  * Derived (i.e. mostly copied) from arch/i386/kernel/irq.c:
@@ -18,6 +19,61 @@
 #include <os.h>
 
 /*
+*	We are on the "kernel side" so we cannot pick up the sys/epoll.h 
+*	So we lift out of it the applicable key definitions.
+*/
+
+
+enum EPOLL_EVENTS
+  {
+	EPOLLIN = 0x001,
+#define EPOLLIN EPOLLIN
+	EPOLLPRI = 0x002,
+#define EPOLLPRI EPOLLPRI
+	EPOLLOUT = 0x004,
+#define EPOLLOUT EPOLLOUT
+	EPOLLRDNORM = 0x040,
+#define EPOLLRDNORM EPOLLRDNORM
+	EPOLLRDBAND = 0x080,
+#define EPOLLRDBAND EPOLLRDBAND
+	EPOLLWRNORM = 0x100,
+#define EPOLLWRNORM EPOLLWRNORM
+	EPOLLWRBAND = 0x200,
+#define EPOLLWRBAND EPOLLWRBAND
+	EPOLLMSG = 0x400,
+#define EPOLLMSG EPOLLMSG
+	EPOLLERR = 0x008,
+#define EPOLLERR EPOLLERR
+	EPOLLHUP = 0x010,
+#define EPOLLHUP EPOLLHUP
+	EPOLLRDHUP = 0x2000,
+#define EPOLLRDHUP EPOLLRDHUP
+	EPOLLONESHOT = (1 << 30),
+#define EPOLLONESHOT EPOLLONESHOT
+	EPOLLET = (1 << 31)
+#define EPOLLET EPOLLET
+  };
+
+
+typedef union epoll_data
+{
+	void *ptr;
+	int fd;
+	uint32_t u32;
+	uint64_t u64;
+} epoll_data_t;
+
+struct epoll_event
+{
+	uint32_t events;	/* Epoll events */
+	epoll_data_t data;	/* User data variable */
+} __attribute__ ((__packed__));
+
+#define MAX_EPOLL_EVENTS 16
+
+static struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
+
+/*
  * This list is accessed under irq_lock, except in sigio_handler,
  * where it is safe from being modified.  IRQ handlers won't change it -
  * if an IRQ source has vanished, it will be freed by free_irqs just
@@ -26,46 +82,98 @@
  * remove list elements, taking the irq_lock to do so.
  */
 static struct irq_fd *active_fds = NULL;
-static struct irq_fd **last_irq_ptr = &active_fds;
 
 extern void free_irqs(void);
 
+/*
+ the in_epoll_loop is not static on purpose - we will use this to
+ determine if we can do delayed queue flushes in devices. The idea is -
+ if we read 32 packets at a time using recvmmsg we need an
+ indication that we will be reading more so no point to send now
+ and flush the queue only once we are done with it
+*/
+
+DEFINE_SPINLOCK(uml_sigio_lock);
+
+int in_epoll_loop = 0;
+
+static DEFINE_SPINLOCK(irq_lock);
+static DEFINE_SPINLOCK(event_loop);
+
 void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 {
 	struct irq_fd *irq_fd;
-	int n;
+	unsigned long flags;
+
+	int n, i;
 
 	if (smp_sigio_handler())
 		return;
 
 	while (1) {
-		n = os_waiting_for_events(active_fds);
+		spin_lock_irqsave(&uml_sigio_lock, flags);
+		in_epoll_loop = 1;
+		n = os_waiting_for_events_epoll(
+			&epoll_events, MAX_EPOLL_EVENTS
+		);
 		if (n <= 0) {
-			if (n == -EINTR)
-				continue;
-			else break;
+			in_epoll_loop = 0;
+			spin_unlock_irqrestore(&uml_sigio_lock, flags);
+			break;
 		}
-
-		for (irq_fd = active_fds; irq_fd != NULL;
-		     irq_fd = irq_fd->next) {
-			if (irq_fd->current_events != 0) {
-				irq_fd->current_events = 0;
-				do_IRQ(irq_fd->irq, regs);
+		for (i = 0; i < n ; i++) {
+			for (
+				irq_fd = (struct irq_fd *)
+					epoll_events[i].data.ptr;
+				irq_fd != NULL;
+				irq_fd = irq_fd->leaf) {
+				if (epoll_events[i].events & irq_fd->events) {
+					do_IRQ(irq_fd->irq, regs);
+				}
 			}
 		}
+		in_epoll_loop = 0;
+		spin_unlock_irqrestore(&uml_sigio_lock, flags);
 	}
 
+	/* This needs a better way - it slows down the event loop */
+
 	free_irqs();
 }
-
-static DEFINE_SPINLOCK(irq_lock);
+#define TRUNK_FORMAT "trunk %d\tfd %03d, events %03x, dev %p\n"
+#define LEAF_FORMAT "leaf %d\tfd %03d, events %03x, dev %p\n"
+
+static void dump_interrupt_map (void) {
+	struct irq_fd * irq, *leaf ;
+	printk("MAP:\n");
+	for (irq  = active_fds; irq != NULL; irq = irq->next) {
+		printk(
+			TRUNK_FORMAT,
+			irq->irq, irq->fd, irq->events, irq->id
+		);
+		if (irq->leaf) {
+			for (	
+				leaf = irq->leaf;
+				leaf != NULL;
+				leaf = leaf->leaf
+			) {
+				printk(
+					LEAF_FORMAT,
+					leaf->irq,
+					leaf->fd,
+					leaf->events,
+					leaf->id
+				);
+			}
+		}
+	}
+}
 
 static int activate_fd(int irq, int fd, int type, void *dev_id)
 {
-	struct pollfd *tmp_pfd;
-	struct irq_fd *new_fd, *irq_fd;
+	struct irq_fd *new_fd, *irq_fd, *leaf ;
 	unsigned long flags;
-	int events, err, n;
+	int events = 0, acc_events = 0, err, n, skip = 0;
 
 	err = os_set_fd_async(fd);
 	if (err < 0)
@@ -76,64 +184,56 @@ static int activate_fd(int irq, int fd, int type, void *dev_id)
 	if (new_fd == NULL)
 		goto out;
 
-	if (type == IRQ_READ)
-		events = UM_POLLIN | UM_POLLPRI;
-	else events = UM_POLLOUT;
+	if (type & IRQ_READ)
+		events |= EPOLLIN | EPOLLPRI;
+	if (type & IRQ_WRITE)
+		events |= EPOLLOUT;
+
 	*new_fd = ((struct irq_fd) { .next  		= NULL,
-				     .id 		= dev_id,
-				     .fd 		= fd,
-				     .type 		= type,
-				     .irq 		= irq,
-				     .events 		= events,
-				     .current_events 	= 0 } );
+					 .leaf		= NULL,
+					 .id 		= dev_id,
+					 .fd 		= fd,
+					 .type 		= type,
+					 .irq 		= irq,
+					 .events 		= events });
 
 	err = -EBUSY;
 	spin_lock_irqsave(&irq_lock, flags);
+
 	for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) {
-		if ((irq_fd->fd == fd) && (irq_fd->type == type)) {
-			printk(KERN_ERR "Registering fd %d twice\n", fd);
-			printk(KERN_ERR "Irqs : %d, %d\n", irq_fd->irq, irq);
-			printk(KERN_ERR "Ids : 0x%p, 0x%p\n", irq_fd->id,
-			       dev_id);
-			goto out_unlock;
+		if (irq_fd->fd == fd) {
+		for (leaf = irq_fd; leaf != NULL; leaf = leaf->leaf) {
+			if (leaf->type == type) {
+				printk("Irqs : %d, %d\n", leaf->irq, irq);
+				printk("Ids : 0x%p, 0x%p\n", leaf->id, dev_id);
+				goto out_unlock;
+			}
+			acc_events |= leaf->events;
+		}
+		/* we insert it one-off-the-head - easiest
+		   we also pass our "head" as the pointer to mod
+		   so it walks correctly
+		*/
+		skip = 1;
+		new_fd->leaf = irq_fd->leaf;
+		irq_fd->leaf = new_fd;
+			if ((new_fd->events | acc_events) != acc_events) {
+				n = os_mod_epoll_fd(new_fd->events | acc_events, fd, irq_fd);
+			}
 		}
 	}
-
-	if (type == IRQ_WRITE)
-		fd = -1;
-
-	tmp_pfd = NULL;
-	n = 0;
-
-	while (1) {
-		n = os_create_pollfd(fd, events, tmp_pfd, n);
-		if (n == 0)
-			break;
-
-		/*
-		 * n > 0
-		 * It means we couldn't put new pollfd to current pollfds
-		 * and tmp_fds is NULL or too small for new pollfds array.
-		 * Needed size is equal to n as minimum.
-		 *
-		 * Here we have to drop the lock in order to call
-		 * kmalloc, which might sleep.
-		 * If something else came in and changed the pollfds array
-		 * so we will not be able to put new pollfd struct to pollfds
-		 * then we free the buffer tmp_fds and try again.
-		 */
-		spin_unlock_irqrestore(&irq_lock, flags);
-		kfree(tmp_pfd);
-
-		tmp_pfd = kmalloc(n, GFP_KERNEL);
-		if (tmp_pfd == NULL)
-			goto out_kfree;
-
-		spin_lock_irqsave(&irq_lock, flags);
+	if (! skip) {
+		/* proper IRQ registration */
+		new_fd->next = active_fds;
+		active_fds = new_fd;
+
+		if (new_fd->type != IRQ_NONE ) {	
+			n = os_add_epoll_fd(new_fd->events, fd, new_fd);
+		} else {
+			n = 0;
+		}
 	}
-
-	*last_irq_ptr = new_fd;
-	last_irq_ptr = &new_fd->next;
+	
 
 	spin_unlock_irqrestore(&irq_lock, flags);
 
@@ -141,122 +241,199 @@ static int activate_fd(int irq, int fd, int type, void *dev_id)
 	 * This calls activate_fd, so it has to be outside the critical
 	 * section.
 	 */
-	maybe_sigio_broken(fd, (type == IRQ_READ));
+
+	maybe_sigio_broken(fd, (type != IRQ_NONE));
 
 	return 0;
 
  out_unlock:
 	spin_unlock_irqrestore(&irq_lock, flags);
- out_kfree:
 	kfree(new_fd);
  out:
 	return err;
 }
 
-static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg)
+/* Must be called with irq_lock held */
+static struct irq_fd *find_irq_chain_by_fd(int fd)
 {
-	unsigned long flags;
+	struct irq_fd *irq;
 
-	spin_lock_irqsave(&irq_lock, flags);
-	os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr);
-	spin_unlock_irqrestore(&irq_lock, flags);
+	for (irq = active_fds; irq != NULL; irq = irq->next) {
+		if (irq->fd == fd) {
+		return irq;
+		}
+	}
+	if (irq == NULL) {
+		printk(KERN_ERR
+			"find_irq_chain_by_fd doesn't have descriptor %d\n",
+			   fd);
+		dump_interrupt_map();
+	}
+	return irq;
 }
 
-struct irq_and_dev {
-	int irq;
-	void *dev;
-};
-
-static int same_irq_and_dev(struct irq_fd *irq, void *d)
+static struct irq_fd *find_irq_by_fd(int fd, int irqnum)
 {
-	struct irq_and_dev *data = d;
+	struct irq_fd *irq;
 
-	return ((irq->irq == data->irq) && (irq->id == data->dev));
+	for (irq = find_irq_chain_by_fd(fd); irq != NULL; irq = irq->leaf) {
+		if (irq->irq == irqnum)	return irq;
+	}
+	if (irq == NULL) {
+		printk(KERN_ERR "find_irq_by_fd doesn't have descriptor %d\n",
+			   fd);
+		dump_interrupt_map();
+	}
+	return irq;
 }
 
-static void free_irq_by_irq_and_dev(unsigned int irq, void *dev)
-{
-	struct irq_and_dev data = ((struct irq_and_dev) { .irq  = irq,
-							  .dev  = dev });
-
-	free_irq_by_cb(same_irq_and_dev, &data);
+static void free_leaf_irq_by_irq_and_dev(unsigned int irq, void *dev, struct irq_fd * prev) {
+	/* this is called out of free_irq_by_irq_and_dev with a held lock */
+	struct irq_fd *leaf;
+	if (prev != NULL) {
+		leaf = prev->leaf;
+	} else {
+		return;
+	}
+	while (leaf != NULL) {
+		if ((leaf->irq == irq) && (leaf->id == dev)) {
+			if (leaf->events) {
+				os_del_epoll_fd(leaf->fd);
+			}
+			prev->leaf = leaf->leaf;
+			kfree(leaf);
+		} else {
+			prev = leaf;
+		}
+		leaf = prev->leaf;
+	}
 }
 
-static int same_fd(struct irq_fd *irq, void *fd)
+static int do_free_irq_by_irq_and_dev(unsigned int irq, void *dev)
 {
-	return (irq->fd == *((int *)fd));
-}
+	unsigned long flags;
+	struct irq_fd *prev, * trunk;
+	spin_lock_irqsave(&irq_lock, flags);
 
-void free_irq_by_fd(int fd)
-{
-	free_irq_by_cb(same_fd, &fd);
-}
+	trunk = active_fds;
+	prev = NULL;
 
-/* Must be called with irq_lock held */
-static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out)
-{
-	struct irq_fd *irq;
-	int i = 0;
-	int fdi;
+	while (trunk != NULL) {
+		
+		/* walk the branch and free irq descriptor if on branch */
 
-	for (irq = active_fds; irq != NULL; irq = irq->next) {
-		if ((irq->fd == fd) && (irq->irq == irqnum))
-			break;
-		i++;
+		if (trunk->leaf != NULL) {
+			free_leaf_irq_by_irq_and_dev(irq, dev, trunk);
+		}
+
+		if ((trunk->irq == irq) && (trunk->id == dev)) {
+			/* delete irq descriptor off trunk */
+			if (trunk->leaf != NULL) {
+				/* leaf non-null, attach instead of
+				   freed irq descriptor
+				 */
+				if (prev != NULL) {
+					prev->next = trunk->leaf;
+				} else {
+					active_fds = trunk->leaf;
+				}
+				trunk->leaf->next = trunk->next;
+				if (trunk->events) {
+					os_del_epoll_fd(trunk->fd);
+				}
+				kfree(trunk);
+			} else {
+				if (prev != NULL) {
+					prev->next = trunk->next;
+				} else {
+					active_fds = trunk->next;
+				}
+				if (trunk->events) {
+					os_del_epoll_fd(trunk->fd);
+				}
+				kfree(trunk);
+			}
+			/* irq + dev should be unique, it is also easier
+			   to restart than to juggle all the pointers after
+			   making holes in the list
+			*/
+			return 1;
+		}
+		prev = trunk;
+		trunk = trunk->next;
 	}
-	if (irq == NULL) {
-		printk(KERN_ERR "find_irq_by_fd doesn't have descriptor %d\n",
-		       fd);
-		goto out;
+	spin_unlock_irqrestore(&irq_lock, flags);
+	return 0;
+}
+static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) {
+	while (do_free_irq_by_irq_and_dev(irq, dev) != 0) {
 	}
-	fdi = os_get_pollfd(i);
-	if ((fdi != -1) && (fdi != fd)) {
-		printk(KERN_ERR "find_irq_by_fd - mismatch between active_fds "
-		       "and pollfds, fd %d vs %d, need %d\n", irq->fd,
-		       fdi, fd);
-		irq = NULL;
-		goto out;
+}
+
+void free_irq_by_fd(int fd)
+{
+	struct irq_fd *irq, * found;
+	unsigned long flags;
+	spin_lock_irqsave(&irq_lock, flags);
+	found = find_irq_chain_by_fd(fd);
+	if (found == NULL) {
+		spin_unlock_irqrestore(&irq_lock, flags);
+		return;
 	}
-	*index_out = i;
- out:
-	return irq;
+	os_del_epoll_fd(fd);
+	/* free the whole chain */
+	while (found != NULL) {
+		irq = found;
+		found = found->leaf;
+		kfree(irq);
+	}
+	spin_unlock_irqrestore(&irq_lock, flags);
+	
 }
 
 void reactivate_fd(int fd, int irqnum)
 {
-	struct irq_fd *irq;
+	struct irq_fd *irq, * found;
 	unsigned long flags;
-	int i;
+	int acc_events = 0;
 
 	spin_lock_irqsave(&irq_lock, flags);
-	irq = find_irq_by_fd(fd, irqnum, &i);
-	if (irq == NULL) {
+	found = find_irq_chain_by_fd(fd);
+	if (found == NULL) {
 		spin_unlock_irqrestore(&irq_lock, flags);
 		return;
 	}
-	os_set_pollfd(i, irq->fd);
+	for (
+		irq = found;
+		irq != NULL;
+		irq = irq->leaf) {
+		acc_events |= irq->events;
+	}
+	if (os_add_epoll_fd(acc_events, fd, found) !=0) {
+		os_mod_epoll_fd(acc_events, fd, found);
+	}
 	spin_unlock_irqrestore(&irq_lock, flags);
-
 	add_sigio_fd(fd);
+
 }
 
 void deactivate_fd(int fd, int irqnum)
 {
 	struct irq_fd *irq;
 	unsigned long flags;
-	int i;
 
 	spin_lock_irqsave(&irq_lock, flags);
-	irq = find_irq_by_fd(fd, irqnum, &i);
+	irq = find_irq_by_fd(fd, irqnum);
 	if (irq == NULL) {
 		spin_unlock_irqrestore(&irq_lock, flags);
 		return;
 	}
 
-	os_set_pollfd(i, -1);
+	os_del_epoll_fd(irq->fd);
 	spin_unlock_irqrestore(&irq_lock, flags);
-
 	ignore_sigio_fd(fd);
+
+
 }
 EXPORT_SYMBOL(deactivate_fd);
 
@@ -272,12 +449,11 @@ int deactivate_all_fds(void)
 	int err;
 
 	for (irq = active_fds; irq != NULL; irq = irq->next) {
+		os_del_epoll_fd(irq->fd);   /* ignore err, just do it */
 		err = os_clear_fd_async(irq->fd);
 		if (err)
 			return err;
 	}
-	/* If there is a signal already queued, after unblocking ignore it */
-	os_set_ioignore();
 
 	return 0;
 }
@@ -311,13 +487,13 @@ int um_request_irq(unsigned int irq, int fd, int type,
 {
 	int err;
 
-	if (fd != -1) {
+	err = request_irq(irq, handler, irqflags, devname, dev_id);
+
+	if ((!err) && (fd != -1)) {
 		err = activate_fd(irq, fd, type, dev_id);
-		if (err)
-			return err;
 	}
 
-	return request_irq(irq, handler, irqflags, devname, dev_id);
+	return err;
 }
 
 EXPORT_SYMBOL(um_request_irq);
@@ -355,9 +531,9 @@ void __init init_IRQ(void)
 	int i;
 
 	irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq);
-
-	for (i = 1; i < NR_IRQS; i++)
+	for (i = 1; i < NR_IRQS - 1 ; i++)
 		irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq);
+	os_setup_epoll(MAX_EPOLL_EVENTS);
 }
 
 /*
@@ -385,11 +561,11 @@ void __init init_IRQ(void)
  * thread_info.
  *
  * There are three cases -
- *     The first interrupt on the stack - sets up the thread_info and
+ *	 The first interrupt on the stack - sets up the thread_info and
  * handles the interrupt
- *     A nested interrupt interrupting the copying of the thread_info -
+ *	 A nested interrupt interrupting the copying of the thread_info -
  * can't handle the interrupt, as the stack is in an unknown state
- *     A nested interrupt not interrupting the copying of the
+ *	 A nested interrupt not interrupting the copying of the
  * thread_info - doesn't do any setup, just handles the interrupt
  *
  * The first job is to figure out whether we interrupted stack setup.
diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c
index b9afb74..837aa68 100644
--- a/arch/um/os-Linux/irq.c
+++ b/arch/um/os-Linux/irq.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
@@ -6,6 +7,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <poll.h>
+#include <sys/epoll.h>
 #include <signal.h>
 #include <string.h>
 #include <irq_user.h>
@@ -16,117 +18,80 @@
  * Locked by irq_lock in arch/um/kernel/irq.c.  Changed by os_create_pollfd
  * and os_free_irq_by_cb, which are called under irq_lock.
  */
-static struct pollfd *pollfds = NULL;
-static int pollfds_num = 0;
-static int pollfds_size = 0;
 
-int os_waiting_for_events(struct irq_fd *active_fds)
+/* epoll support */
+
+
+static int epollfd = -1;
+
+int os_setup_epoll(int maxevents) {
+	epollfd = epoll_create(maxevents);
+	return epollfd;
+}
+
+int os_waiting_for_events_epoll(void *kernel_events, int maxevents)
 {
-	struct irq_fd *irq_fd;
-	int i, n, err;
+	int n, err;
 
-	n = poll(pollfds, pollfds_num, 0);
+	n = epoll_wait(epollfd,
+		(struct epoll_event *) kernel_events, maxevents, 0);
 	if (n < 0) {
 		err = -errno;
 		if (errno != EINTR)
-			printk(UM_KERN_ERR "os_waiting_for_events:"
-			       " poll returned %d, errno = %d\n", n, errno);
+			printk(
+				UM_KERN_ERR "os_waiting_for_events:"
+				" poll returned %d, error = %s\n", n,
+				strerror(errno)
+			);
 		return err;
 	}
 
-	if (n == 0)
-		return 0;
-
-	irq_fd = active_fds;
-
-	for (i = 0; i < pollfds_num; i++) {
-		if (pollfds[i].revents != 0) {
-			irq_fd->current_events = pollfds[i].revents;
-			pollfds[i].fd = -1;
-		}
-		irq_fd = irq_fd->next;
-	}
 	return n;
 }
 
-int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds)
-{
-	if (pollfds_num == pollfds_size) {
-		if (size_tmpfds <= pollfds_size * sizeof(pollfds[0])) {
-			/* return min size needed for new pollfds area */
-			return (pollfds_size + 1) * sizeof(pollfds[0]);
-		}
-
-		if (pollfds != NULL) {
-			memcpy(tmp_pfd, pollfds,
-			       sizeof(pollfds[0]) * pollfds_size);
-			/* remove old pollfds */
-			kfree(pollfds);
-		}
-		pollfds = tmp_pfd;
-		pollfds_size++;
-	} else
-		kfree(tmp_pfd);	/* remove not used tmp_pfd */
+int os_add_epoll_fd (int events, int fd, void * data) {
+	struct epoll_event event;
+	int result;
 
-	pollfds[pollfds_num] = ((struct pollfd) { .fd		= fd,
-						  .events	= events,
-						  .revents	= 0 });
-	pollfds_num++;
-
-	return 0;
+	event.data.ptr = data;
+	event.events = events | EPOLLET;
+	result = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event);
+	if ((result) && (errno == EEXIST)) {
+		result = os_mod_epoll_fd (events, fd, data);
+	}
+	if (result) {
+		printk("epollctl add err fd %d, %s\n", fd, strerror(errno));
+	}
+	return result;
 }
 
-void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
-		struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2)
-{
-	struct irq_fd **prev;
-	int i = 0;
-
-	prev = &active_fds;
-	while (*prev != NULL) {
-		if ((*test)(*prev, arg)) {
-			struct irq_fd *old_fd = *prev;
-			if ((pollfds[i].fd != -1) &&
-			    (pollfds[i].fd != (*prev)->fd)) {
-				printk(UM_KERN_ERR "os_free_irq_by_cb - "
-				       "mismatch between active_fds and "
-				       "pollfds, fd %d vs %d\n",
-				       (*prev)->fd, pollfds[i].fd);
-				goto out;
-			}
-
-			pollfds_num--;
-
-			/*
-			 * This moves the *whole* array after pollfds[i]
-			 * (though it doesn't spot as such)!
-			 */
-			memmove(&pollfds[i], &pollfds[i + 1],
-			       (pollfds_num - i) * sizeof(pollfds[0]));
-			if (*last_irq_ptr2 == &old_fd->next)
-				*last_irq_ptr2 = prev;
-
-			*prev = (*prev)->next;
-			if (old_fd->type == IRQ_WRITE)
-				ignore_sigio_fd(old_fd->fd);
-			kfree(old_fd);
-			continue;
-		}
-		prev = &(*prev)->next;
-		i++;
+int os_mod_epoll_fd (int events, int fd, void * data) {
+	struct epoll_event event;
+	int result;
+	event.data.ptr = data;
+	event.events = events;
+	result = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event);
+	if (result) {
+		printk("epollctl mod err fd %d, %s\n", fd, strerror(errno));
 	}
- out:
-	return;
+	return result;
 }
 
-int os_get_pollfd(int i)
-{
-	return pollfds[i].fd;
+int os_del_epoll_fd (int fd) {
+	struct epoll_event event;
+	int result;
+	result = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event);
+	if (result) {
+		printk("epollctl del err %s\n", strerror(errno));
+	}
+	return result;
 }
 
-void os_set_pollfd(int i, int fd)
+void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
+		struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2)
 {
-	pollfds[i].fd = fd;
+	printk("Someone invoking obsolete deactivate_by_CB!!!\n");
+	return;
 }
 
 void os_set_ioignore(void)
-- 
1.7.10.4


------------------------------------------------------------------------------
Slashdot TV.  
Video for Nerds.  Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [uml-devel] [PATCH 02/10] Remove unnecessary 'reactivate' statements
  2014-08-29  7:05 [uml-devel] [PATCH 01/10] Epoll based interrupt controller anton.ivanov
@ 2014-08-29  7:05 ` anton.ivanov
  2014-08-29  7:05 ` [uml-devel] [PATCH 03/10] High performance networking subsystem anton.ivanov
                   ` (6 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: anton.ivanov @ 2014-08-29  7:05 UTC (permalink / raw)
  To: user-mode-linux-devel

From: Anton Ivanov <antivano@cisco.com>

The epoll based controller has real (not emulated) edge and
level semantics and the edge/level is handled by epoll. There
is no toggling of the poll set any more, thus it is removed
throughout

Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
 arch/um/drivers/chan_kern.c     |    2 --
 arch/um/drivers/line.c          |    2 --
 arch/um/drivers/mconsole_kern.c |    2 --
 arch/um/drivers/net_kern.c      |    2 --
 arch/um/drivers/port_kern.c     |    1 -
 arch/um/drivers/random.c        |    1 -
 arch/um/drivers/ubd_kern.c      |    1 -
 7 files changed, 11 deletions(-)

diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index acbe6c6..db0ff51 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -564,8 +564,6 @@ void chan_interrupt(struct line *line, int irq)
 			tty_insert_flip_char(port, c, TTY_NORMAL);
 	} while (err > 0);
 
-	if (err == 0)
-		reactivate_fd(chan->fd, irq);
 	if (err == -EIO) {
 		if (chan->primary) {
 			tty_port_tty_hangup(&line->port, false);
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 6c4511f..1e8df84 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -663,8 +663,6 @@ static irqreturn_t winch_interrupt(int irq, void *data)
 		tty_kref_put(tty);
 	}
  out:
-	if (winch->fd != -1)
-		reactivate_fd(winch->fd, WINCH_IRQ);
 	return IRQ_HANDLED;
 }
 
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 3df3bd5..2b9bfa7 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -95,7 +95,6 @@ static irqreturn_t mconsole_interrupt(int irq, void *dev_id)
 	}
 	if (!list_empty(&mc_requests))
 		schedule_work(&mconsole_work);
-	reactivate_fd(fd, MCONSOLE_IRQ);
 	return IRQ_HANDLED;
 }
 
@@ -243,7 +242,6 @@ void mconsole_stop(struct mc_request *req)
 		(*req->cmd->handler)(req);
 	}
 	os_set_fd_block(req->originating_fd, 0);
-	reactivate_fd(req->originating_fd, MCONSOLE_IRQ);
 	mconsole_reply(req, "", 0, 0);
 }
 
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 39f1862..64d8426 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -137,8 +137,6 @@ static irqreturn_t uml_net_interrupt(int irq, void *dev_id)
 		schedule_work(&lp->work);
 		goto out;
 	}
-	reactivate_fd(lp->fd, UM_ETH_IRQ);
-
 out:
 	spin_unlock(&lp->lock);
 	return IRQ_HANDLED;
diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c
index 40ca5cc..b0e9ff3 100644
--- a/arch/um/drivers/port_kern.c
+++ b/arch/um/drivers/port_kern.c
@@ -137,7 +137,6 @@ static void port_work_proc(struct work_struct *unused)
 		if (!port->has_connection)
 			continue;
 
-		reactivate_fd(port->fd, ACCEPT_IRQ);
 		while (port_accept(port))
 			;
 		port->has_connection = 0;
diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index 9e3a722..ec3d788 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -72,7 +72,6 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size,
 				return ret ? : -EAGAIN;
 
 			atomic_inc(&host_sleep_count);
-			reactivate_fd(random_fd, RANDOM_IRQ);
 			add_sigio_fd(random_fd);
 
 			add_wait_queue(&host_read_wait, &wait);
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 3716e69..1cc72ae5 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -466,7 +466,6 @@ static void ubd_handler(void)
 		blk_end_request(req->req, 0, req->length);
 		kfree(req);
 	}
-	reactivate_fd(thread_fd, UBD_IRQ);
 
 	list_for_each_safe(list, next_ele, &restart){
 		ubd = container_of(list, struct ubd, restart);
-- 
1.7.10.4


------------------------------------------------------------------------------
Slashdot TV.  
Video for Nerds.  Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [uml-devel] [PATCH 03/10] High performance networking subsystem
  2014-08-29  7:05 [uml-devel] [PATCH 01/10] Epoll based interrupt controller anton.ivanov
  2014-08-29  7:05 ` [uml-devel] [PATCH 02/10] Remove unnecessary 'reactivate' statements anton.ivanov
@ 2014-08-29  7:05 ` anton.ivanov
  2014-08-30  7:35   ` Anton Ivanov (antivano)
  2014-08-29  7:05 ` [uml-devel] [PATCH 04/10] L2TPv3 Transport Driver for UML anton.ivanov
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 10+ messages in thread
From: anton.ivanov @ 2014-08-29  7:05 UTC (permalink / raw)
  To: user-mode-linux-devel

From: Anton Ivanov <antivano@cisco.com>

Support for multi-packet vector IO - multiple packets
read in one syscall and written in one syscall. Should work with
legacy UML, thorough tested only for the epoll based IRQ controller

Minimal host kernel version for RX - 2.6.32
Minimal host kernel version for TX - 3.0

Tested on Debian 7.0/Ubuntu 12.x LTS which have the relevant
syscalls, but do not have the appropriate glibc routine for TX
(this is why it is a direct syscall).

Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
 arch/um/drivers/Makefile          |    2 +-
 arch/um/drivers/net_kern.c        |   63 ++++++++++++++++++++++++-------------
 arch/um/include/asm/irq.h         |   26 +++++++++------
 arch/um/include/shared/net_kern.h |   24 ++++++++++++++
 arch/um/include/shared/net_user.h |   24 ++++++++++++++
 arch/um/kernel/irq.c              |    3 ++
 6 files changed, 109 insertions(+), 33 deletions(-)

diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index e7582e1..836baaf 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -10,7 +10,7 @@ slip-objs := slip_kern.o slip_user.o
 slirp-objs := slirp_kern.o slirp_user.o
 daemon-objs := daemon_kern.o daemon_user.o
 umcast-objs := umcast_kern.o umcast_user.o
-net-objs := net_kern.o net_user.o
+net-objs := net_kern.o net_user.o net_extra_user.o net_extra_kern.o
 mconsole-objs := mconsole_kern.o mconsole_user.o
 hostaudio-objs := hostaudio_kern.o
 ubd-objs := ubd_kern.o ubd_user.o
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 64d8426..1d253fa 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
  * James Leu (jleu@mindspring.net).
@@ -29,6 +30,7 @@
 
 static DEFINE_SPINLOCK(opened_lock);
 static LIST_HEAD(opened);
+static int rr_counter = 0;
 
 /*
  * The drop_skb is used when we can't allocate an skb.  The
@@ -42,6 +44,7 @@ static DEFINE_SPINLOCK(drop_lock);
 static struct sk_buff *drop_skb;
 static int drop_max;
 
+
 static int update_drop_skb(int max)
 {
 	struct sk_buff *new;
@@ -77,24 +80,38 @@ static int uml_net_rx(struct net_device *dev)
 	struct sk_buff *skb;
 
 	/* If we can't allocate memory, try again next round. */
-	skb = dev_alloc_skb(lp->max_packet);
-	if (skb == NULL) {
-		drop_skb->dev = dev;
-		/* Read a packet into drop_skb and don't do anything with it. */
-		(*lp->read)(lp->fd, drop_skb, lp);
-		dev->stats.rx_dropped++;
+	if (lp->options & UML_NET_USE_SKB_READ) {
+	    /* we expect a full formed, well behaved skb from zero copy drivers here */
+	    skb = (*lp->skb_read)(lp);
+	    if (skb == NULL) {
 		return 0;
-	}
-
-	skb->dev = dev;
-	skb_put(skb, lp->max_packet);
-	skb_reset_mac_header(skb);
-	pkt_len = (*lp->read)(lp->fd, skb, lp);
-
-	if (pkt_len > 0) {
+	    }
+	    pkt_len = skb->len;
+	} else {
+	    skb = dev_alloc_skb(lp->max_packet + 32);
+	    if (skb == NULL) {
+		    drop_skb->dev = dev;
+		    /* Read a packet into drop_skb and don't do anything with it. */
+		    (*lp->read)(lp->fd, drop_skb, lp);
+		    dev->stats.rx_dropped++;
+		    return 0;
+	    }
+
+	    skb_reserve(skb,32);
+	    skb->dev = dev;
+	    skb_put(skb, lp->max_packet);
+	    skb_reset_mac_header(skb);
+
+	    // Mark that virtual devices cannot provide required checksum.
+	    skb->ip_summed = CHECKSUM_NONE;
+	    pkt_len = (*lp->read)(lp->fd, skb, lp);
+	    if (pkt_len > 0) {
 		skb_trim(skb, pkt_len);
 		skb->protocol = (*lp->protocol)(skb);
+	    }
+	}
 
+	if (pkt_len > 0) {
 		dev->stats.rx_bytes += skb->len;
 		dev->stats.rx_packets++;
 		netif_rx(skb);
@@ -192,8 +209,9 @@ static int uml_net_close(struct net_device *dev)
 	struct uml_net_private *lp = netdev_priv(dev);
 
 	netif_stop_queue(dev);
+	deactivate_fd(lp->fd, dev->irq);
 
-	um_free_irq(dev->irq, dev);
+	free_irq(dev->irq, dev);
 	if (lp->close != NULL)
 		(*lp->close)(lp->fd, &lp->user);
 	lp->fd = -1;
@@ -216,7 +234,6 @@ static int uml_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	spin_lock_irqsave(&lp->lock, flags);
 
 	len = (*lp->write)(lp->fd, skb, lp);
-	skb_tx_timestamp(skb);
 
 	if (len == skb->len) {
 		dev->stats.tx_packets++;
@@ -273,14 +290,13 @@ static void uml_net_poll_controller(struct net_device *dev)
 static void uml_net_get_drvinfo(struct net_device *dev,
 				struct ethtool_drvinfo *info)
 {
-	strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
-	strlcpy(info->version, "42", sizeof(info->version));
+	strcpy(info->driver, DRIVER_NAME);
+	strcpy(info->version, "42");
 }
 
 static const struct ethtool_ops uml_net_ethtool_ops = {
 	.get_drvinfo	= uml_net_get_drvinfo,
 	.get_link	= ethtool_op_get_link,
-	.get_ts_info	= ethtool_op_get_ts_info,
 };
 
 static void uml_net_user_timer_expire(unsigned long _conn)
@@ -447,6 +463,7 @@ static void eth_configure(int n, void *init, char *mac,
 	 * These just fill in a data structure, so there's no failure
 	 * to be worried about.
 	 */
+	dev->ethtool_ops = &uml_net_ethtool_ops;
 	(*transport->kern->init)(dev, init);
 
 	*lp = ((struct uml_net_private)
@@ -459,7 +476,9 @@ static void eth_configure(int n, void *init, char *mac,
 		  .open 		= transport->user->open,
 		  .close 		= transport->user->close,
 		  .remove 		= transport->user->remove,
+		  .options 		= transport->kern->options,
 		  .read 		= transport->kern->read,
+		  .skb_read 		= transport->kern->skb_read,
 		  .write 		= transport->kern->write,
 		  .add_address 		= transport->user->add_address,
 		  .delete_address  	= transport->user->delete_address });
@@ -475,9 +494,9 @@ static void eth_configure(int n, void *init, char *mac,
 
 	dev->mtu = transport->user->mtu;
 	dev->netdev_ops = &uml_netdev_ops;
-	dev->ethtool_ops = &uml_net_ethtool_ops;
 	dev->watchdog_timeo = (HZ >> 1);
-	dev->irq = UM_ETH_IRQ;
+	dev->irq = UM_ETH_BASE_IRQ + (rr_counter % UM_ETH_IRQ_RR); 
+	rr_counter++;
 
 	err = update_drop_skb(lp->max_packet);
 	if (err)
@@ -829,7 +848,7 @@ static void close_devices(void)
 	spin_lock(&opened_lock);
 	list_for_each(ele, &opened) {
 		lp = list_entry(ele, struct uml_net_private, list);
-		um_free_irq(lp->dev->irq, lp->dev);
+		free_irq(lp->dev->irq, lp->dev);
 		if ((lp->close != NULL) && (lp->fd >= 0))
 			(*lp->close)(lp->fd, &lp->user);
 		if (lp->remove != NULL)
diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h
index 4a2037f..be9128b 100644
--- a/arch/um/include/asm/irq.h
+++ b/arch/um/include/asm/irq.h
@@ -1,21 +1,27 @@
+
 #ifndef __UM_IRQ_H
 #define __UM_IRQ_H
 
+#define UM_ETH_IRQ_RR	        32
+
 #define TIMER_IRQ		0
 #define UMN_IRQ			1
 #define CONSOLE_IRQ		2
 #define CONSOLE_WRITE_IRQ	3
 #define UBD_IRQ			4
-#define UM_ETH_IRQ		5
-#define SSL_IRQ			6
-#define SSL_WRITE_IRQ		7
-#define ACCEPT_IRQ		8
-#define MCONSOLE_IRQ		9
-#define WINCH_IRQ		10
-#define SIGIO_WRITE_IRQ 	11
-#define TELNETD_IRQ 		12
-#define XTERM_IRQ 		13
-#define RANDOM_IRQ 		14
+#define UM_ETH_BASE_IRQ		5
+
+#define UM_END_ETH_IRQ	        UM_ETH_BASE_IRQ + UM_ETH_IRQ_RR
+
+#define SSL_IRQ			UM_END_ETH_IRQ + 1
+#define SSL_WRITE_IRQ		UM_END_ETH_IRQ + 2
+#define ACCEPT_IRQ		UM_END_ETH_IRQ + 3
+#define MCONSOLE_IRQ		UM_END_ETH_IRQ + 4
+#define WINCH_IRQ		UM_END_ETH_IRQ + 5
+#define SIGIO_WRITE_IRQ 	UM_END_ETH_IRQ + 6
+#define TELNETD_IRQ 		UM_END_ETH_IRQ + 7
+#define XTERM_IRQ 		UM_END_ETH_IRQ + 8
+#define RANDOM_IRQ 		UM_END_ETH_IRQ + 9
 
 #define LAST_IRQ RANDOM_IRQ
 #define NR_IRQS (LAST_IRQ + 1)
diff --git a/arch/um/include/shared/net_kern.h b/arch/um/include/shared/net_kern.h
index 012ac87..2229126 100644
--- a/arch/um/include/shared/net_kern.h
+++ b/arch/um/include/shared/net_kern.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
@@ -13,6 +14,8 @@
 #include <linux/list.h>
 #include <linux/workqueue.h>
 
+#define UML_NET_USE_SKB_READ 1
+
 struct uml_net {
 	struct list_head list;
 	struct net_device *dev;
@@ -28,6 +31,7 @@ struct uml_net_private {
 
 	struct work_struct work;
 	int fd;
+	unsigned int options;
 	unsigned char mac[ETH_ALEN];
 	int max_packet;
 	unsigned short (*protocol)(struct sk_buff *);
@@ -36,6 +40,7 @@ struct uml_net_private {
 	void (*remove)(void *);
 	int (*read)(int, struct sk_buff *skb, struct uml_net_private *);
 	int (*write)(int, struct sk_buff *skb, struct uml_net_private *);
+	struct sk_buff * (*skb_read)(struct uml_net_private *);
 
 	void (*add_address)(unsigned char *, unsigned char *, void *);
 	void (*delete_address)(unsigned char *, unsigned char *, void *);
@@ -47,6 +52,8 @@ struct net_kern_info {
 	unsigned short (*protocol)(struct sk_buff *);
 	int (*read)(int, struct sk_buff *skb, struct uml_net_private *);
 	int (*write)(int, struct sk_buff *skb, struct uml_net_private *);
+	struct sk_buff * (*skb_read)(struct uml_net_private *);
+	unsigned int options;
 };
 
 struct transport {
@@ -59,11 +66,28 @@ struct transport {
 	const int setup_size;
 };
 
+struct mmsg_queue_info {
+	int fd;
+	struct mmsghdr * mmsg_send_vector; 
+	void ** skb_send_vector;
+	int queue_depth, head, tail, max_depth;
+	spinlock_t head_lock; 
+	spinlock_t tail_lock; 
+	unsigned int queue_fsm;
+};
+ 
 extern struct net_device *ether_init(int);
 extern unsigned short ether_protocol(struct sk_buff *);
 extern int tap_setup_common(char *str, char *type, char **dev_name,
 			    char **mac_out, char **gate_addr);
 extern void register_transport(struct transport *new);
 extern unsigned short eth_protocol(struct sk_buff *skb);
+extern struct sk_buff *my_build_skb(void * head, void *data, unsigned int frag_size);
+
+extern void flush_pending_netio(void);
+
+extern int uml_net_advance_tail( struct mmsg_queue_info * queue_info, int advance); 
+extern int uml_net_advance_head( struct mmsg_queue_info * queue_info, int advance); 
+extern int uml_net_flush_mmsg_queue(struct mmsg_queue_info * queue_info, int queue_depth);
 
 #endif
diff --git a/arch/um/include/shared/net_user.h b/arch/um/include/shared/net_user.h
index 3dabbe1..4b46f37 100644
--- a/arch/um/include/shared/net_user.h
+++ b/arch/um/include/shared/net_user.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012 - 2014 Cisco Systems
  * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
  * Licensed under the GPL
  */
@@ -38,10 +39,15 @@ extern void tap_check_ips(char *gate_addr, unsigned char *eth_addr);
 extern void read_output(int fd, char *output_out, int len);
 
 extern int net_read(int fd, void *buf, int len);
+extern int net_readv(int fd, void *iov, int iovcnt);
 extern int net_recvfrom(int fd, void *buf, int len);
+extern int net_recvfrom2(int fd, void *buf, int len, void *src_addr, int *addrlen);
 extern int net_write(int fd, void *buf, int len);
+extern int net_writev(int fd, void *iov, int iovcnt);
 extern int net_send(int fd, void *buf, int len);
 extern int net_sendto(int fd, void *buf, int len, void *to, int sock_len);
+extern int net_sendmessage(int fd, void *msg, int flags);
+extern int net_recvmessage(int fd, void *msg, int flags);
 
 extern void open_addr(unsigned char *addr, unsigned char *netmask, void *arg);
 extern void close_addr(unsigned char *addr, unsigned char *netmask, void *arg);
@@ -50,4 +56,22 @@ extern char *split_if_spec(char *str, ...);
 
 extern int dev_netmask(void *d, void *m);
 
+
+extern void uml_net_destroy_skb(void * skb);
+extern void * uml_net_build_skb (void * dev);
+extern void * uml_net_skb_data (void * skb);
+
+extern void add_skbuffs(void * msgvec, void ** skbvec, int size, int skb_size, int offset);
+extern void add_header_buffers(void * msgvec, int size, int header_size);
+extern void * build_mmsg_vector(int size, int iovsize);
+extern void rebuild_skbuf_vector(void ** skbvec, int size, void * dev);
+extern void * build_skbuf_vector(int size, void * dev);
+extern int net_recvmmsg(int fd, void *msgvec, unsigned int vlen,
+		unsigned int flags, struct timespec *timeout);
+extern int net_sendmmsg(int fd, void *msgvec, unsigned int vlen,
+		unsigned int flags);
+extern void repair_mmsg (void *msgvec, int iovsize, int header_size);
+extern void destroy_skb_vector(void ** vector, int size);
+extern void destroy_mmsg_vector(void * mmsgvector, int size, int free_iov_base);
+
 #endif
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 5d7ee49e..f4c6fb1 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -17,6 +17,7 @@
 #include <as-layout.h>
 #include <kern_util.h>
 #include <os.h>
+#include <net_kern.h>
 
 /*
 *	We are on the "kernel side" so we cannot pick up the sys/epoll.h 
@@ -136,6 +137,8 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 		spin_unlock_irqrestore(&uml_sigio_lock, flags);
 	}
 
+	flush_pending_netio();
+
 	/* This needs a better way - it slows down the event loop */
 
 	free_irqs();
-- 
1.7.10.4


------------------------------------------------------------------------------
Slashdot TV.  
Video for Nerds.  Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [uml-devel] [PATCH 04/10] L2TPv3 Transport Driver for UML
  2014-08-29  7:05 [uml-devel] [PATCH 01/10] Epoll based interrupt controller anton.ivanov
  2014-08-29  7:05 ` [uml-devel] [PATCH 02/10] Remove unnecessary 'reactivate' statements anton.ivanov
  2014-08-29  7:05 ` [uml-devel] [PATCH 03/10] High performance networking subsystem anton.ivanov
@ 2014-08-29  7:05 ` anton.ivanov
  2014-08-29  7:05 ` [uml-devel] [PATCH 05/10] GRE transport " anton.ivanov
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: anton.ivanov @ 2014-08-29  7:05 UTC (permalink / raw)
  To: user-mode-linux-devel

From: Anton Ivanov <antivano@cisco.com>

This transport allows a UML to connect to another UML local
or remote, the Linux host or any other network device running
the industry standard Ethernet over L2TPv3 protocol as per
RFC 3931 (and successors).

The transport supports a common set of features with the kernel
implementation as well as the Cisco contributed L2TPv3 transport
for QEMU/KVM. In all cases this is static tunnels only, no L2TPv3
control plane.

Additionally, the transport supports the so called "soft"
termination where it can listen for an incoming connection
which does not require the remote endpoint to be specified
at configuration time.

Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
 arch/um/Kconfig.net               |   10 +
 arch/um/drivers/Makefile          |    2 +
 arch/um/drivers/uml_l2tpv3.h      |  121 ++++++++++
 arch/um/drivers/uml_l2tpv3_kern.c |  442 +++++++++++++++++++++++++++++++++++++
 arch/um/drivers/uml_l2tpv3_user.c |  420 +++++++++++++++++++++++++++++++++++
 5 files changed, 995 insertions(+)
 create mode 100644 arch/um/drivers/uml_l2tpv3.h
 create mode 100644 arch/um/drivers/uml_l2tpv3_kern.c
 create mode 100644 arch/um/drivers/uml_l2tpv3_user.c

diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net
index 820a56f..9a98aa5 100644
--- a/arch/um/Kconfig.net
+++ b/arch/um/Kconfig.net
@@ -84,6 +84,16 @@ config UML_NET_SLIP
         UMLs on a single host).  You may choose more than one without
         conflict.  If you don't need UML networking, say N.
 
+config UML_NET_L2TPV3
+	bool "L2TPV3 transport"
+	depends on UML_NET
+	help
+        This User-Mode Linux network transport allows one or more running
+        UMLs on single or multiple hosts to communicate with each other,
+        the host as well as other remote or local network devices supporting
+        the industry standard Ethernet over L2TPv3 protocol as described in
+        the applicable RFCs
+
 config UML_NET_DAEMON
 	bool "Daemon transport"
 	depends on UML_NET
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index 836baaf..f54c279 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -9,6 +9,7 @@
 slip-objs := slip_kern.o slip_user.o
 slirp-objs := slirp_kern.o slirp_user.o
 daemon-objs := daemon_kern.o daemon_user.o
+uml_l2tpv3-objs := uml_l2tpv3_kern.o uml_l2tpv3_user.o
 umcast-objs := umcast_kern.o umcast_user.o
 net-objs := net_kern.o net_user.o net_extra_user.o net_extra_kern.o
 mconsole-objs := mconsole_kern.o mconsole_user.o
@@ -43,6 +44,7 @@ obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o
 obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o
 obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o
 obj-$(CONFIG_UML_NET_DAEMON) += daemon.o 
+obj-$(CONFIG_UML_NET_L2TPV3) += uml_l2tpv3.o 
 obj-$(CONFIG_UML_NET_VDE) += vde.o
 obj-$(CONFIG_UML_NET_MCAST) += umcast.o
 obj-$(CONFIG_UML_NET_PCAP) += pcap.o
diff --git a/arch/um/drivers/uml_l2tpv3.h b/arch/um/drivers/uml_l2tpv3.h
new file mode 100644
index 0000000..5137bc7
--- /dev/null
+++ b/arch/um/drivers/uml_l2tpv3.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __UML_L2TPV3_H__
+#define __UML_L2TPV3_H__
+
+#include "net_user.h"
+
+
+#define NEW_MODE_IP_VERSION   1		  /* on for v6, off for v4 */
+#define NEW_MODE_UDP	      2		  /* on for udp, off for raw ip */
+#define NEW_MODE_COOKIE	      4		  /* cookie present */
+#define NEW_MODE_COOKIE_SIZE  8		  /* on for 64 bit */
+#define NEW_MODE_NO_COUNTER   16	  /* DT - no counter */
+
+/* legacy modes */
+
+/* mode 0 */
+
+#define LEGACY_UDP6_64_NO_COUNTER (NEW_MODE_IP_VERSION + NEW_MODE_UDP + NEW_MODE_COOKIE + NEW_MODE_COOKIE_SIZE + NEW_MODE_NO_COUNTER)
+
+#define LEGACY_MODE0 LEGACY_UDP6_64_NO_COUNTER
+
+/* mode 1 */
+
+#define LEGACY_IP6_64_NO_COUNTER (NEW_MODE_IP_VERSION + NEW_MODE_COOKIE + NEW_MODE_COOKIE_SIZE + NEW_MODE_NO_COUNTER)
+
+#define LEGACY_MODE1 LEGACY_IP6_64_NO_COUNTER
+
+/* mode 2 */
+
+#define LEGACY_UDP4_64_COUNTER (NEW_MODE_COOKIE + NEW_MODE_UDP + NEW_MODE_COOKIE_SIZE )
+
+#define LEGACY_MODE2 LEGACY_UDP4_64_COUNTER
+
+/* mode 3 */
+
+#define LEGACY_IP4_64_COUNTER (NEW_MODE_COOKIE + NEW_MODE_COOKIE_SIZE)
+
+#define LEGACY_MODE3 LEGACY_IP4_64_COUNTER
+
+
+#define L2TPV3_HEADER 16
+
+
+struct temphtonl {
+   uint32_t low; 
+   uint32_t high;
+};
+
+
+struct uml_l2tpv3_data {
+	void *remote_addr;
+	int  remote_addr_size;
+	char *remote_addr_string;
+	char *local_addr_string;
+	char *local_service;
+	char *remote_service;
+	char *local_session_string;
+	char *remote_session_string;
+	uint32_t local_session;
+	uint32_t remote_session;
+	char *rx_cookie_string;
+	char *tx_cookie_string;
+	uint64_t rx_cookie;
+	uint64_t tx_cookie;
+
+	/* this should be ifdef-ed to be used only in single packet */
+	uint8_t *network_buffer;
+	
+
+	int fd;
+	void *dev;
+
+	uint32_t uml_l2tpv3_flags;
+	uint32_t mode;
+	uint32_t new_mode; /* listening, sending, etc */
+	uint32_t counter;
+   
+	/*  Precomputed offsets */
+	 
+	uint32_t offset;   /* main offset == header offset */
+	uint32_t cookie_offset;
+	uint32_t counter_offset;
+	uint32_t session_offset;
+
+	/* high speed vector io data */
+    
+	void ** skb_recv_vector;
+	void ** skb_send_vector;
+	void * mmsg_recv_vector;
+	void * mmsg_send_vector;
+
+	uint32_t vector_len;
+	uint32_t recv_index;
+	uint32_t recv_enqueued;
+
+	/* normally same as offset, add size of 
+	 * struct ipv4 header in ipv4 raw - API stupiditities
+	 */
+	uint32_t header_size; 
+	void * send_queue_info;
+};  
+
+
+extern const struct net_user_info uml_l2tpv3_user_info;
+
+extern int uml_l2tpv3_user_sendmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_l2tpv3_data *pri);
+
+extern int uml_l2tpv3_user_recvmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_l2tpv3_data *pri);
+
+extern void l2tpv3_complete_init(void * dev_id, int max_depth);
+extern void l2tpv3_kern_destroy(struct uml_l2tpv3_data *pri);
+
+#define UML_L2TPV3_FLAG_TX_CHECKSUMS		0x00000001
+#define UML_L2TPV3_FLAG_RX_CHECKSUMS		0x00000002
+
+#endif
diff --git a/arch/um/drivers/uml_l2tpv3_kern.c b/arch/um/drivers/uml_l2tpv3_kern.c
new file mode 100644
index 0000000..228ff22
--- /dev/null
+++ b/arch/um/drivers/uml_l2tpv3_kern.c
@@ -0,0 +1,442 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include "linux/init.h"
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net_kern.h>
+#include <irq_kern.h>
+#include <irq_user.h>
+#include "uml_l2tpv3.h"
+
+#define DRIVER_NAME "uml-l2tpv3"
+
+struct uml_l2tpv3_init {
+	char *local_addr_string;
+	char *remote_addr_string;
+	char *local_service;
+	char *remote_service;
+	char *rx_cookie_string;
+	char *tx_cookie_string;
+	char *local_session_string;
+	char *remote_session_string;
+	char *mode_string;
+};
+
+static void uml_l2tpv3_get_drvinfo(struct net_device *dev,
+				struct ethtool_drvinfo *info)
+{
+	strcpy(info->driver, DRIVER_NAME);
+	strcpy(info->version, "42");
+}
+
+
+static const struct ethtool_ops uml_l2tpv3_ethtool_ops = 
+{
+	.get_drvinfo			= uml_l2tpv3_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+};
+
+static void uml_l2tpv3_init(struct net_device *dev, void *data)
+{
+	struct uml_net_private *pri;
+	struct uml_l2tpv3_data *dpri;
+	struct uml_l2tpv3_init *init = data;
+
+	pri = netdev_priv(dev);
+	dpri = (struct uml_l2tpv3_data *) pri->user;
+
+	/* 
+	 *	these are as is, we keep them for future reference
+	 *	and parse them in userspace
+	 */
+
+	dpri->local_addr_string = init->local_addr_string;
+	dpri->remote_addr_string = init->remote_addr_string;
+	dpri->local_service = init->local_service;
+	dpri->remote_service = init->remote_service;
+	dpri->rx_cookie_string = init->rx_cookie_string;
+	dpri->tx_cookie_string = init->tx_cookie_string;
+	dpri->local_session_string = init->local_session_string;
+	dpri->remote_session_string = init->remote_session_string;
+
+	/* the only ones we pre-parse */
+
+	if (init->mode_string != NULL) {
+		sscanf(init->mode_string, "%x", &dpri->new_mode);
+	} else {
+		dpri->new_mode = 0;
+		printk("warning: failed to parse l2tpv3 mode %s\n", init->mode_string);
+	}
+		
+	printk("l2tpv3 mode %x\n", dpri->new_mode);
+	dpri->fd = -1;
+	dpri->dev = dev;
+	printk("l2tpv3 backend - %s:%s<->%s:%s, rxcookie: %s, txcookie:%s, local_session: %s, peer_session: %s\n",  
+		dpri->local_addr_string, 
+		dpri->local_service, 
+		dpri->remote_addr_string, 
+		dpri->remote_service, 
+		dpri->rx_cookie_string, 
+		dpri->tx_cookie_string, 
+		dpri->local_session_string, 
+		dpri->remote_session_string
+	);
+	dpri->uml_l2tpv3_flags = 0; /* we have everything turned off initially */
+	SET_ETHTOOL_OPS(dev, &uml_l2tpv3_ethtool_ops);
+}
+
+static int uml_l2tpv3_verify_header(uint8_t * buffer, struct uml_l2tpv3_data *dpri )
+{
+	uint64_t *cookie64;
+	uint32_t *cookie32;
+	uint32_t *session_id;
+	
+
+	if ((!(dpri->new_mode & NEW_MODE_IP_VERSION)) && (!(dpri->new_mode & NEW_MODE_UDP))){
+		buffer += sizeof(struct iphdr) /* fix for ipv4 raw */;
+	} 
+	
+	session_id = (uint32_t *)(buffer + dpri->session_offset);
+	if (*session_id != dpri->remote_session) {
+		printk("Unknown Sesion id\n");
+		return 0; 
+	}
+
+	if (dpri->new_mode & NEW_MODE_COOKIE) {
+		if (dpri->new_mode & NEW_MODE_COOKIE_SIZE) {
+	  	/* 64 bit cookie */
+	  		cookie64 = (uint64_t *)(buffer + dpri->cookie_offset);
+			if (*cookie64 != dpri->rx_cookie) {
+		 		printk("unknown cookie id\n");
+		 		return 0; 
+	  		}
+	   	} else {
+	  		cookie32 = (uint32_t *)(buffer + dpri->cookie_offset);
+	  		if (*cookie32 != * (uint32_t *) &dpri->rx_cookie) {
+		 		printk("unknown cookie id\n");
+		 		return 0; 
+	  		}
+	   	}
+	}
+	return 1;
+}
+
+static struct sk_buff * uml_l2tpv3_multiread (struct uml_net_private * lp) {
+	struct uml_l2tpv3_data *dpri = (struct uml_l2tpv3_data *) &lp->user;
+	void ** skb_recv_vector = dpri->skb_recv_vector;
+	struct mmsghdr * mmsg_recv_vector = (struct mmsghdr *) dpri->mmsg_recv_vector;
+	struct sk_buff * result = NULL;
+	struct iovec * iov;
+	int ret;
+	
+	
+	/* Are we done processing the enqueued buffers */
+
+	if (dpri->recv_index >= dpri->recv_enqueued) {
+		ret = net_recvmmsg(
+			dpri->fd, mmsg_recv_vector, dpri->vector_len, 0,NULL);
+		if (ret >= 0) {
+			dpri->recv_enqueued = ret;
+		} else {
+			printk("Error in multi-packet receive %d\n", ret);
+			return NULL;
+		}
+		dpri->recv_index = 0;
+	}
+
+	/* check if we are done processing the enqueued buffers */
+
+	skb_recv_vector += dpri->recv_index;
+	mmsg_recv_vector += dpri->recv_index;
+	while (dpri->recv_index < dpri->recv_enqueued) {
+		dpri->recv_index ++;
+		iov = mmsg_recv_vector->msg_hdr.msg_iov;
+		if (
+			(iov) &&
+			(mmsg_recv_vector->msg_len > dpri->header_size) && 
+			(uml_l2tpv3_verify_header(iov->iov_base, dpri))
+		) {
+			if (!dpri->remote_addr) {
+				if (mmsg_recv_vector->msg_hdr.msg_name) {
+					dpri->remote_addr = 
+						mmsg_recv_vector->msg_hdr.msg_name;
+					dpri->remote_addr_size = 
+						mmsg_recv_vector->msg_hdr.msg_namelen;
+					mmsg_recv_vector->msg_hdr.msg_namelen =  
+						sizeof (struct sockaddr_storage);
+				}
+			}
+			result = (struct sk_buff *)(* skb_recv_vector);
+			if (result) {
+				skb_trim(result, mmsg_recv_vector->msg_len - dpri->header_size);
+
+				result->protocol = (*lp->protocol)(result);
+
+				/* replace the buffer we just (ab)used */
+
+				(* skb_recv_vector) = uml_net_build_skb(lp->dev);
+
+				add_skbuffs(mmsg_recv_vector, 
+						skb_recv_vector, 1, lp->max_packet, 1);
+				return result;
+			} else {
+				printk("encountered failed atomic allocation @%i, skipping to next\n", 
+                        dpri->recv_index);
+			}
+		} else {
+			if (mmsg_recv_vector->msg_hdr.msg_name) {
+				/* reset size */
+				mmsg_recv_vector->msg_hdr.msg_namelen =
+					sizeof (struct sockaddr_storage);
+			}
+			result = NULL;
+		}
+		skb_recv_vector ++;
+		mmsg_recv_vector ++;
+	}
+	return result;
+}
+
+static int uml_l2tpv3_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	int result;
+	struct uml_l2tpv3_data *dpri = (struct uml_l2tpv3_data *) &lp->user;
+	uint8_t  *buffer ;
+
+
+	int offset = dpri->offset;
+	
+	buffer = dpri->network_buffer;
+
+	if (!(dpri->new_mode & NEW_MODE_UDP) && !(dpri->new_mode & NEW_MODE_IP_VERSION))
+	{
+		/* IPv4 RAW mode: Account for the IP header that will be received */
+		offset += sizeof(struct iphdr);
+	}
+	 
+
+	result = uml_l2tpv3_user_recvmsg(
+			fd, 
+			buffer, offset,
+			skb->data, skb->dev->mtu + ETH_HEADER_OTHER,
+			dpri
+		);
+	if (result <= 0) {
+		return result;
+	} 
+	if (
+		!(dpri->new_mode & NEW_MODE_UDP) && 
+		!(dpri->new_mode & NEW_MODE_IP_VERSION)
+	) {
+	/* IPv4 RAW mode: Ignore the IP header */
+		buffer += sizeof(struct iphdr);
+	}
+
+	if ((result > offset) && (uml_l2tpv3_verify_header(buffer, dpri))) {
+		if ((dpri->uml_l2tpv3_flags & UML_L2TPV3_FLAG_RX_CHECKSUMS) != 0) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		}
+		return result - offset;
+	} else {
+		return 0;
+	}
+}
+
+static void uml_l2tpv3_form_header(uint8_t * buffer, struct uml_l2tpv3_data *pri) {
+	uint32_t *header;
+	uint32_t *session;
+	uint64_t *cookie64;
+	uint32_t *cookie32;
+	uint32_t *counter;
+	if (pri->new_mode & NEW_MODE_UDP) {
+		header = (uint32_t *) buffer;
+		* header = htonl(0x30000);
+	}
+	session = (uint32_t *) (buffer + pri->session_offset);
+	*session = pri->local_session;
+
+	if (pri->new_mode & NEW_MODE_COOKIE) {
+		if (pri->new_mode & NEW_MODE_COOKIE_SIZE) {
+		   cookie64 = (uint64_t *)(buffer + pri->cookie_offset);
+		   * cookie64 = pri->tx_cookie;
+		} else {
+		   cookie32 = (uint32_t *) (buffer + pri->cookie_offset);
+		   * cookie32 = * ((uint32_t *) &pri->tx_cookie);
+		}
+	}
+
+	if (!(pri->new_mode & NEW_MODE_NO_COUNTER)) {
+		counter = (uint32_t *)(buffer + pri->counter_offset);
+		* counter = htonl(++pri->counter);
+	}
+}
+
+void l2tpv3_complete_init(void * dev_id, int max_depth) {
+
+	struct net_device *dev = dev_id;
+	struct uml_net_private *lp = netdev_priv(dev);
+	struct uml_l2tpv3_data *pri = (struct uml_l2tpv3_data *) &lp->user;
+	struct mmsg_queue_info * queue_info ; 
+
+	queue_info =  
+		kmalloc(sizeof(struct mmsg_queue_info), GFP_KERNEL);
+	if (queue_info) {
+		queue_info->fd = pri->fd;
+		queue_info->mmsg_send_vector = pri->mmsg_send_vector;
+		queue_info->skb_send_vector = pri->skb_send_vector;
+		queue_info->head = 0;
+		queue_info->tail = 0;
+		queue_info->queue_depth = 0;
+		queue_info->max_depth = max_depth;
+		spin_lock_init(&queue_info->head_lock); 
+		spin_lock_init(&queue_info->tail_lock); 
+	}
+	pri->send_queue_info = queue_info;
+}
+
+void l2tpv3_kern_destroy(struct uml_l2tpv3_data *pri) {
+
+	int ret = -1;
+	struct mmsg_queue_info * queue_info = pri->send_queue_info;
+	/* flush queue */
+	do {
+		ret = uml_net_flush_mmsg_queue(queue_info, -1);
+	} while (ret != 0);
+	pri->send_queue_info = NULL;
+	kfree(queue_info);
+}
+
+static int uml_l2tpv3_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	struct uml_l2tpv3_data *pri = (struct uml_l2tpv3_data *) &lp->user;
+	int result, queue_depth;
+	struct mmsghdr * mmsg_send_vector;
+	struct iovec * iov;
+	struct mmsg_queue_info * send_queue_info; 
+	struct sk_buff * mmsg_clone;
+	void ** skb_send_vector;
+	
+	send_queue_info = (struct mmsg_queue_info *) pri->send_queue_info;
+
+	spin_lock(&send_queue_info->tail_lock);
+
+	spin_lock(&send_queue_info->head_lock);
+	queue_depth = send_queue_info->queue_depth;
+	spin_unlock(&send_queue_info->head_lock);
+	
+	if (pri->remote_addr && (queue_depth < send_queue_info->max_depth)) {
+		mmsg_clone = skb_clone(skb, GFP_ATOMIC);
+		if (mmsg_clone) {
+
+			skb_send_vector = pri->skb_send_vector;
+			skb_send_vector +=  send_queue_info->tail;
+			(* skb_send_vector) = mmsg_clone;
+
+			mmsg_send_vector = pri->mmsg_send_vector;
+			mmsg_send_vector += send_queue_info->tail;
+			mmsg_send_vector->msg_hdr.msg_name = pri->remote_addr;
+			mmsg_send_vector->msg_hdr.msg_namelen = pri->remote_addr_size;
+
+			iov = mmsg_send_vector->msg_hdr.msg_iov;
+
+			if (iov) {
+
+				uml_l2tpv3_form_header(iov->iov_base, pri);
+				
+				iov++;
+				
+				iov->iov_base = skb->data;
+				iov->iov_len = skb->len;
+
+				queue_depth = uml_net_advance_tail(send_queue_info, 1);
+			} else {
+				printk("no iov, cannot enqueue\n");
+			}
+		} else {
+			printk("cloning failed\n");
+		}
+	} 
+		
+	spin_unlock(&send_queue_info->tail_lock);
+
+	if (queue_depth > 0) {
+		result = uml_net_flush_mmsg_queue(
+			send_queue_info, queue_depth
+		); 
+	} 
+	return skb->len; /* not particularly correct */
+}
+
+static const struct net_kern_info uml_l2tpv3_kern_info = {
+	.options		= UML_NET_USE_SKB_READ,
+	.init			= uml_l2tpv3_init,
+	.protocol		= eth_protocol,
+	.read			= uml_l2tpv3_read,
+	.skb_read		= uml_l2tpv3_multiread,
+	.write			= uml_l2tpv3_write,
+};
+
+static int uml_l2tpv3_setup(char *str, char **mac_out, void *data)
+{
+	struct uml_l2tpv3_init *init = data;
+	char *remain;
+
+	*init = (
+		(struct uml_l2tpv3_init)
+		   { 
+			 .local_addr_string = "::1",
+			 .local_service = "1701",
+			 .remote_service = "1702",
+			 .rx_cookie_string = "0xdeadbeefdeadbeef",
+			 .tx_cookie_string = "0xdeadbeefdeadbeef",
+			 .local_session_string = "0xFFFFFFFF",
+			 .remote_session_string = "0xFFFFFFFF",
+			 .mode_string = "0",	
+		   }
+			);
+
+	remain = split_if_spec(str, 
+			mac_out, 
+			&init->local_addr_string, 
+			&init->local_service, 
+			&init->remote_addr_string, 
+			&init->remote_service, 
+			&init->rx_cookie_string, 
+			&init->tx_cookie_string, 
+			&init->local_session_string, 
+			&init->remote_session_string, 
+			&init->mode_string, 
+			NULL
+		);
+	if (remain != NULL)
+		printk(KERN_WARNING " Strange interface spec \n");
+	return 1;
+}
+
+static struct transport uml_l2tpv3_transport = {
+	.list 		= LIST_HEAD_INIT(uml_l2tpv3_transport.list),
+	.name 		= "l2tpv3",
+	.setup  	= uml_l2tpv3_setup,
+	.user 		= &uml_l2tpv3_user_info,
+	.kern 		= &uml_l2tpv3_kern_info,
+	.private_size 	= sizeof(struct uml_l2tpv3_data),
+	.setup_size 	= sizeof(struct uml_l2tpv3_init),
+};
+
+static int register_uml_l2tpv3(void)
+{
+	register_transport(&uml_l2tpv3_transport);
+	return 0;
+}
+
+late_initcall(register_uml_l2tpv3);
diff --git a/arch/um/drivers/uml_l2tpv3_user.c b/arch/um/drivers/uml_l2tpv3_user.c
new file mode 100644
index 0000000..5a68151
--- /dev/null
+++ b/arch/um/drivers/uml_l2tpv3_user.c
@@ -0,0 +1,420 @@
+/*
+ * Copyright (C) 2012-2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <netinet/ether.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <arpa/inet.h>
+
+#include <net_user.h>
+#include <os.h>
+#include <um_malloc.h>
+#include <user.h>
+#include "uml_l2tpv3.h"
+
+#define VECTOR_SIZE 128
+
+int l2tpv3_parse_cookie32(char *src , void * dst) 
+{
+	if (
+		(src == NULL) || 
+		(sscanf(src, "%x", (unsigned int *) dst) != 1)
+	) { 
+		printk(UM_KERN_ERR "cannot parse cookie!!!: %s\n", src);
+		return -1;
+	} 
+	* (( uint32_t *) dst) = htonl(* ((uint32_t* )dst));
+	return 0;
+}
+
+int l2tpv3_parse_cookie64(char *src , void * dst)
+{
+	struct temphtonl temph;
+	uint32_t temp;
+	const int num = 42;
+	if (
+		(src == NULL) || 
+		(sscanf(src, "%llx", (long unsigned int *) &temph) != 1)
+	) { 
+		printk(UM_KERN_ERR "cannot parse cookie!!!: %s\n", src);
+		return -1;
+	} 
+	if(*(char *)&num == 42) {
+		// why oh why there is no htonll
+		temp = htonl(temph.high);
+		temph.high = htonl(temph.low);
+		temph.low = temp;
+	} else {
+		temph.low = htonl(temph.low); 
+		temph.high = htonl(temph.high);
+	}	
+	memcpy(dst, &temph, sizeof (uint64_t));
+	return 0;
+}
+
+static void uml_l2tpv3_remove(void *data)
+{
+	struct uml_l2tpv3_data *pri = data;
+
+	l2tpv3_kern_destroy(pri); 
+	if (pri->fd > 0) {
+		close(pri->fd);
+	}
+	pri->fd = -1;
+	if (pri->skb_send_vector) {
+		/* this one should be empty - we flushed it so we just free it */
+		kfree(pri->skb_send_vector);
+		pri->skb_send_vector = NULL;
+	}
+	if (pri->mmsg_send_vector) {
+		destroy_mmsg_vector(pri->mmsg_send_vector, VECTOR_SIZE, 1);
+		pri->mmsg_send_vector = NULL;
+	}
+	if (pri->skb_recv_vector) {
+		destroy_skb_vector(pri->skb_recv_vector, VECTOR_SIZE);
+		pri->skb_recv_vector = NULL; 
+	}
+	if (pri->mmsg_recv_vector) {
+		destroy_mmsg_vector(pri->mmsg_recv_vector, VECTOR_SIZE, 1);
+		pri->mmsg_recv_vector = NULL; 
+	}
+	if (pri->network_buffer) {
+		kfree(pri->network_buffer);
+		pri->network_buffer = NULL; 
+	}
+}
+
+static int uml_l2tpv3_user_init(void *data, void *dev)
+{
+	struct uml_l2tpv3_data *pri = data;
+	int fd;
+	int sock_family, sock_type, sock_proto;
+	int ret;
+	struct addrinfo hints;
+	struct addrinfo *result;
+	char service[NI_MAXSERV];
+	struct mmsghdr * mmsghdr;
+
+	pri->offset = 4;
+	pri->session_offset = 0;
+	pri->cookie_offset = 4;
+	pri->counter_offset = 4;
+
+	pri->fd = -1;
+
+	
+
+	/* used only in single packet modes, should be obsoleted */
+	
+	pri->network_buffer = uml_kmalloc(pri->header_size, UM_GFP_KERNEL); 
+	if (!pri->network_buffer) {
+		printk("uml_l2tpv3_user_init: could not allocate buffer\n");
+		return -1;
+	}
+
+
+	/* basic variable parsing */
+	 
+	pri->local_session = 0;
+	if (l2tpv3_parse_cookie32(pri->local_session_string,&pri->local_session) !=0) {
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	pri->remote_session = 0;
+	if (l2tpv3_parse_cookie32(pri->remote_session_string,&pri->remote_session) !=0) {
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	if (pri->new_mode & NEW_MODE_COOKIE) {
+		if (pri->new_mode & NEW_MODE_COOKIE_SIZE) {
+		/* 64 bit cookie */
+			pri->offset += 8;
+			pri->counter_offset += 8;
+			if (l2tpv3_parse_cookie64(pri->tx_cookie_string,&pri->tx_cookie) !=0) {
+				uml_l2tpv3_remove(pri);
+				return -1;
+			}
+			if (l2tpv3_parse_cookie64(pri->rx_cookie_string,&pri->rx_cookie) !=0) {
+				uml_l2tpv3_remove(pri);
+				return -1;
+			}
+		} else {
+		/* 32 bit cookie */
+			pri->offset += 4;
+			pri->counter_offset +=4;
+			pri->tx_cookie = 0;
+			if (l2tpv3_parse_cookie32(pri->tx_cookie_string,&pri->tx_cookie) !=0) {
+				uml_l2tpv3_remove(pri);
+				return -1;
+			}
+			pri->rx_cookie = 0;
+			if (l2tpv3_parse_cookie32(pri->rx_cookie_string,&pri->rx_cookie) !=0) {
+				uml_l2tpv3_remove(pri);
+				return -1;
+			}
+		}
+	}
+	if (pri->remote_addr_string) {			
+	/* we now allocate it only if it we are not "listening" */
+		pri->remote_addr = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_KERNEL);
+	} else {
+		pri->remote_addr = NULL;
+	}
+
+	if (pri->new_mode & NEW_MODE_IP_VERSION) {
+	 /* IPv6 */
+		sock_family = AF_INET6;
+	} else {
+	 /* IPv4 */
+		sock_family = AF_INET;
+	}
+	if (pri->new_mode & NEW_MODE_UDP) {
+		printk(UM_KERN_ERR "uml_l2tpv3_user_init: preparing udp socket for mode %x\n ", pri->new_mode);
+		sock_type = SOCK_DGRAM;
+		sock_proto = 0;
+		/* space for header. In UDP mode, the 
+		* egress packet also includes the 
+		* 'Ver' and 'Reserved' fields.
+		*/
+
+		pri->offset += 4;
+		pri->counter_offset += 4;
+		pri->session_offset += 4;
+		pri->cookie_offset += 4;
+	} else {
+		printk(UM_KERN_ERR "uml_l2tpv3_user_init: preparing raw socket for mode %x\n ", pri->new_mode);
+		sock_type = SOCK_RAW;
+		sock_proto = 0x73;
+	}
+
+	if (!(pri->new_mode & NEW_MODE_NO_COUNTER)) {
+		pri->offset += 4;
+	}
+
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_flags = AI_PASSIVE;
+	hints.ai_family = sock_family;
+	hints.ai_socktype = sock_type;
+	hints.ai_protocol = sock_proto;
+
+	if ((fd = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol)) == -1) {
+		fd = -errno;
+		printk(UM_KERN_ERR "uml_l2tpv3_user_init: socket creation failed, "
+			"errno = %d\n", -fd);
+		uml_l2tpv3_remove(pri);
+		return fd;
+	} else {
+		pri->fd = fd;
+	}
+
+	/* Get the details of the local endpoint, and bind it. */
+	memset(service, '\0', NI_MAXSERV);
+	if (pri->new_mode & NEW_MODE_UDP) {
+		strncpy(service, pri->local_service, NI_MAXSERV - 1);
+		service[NI_MAXSERV - 1] = '\0';
+	}
+
+	ret = getaddrinfo(pri->local_addr_string, service, &hints, &result);
+
+	if ((ret != 0) || (result == NULL)) {
+		printk(UM_KERN_ERR "uml_l2tpv3_user_init: Unable to parse the local endpoint: %d\n", ret);
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	if (bind(fd, (struct sockaddr *)result->ai_addr, result->ai_addrlen)) {
+		printk("uml_l2tpv3_user_init:	could not bind socket: %d\n", errno);
+		freeaddrinfo(result);
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	printk("uml_l2tpv3_user_init: socket bound\n");
+	freeaddrinfo(result);
+
+	if (pri->remote_addr != NULL) {
+	/* Get the details of the remote endpoint. */
+		memset(service, '\0', NI_MAXSERV);
+		if (pri->new_mode & NEW_MODE_UDP) {
+			strncpy(service, pri->remote_service, NI_MAXSERV - 1);
+			service[NI_MAXSERV - 1] = '\0';
+		}
+		memset(&hints, 0, sizeof(hints));
+		hints.ai_flags = AI_PASSIVE;
+		hints.ai_family = sock_family;
+		hints.ai_socktype = sock_type;
+		hints.ai_protocol = sock_proto;
+		ret = getaddrinfo(pri->remote_addr_string, service, &hints, &result);
+		if ((ret != 0) || (result == NULL)) {
+			printk(UM_KERN_ERR "uml_l2tpv3_user_init: Unable to parse the remote endpoint: %d\n", ret);
+			uml_l2tpv3_remove(pri);
+			return -1;
+		}
+		memset(pri->remote_addr, '\0' , sizeof(struct sockaddr_storage));
+		memcpy(pri->remote_addr, result->ai_addr, result->ai_addrlen);
+		pri->remote_addr_size = result->ai_addrlen;
+		freeaddrinfo(result);
+	 }
+
+	/* vector IO init */
+
+	int queue_size = VECTOR_SIZE * 4096;
+	/* we do not care about the result, this is a tuning, not critical */
+	setsockopt(socket, SOL_SOCKET, SO_RCVBUF, &queue_size, sizeof(queue_size)); 
+
+
+	pri->vector_len = VECTOR_SIZE;
+	pri->recv_index = 0;
+	pri->recv_enqueued = 0;
+	pri->header_size = pri->offset /* fix for ipv4 raw */;
+
+	if ((!(pri->new_mode & NEW_MODE_IP_VERSION)) && (!(pri->new_mode & NEW_MODE_UDP))){
+		 pri->header_size += sizeof(struct iphdr) /* fix for ipv4 raw */;
+	}
+
+	pri->skb_recv_vector = build_skbuf_vector(VECTOR_SIZE, dev);
+	pri->mmsg_recv_vector = build_mmsg_vector(VECTOR_SIZE, 2);
+	add_header_buffers(pri->mmsg_recv_vector, VECTOR_SIZE, pri->header_size); 
+	add_skbuffs(
+		 pri->mmsg_recv_vector, 
+		 pri->skb_recv_vector, 
+		 VECTOR_SIZE, ETH_MAX_PACKET + ETH_HEADER_OTHER, 
+		 1
+	);
+
+	pri->skb_send_vector = uml_kmalloc(VECTOR_SIZE * sizeof(void *), UM_GFP_KERNEL);
+	if (pri->skb_send_vector) {
+		memset(pri->skb_send_vector, 0, sizeof(void *) * VECTOR_SIZE);
+	} else {
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	pri->mmsg_send_vector = build_mmsg_vector(VECTOR_SIZE, 2);
+	if (! pri->mmsg_send_vector) {
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	/* note - we do not need to do the ipv4 header size correction here */
+	add_header_buffers(pri->mmsg_send_vector, VECTOR_SIZE, pri->offset); 
+
+	if (!pri->remote_addr) {
+		mmsghdr = (struct mmsghdr *) pri->mmsg_recv_vector;
+		mmsghdr->msg_hdr.msg_name = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_KERNEL);
+		if (mmsghdr->msg_hdr.msg_name) {
+			mmsghdr->msg_hdr.msg_namelen = sizeof(struct sockaddr_storage);
+		} else {
+			printk("uml_l2tpv3_user_init: Failed to allocate remote address name\n");
+		}
+	}
+	pri->dev = dev;
+
+	/* init kernel side structures that are opaque to userspace - 
+	 *  locks, timers, state machine, etc 
+	 */ 
+ 
+
+	l2tpv3_complete_init(dev, VECTOR_SIZE); /* we really need error checking here */
+
+	if (!pri->send_queue_info) {
+		printk("uml_l2tpv3:queue control allocation failed\n");
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+
+	if (pri->fd < 0) {
+		return pri->fd;
+	}
+	return 0;
+}
+
+static int uml_l2tpv3_open(void *data)
+{
+	struct uml_l2tpv3_data *pri = data;
+	return pri->fd;
+}
+
+
+int uml_l2tpv3_user_sendmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_l2tpv3_data *pri)
+{
+	struct msghdr message;
+	struct iovec vec[2];
+
+	vec[0].iov_base = header;
+	vec[0].iov_len = headerlen;
+	vec[1].iov_base = data;
+	vec[1].iov_len = datalen;
+
+	message.msg_name = pri->remote_addr;
+	message.msg_namelen = pri->remote_addr_size;
+	message.msg_iov = (struct iovec *) &vec;
+	message.msg_iovlen = 2;
+	message.msg_control = NULL;
+	message.msg_controllen = 0;
+	message.msg_flags = MSG_DONTWAIT;
+
+
+	if (pri->remote_addr != NULL) {
+		return net_sendmessage(fd, &message, MSG_DONTWAIT);
+	} else {
+		return -1;
+	}
+}
+int uml_l2tpv3_user_recvmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_l2tpv3_data *pri)
+{
+	struct msghdr message;
+	struct iovec vec[2];
+
+	vec[0].iov_base = header;
+	vec[0].iov_len = headerlen;
+	vec[1].iov_base = data;
+	vec[1].iov_len = datalen;
+
+	if (!pri->remote_addr) {
+		pri->remote_addr = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_ATOMIC);
+		if (pri->remote_addr) {
+			message.msg_name = pri->remote_addr;
+			message.msg_namelen = pri->remote_addr_size;
+		} else {
+			message.msg_name = NULL;
+			message.msg_namelen = 0;
+		}
+	} else {
+		message.msg_name = NULL;
+		message.msg_namelen = 0;
+	}
+	 
+	message.msg_iov = (struct iovec *) &vec;
+	message.msg_iovlen = 2;
+	message.msg_control = NULL;
+	message.msg_controllen = 0;
+	message.msg_flags = MSG_DONTWAIT;
+
+	return net_recvmessage(fd, &message, MSG_DONTWAIT);
+}
+const struct net_user_info uml_l2tpv3_user_info = {
+	.init		= uml_l2tpv3_user_init,
+	.open		= uml_l2tpv3_open,
+	.close	 	= NULL,
+	.remove	 	= uml_l2tpv3_remove,
+	.add_address	= NULL,
+	.delete_address = NULL,
+	.mtu		= ETH_MAX_PACKET,
+	.max_packet	= ETH_MAX_PACKET + ETH_HEADER_OTHER + L2TPV3_HEADER,
+};
-- 
1.7.10.4


------------------------------------------------------------------------------
Slashdot TV.  
Video for Nerds.  Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [uml-devel] [PATCH 05/10] GRE transport for UML
  2014-08-29  7:05 [uml-devel] [PATCH 01/10] Epoll based interrupt controller anton.ivanov
                   ` (2 preceding siblings ...)
  2014-08-29  7:05 ` [uml-devel] [PATCH 04/10] L2TPv3 Transport Driver for UML anton.ivanov
@ 2014-08-29  7:05 ` anton.ivanov
  2014-08-29  7:05 ` [uml-devel] [PATCH 06/10] RAW Ethernet " anton.ivanov
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: anton.ivanov @ 2014-08-29  7:05 UTC (permalink / raw)
  To: user-mode-linux-devel

From: Anton Ivanov <antivano@cisco.com>

This transport allows a UML to connect to another UML local
or remote, the Linux host or any other network device running
the industry standard Ethernet over GRE protocol. The transport
supports all features of RFC 2784.

The transport supports a common set of features with the kernel
implementation. Checksum offload is supported on RX, TODO on TX.

Additionally, the transport supports the so called "soft"
termination where it can listen for an incoming connection
which does not require the remote endpoint to be specified
at configuration time.

Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
 arch/um/Kconfig.net            |   11 +
 arch/um/drivers/Makefile       |    2 +
 arch/um/drivers/uml_gre.h      |   85 ++++++++
 arch/um/drivers/uml_gre_kern.c |  446 ++++++++++++++++++++++++++++++++++++++++
 arch/um/drivers/uml_gre_user.c |  355 ++++++++++++++++++++++++++++++++
 5 files changed, 899 insertions(+)
 create mode 100644 arch/um/drivers/uml_gre.h
 create mode 100644 arch/um/drivers/uml_gre_kern.c
 create mode 100644 arch/um/drivers/uml_gre_user.c

diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net
index 9a98aa5..7c8ba68 100644
--- a/arch/um/Kconfig.net
+++ b/arch/um/Kconfig.net
@@ -94,6 +94,17 @@ config UML_NET_L2TPV3
         the industry standard Ethernet over L2TPv3 protocol as described in
         the applicable RFCs
 
+config UML_NET_GRE
+	bool "GRE transport"
+	depends on UML_NET
+	help
+        This User-Mode Linux network transport allows one or more running
+        UMLs on single or multiple hosts to communicate with each other,
+        the host as well as other remote or local network devices supporting
+        the industry standard Ethernet over GRE protocol as described in
+        the applicable RFCs. The driver supports Soft GRE (wait for connect)
+        as used in Cable systems, etc.
+
 config UML_NET_DAEMON
 	bool "Daemon transport"
 	depends on UML_NET
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index f54c279..66127ee 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -10,6 +10,7 @@ slip-objs := slip_kern.o slip_user.o
 slirp-objs := slirp_kern.o slirp_user.o
 daemon-objs := daemon_kern.o daemon_user.o
 uml_l2tpv3-objs := uml_l2tpv3_kern.o uml_l2tpv3_user.o
+uml_gre-objs := uml_gre_kern.o uml_gre_user.o
 umcast-objs := umcast_kern.o umcast_user.o
 net-objs := net_kern.o net_user.o net_extra_user.o net_extra_kern.o
 mconsole-objs := mconsole_kern.o mconsole_user.o
@@ -45,6 +46,7 @@ obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o
 obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o
 obj-$(CONFIG_UML_NET_DAEMON) += daemon.o 
 obj-$(CONFIG_UML_NET_L2TPV3) += uml_l2tpv3.o 
+obj-$(CONFIG_UML_NET_GRE) += uml_gre.o 
 obj-$(CONFIG_UML_NET_VDE) += vde.o
 obj-$(CONFIG_UML_NET_MCAST) += umcast.o
 obj-$(CONFIG_UML_NET_PCAP) += pcap.o
diff --git a/arch/um/drivers/uml_gre.h b/arch/um/drivers/uml_gre.h
new file mode 100644
index 0000000..353306a
--- /dev/null
+++ b/arch/um/drivers/uml_gre.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __UML_GRE_H__
+#define __UML_GRE_H__
+
+#include "net_user.h"
+
+
+#define GRE_MODE_CHECKSUM     8 	          /* checksum - todo*/
+#define GRE_MODE_RESERVED     4 		  /* unused */
+#define GRE_MODE_KEY	      2		  /* KEY present */
+#define GRE_MODE_SEQUENCE     1	  /* no sequence */
+
+#define GRE_MODE_IP_VERSION   16	  /* on for v6, off for v4 */
+
+
+/* legacy modes */
+
+
+#define MAX_GRE_HEADER 16
+
+
+struct uml_gre_data {
+        void *remote_addr;
+        int  remote_addr_size;
+        char *remote_addr_string;
+        char *local_addr_string;
+        char *rx_key_string;
+        char *tx_key_string;
+        uint32_t rx_key;
+        uint32_t tx_key;
+        uint8_t *network_buffer;
+	int fd;
+	void *dev;
+        uint32_t mode; /* listening, sending, etc */
+        uint32_t sequence;
+   
+	/*  Precomputed offsets */
+	 
+        uint32_t offset;   /* main offset == header offset */
+        uint32_t protocol_offset;
+        uint32_t checksum_offset;
+        uint32_t key_offset;
+        uint32_t sequence_offset;
+
+	void ** skb_recv_vector;
+	void * mmsg_recv_vector;
+
+	void ** skb_send_vector;
+	void * mmsg_send_vector;
+
+	uint32_t vector_len;
+	uint32_t recv_index;
+	uint32_t recv_enqueued;
+	/* normally same as offset, add size of struct ipv4 header in ipv4 raw - API stupiditities */
+	uint32_t header_size; 
+
+	void * send_queue_info;
+
+};
+
+struct gre_minimal_header {
+   uint16_t header;
+   uint16_t arptype;
+};
+
+
+extern const struct net_user_info uml_gre_user_info;
+
+
+extern int uml_gre_user_sendmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_gre_data *pri);
+
+extern int uml_gre_user_recvmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_gre_data *pri);
+
+
+extern void gre_complete_init(void * dev_id, int max_depth);
+extern void gre_kern_destroy(struct uml_gre_data *pri);
+
+
+
+#endif
diff --git a/arch/um/drivers/uml_gre_kern.c b/arch/um/drivers/uml_gre_kern.c
new file mode 100644
index 0000000..5956db7
--- /dev/null
+++ b/arch/um/drivers/uml_gre_kern.c
@@ -0,0 +1,446 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include "linux/init.h"
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/ip.h>
+#include "net_kern.h"
+#include "uml_gre.h"
+
+#define DRIVER_NAME "uml-gre"
+
+#define GRE_IRB htons(0x6558)
+#define ETHER_HEADER_SIZE 14 
+
+struct uml_gre_init {
+	char *local_addr_string;
+	char *remote_addr_string;
+	char *rx_key_string;
+	char *tx_key_string;
+	char *mode_string;
+};
+
+static void uml_gre_get_drvinfo(struct net_device *dev,
+				struct ethtool_drvinfo *info)
+{
+	strcpy(info->driver, DRIVER_NAME);
+	strcpy(info->version, "42");
+}
+
+
+
+static const struct ethtool_ops uml_gre_ethtool_ops = 
+{
+	.get_drvinfo			= uml_gre_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+};
+
+
+
+static void uml_gre_init(struct net_device *dev, void *data)
+{
+	struct uml_net_private *pri;
+	struct uml_gre_data *dpri;
+	struct uml_gre_init *init = data;
+
+	pri = netdev_priv(dev);
+	dpri = (struct uml_gre_data *) pri->user;
+
+		/* 
+		these are as is, we keep them for future reference
+		and parse them in userspace
+
+	*/
+
+	dpri->local_addr_string = init->local_addr_string;
+	dpri->remote_addr_string = init->remote_addr_string;
+	dpri->rx_key_string = init->rx_key_string;
+	dpri->tx_key_string = init->tx_key_string;
+
+	if (init->mode_string != NULL) {
+	   sscanf(init->mode_string, "%x", &dpri->mode);
+	} else {
+	   dpri->mode = 0;
+	}
+	dpri->fd = -1;
+	dpri->dev = dev;
+	printk("gre backend - %s<->%s, rx_key: %s tx_key: %s, mode %i\n",  
+		dpri->local_addr_string, 
+		dpri->remote_addr_string, 
+		dpri->rx_key_string, 
+		dpri->tx_key_string,
+		dpri->mode
+		);
+	SET_ETHTOOL_OPS(dev, &uml_gre_ethtool_ops);
+}
+
+static int uml_gre_verify_header(uint8_t *header_buffer, 
+		struct sk_buff *skb, 
+		struct uml_gre_data *dpri)
+{
+	struct gre_minimal_header * header;
+	uint16_t old_checksum;
+       	uint32_t data_sum;
+       	uint32_t and_ether_sum;
+
+	/* this is never called with a NULL SKB, the SKB must be trimmed
+	   to correct size prior to calling */
+
+	if (!(dpri->mode & GRE_MODE_IP_VERSION)) {
+		header_buffer += sizeof(struct iphdr) /* fix for ipv4 raw */;
+	} 
+	
+	header = (struct gre_minimal_header *) header_buffer;
+	      
+      	if (
+		(header->header == htons((dpri->mode & 0xF) << 12)) && 
+		(header->arptype == GRE_IRB)
+	   ) {
+	 	/* header bits and type match, check key if present */
+		if (dpri->mode &  GRE_MODE_KEY) {
+			if (*((uint32_t *)(header_buffer + dpri->key_offset)) != dpri->rx_key) {
+				/* key mismatch, drop frame */ 
+				skb->dev->stats.rx_dropped++;
+				return 0;
+			}
+		}
+		/* 
+		 We compute the checksum if there is GRE checksum 
+		 and supply it to the kernel as "checksum offload" in a
+		 CHECKSUM_COMPLETE form so it can be used for any protocol 
+		 */
+
+		if (dpri->mode & GRE_MODE_CHECKSUM) {
+			old_checksum = 	* ((uint16_t *) (header_buffer + dpri->checksum_offset));
+			* ((uint32_t *) (header_buffer + dpri->checksum_offset)) = 0;
+
+			/* this will break with VLAN tags */
+
+			data_sum = csum_partial(skb->data + ETHER_HEADER_SIZE, skb->len - ETHER_HEADER_SIZE, 0); 
+			and_ether_sum = csum_partial(skb->data, ETHER_HEADER_SIZE, data_sum);
+
+			if (old_checksum !=  csum_fold(csum_partial(header_buffer, dpri->offset, and_ether_sum))) {
+				skb->dev->stats.rx_dropped++;
+				return 0;
+			} else {
+				skb->csum=data_sum;
+				skb->ip_summed = CHECKSUM_COMPLETE;
+			}
+		}
+		return 1;
+	} else {
+		skb->dev->stats.rx_dropped++;
+	}
+	return 0;
+}
+
+static struct sk_buff * uml_gre_multiread (struct uml_net_private * lp) {
+	struct uml_gre_data *dpri = (struct uml_gre_data *) &lp->user;
+	void ** skb_recv_vector = dpri->skb_recv_vector;
+	struct mmsghdr * mmsg_recv_vector = (struct mmsghdr *) dpri->mmsg_recv_vector;
+	struct sk_buff * result;
+	struct iovec * iov;
+	int ret;
+	
+	
+	/* Are we done processing the enqueued buffers */
+
+
+	if (dpri->recv_index >= dpri->recv_enqueued) {
+		ret = net_recvmmsg(
+			dpri->fd, mmsg_recv_vector, dpri->vector_len, 0,NULL);
+		if (ret >= 0) {
+			dpri->recv_enqueued = ret;
+		} else {
+			printk("Error in multi-packet receive %d\n", ret);
+			return NULL;
+		}
+		dpri->recv_index = 0;
+	}
+
+	/* check if we are done processing the enqueued buffers */
+
+	skb_recv_vector += dpri->recv_index;
+	mmsg_recv_vector += dpri->recv_index;
+	while (dpri->recv_index < dpri->recv_enqueued) {
+		dpri->recv_index ++;
+		iov = mmsg_recv_vector->msg_hdr.msg_iov;
+		if (
+			(iov) &&
+			(mmsg_recv_vector->msg_len > dpri->header_size) && 
+			(uml_gre_verify_header(iov->iov_base, result, dpri))
+		) {
+			if (!dpri->remote_addr) {
+				if (mmsg_recv_vector->msg_hdr.msg_name) {
+					dpri->remote_addr = mmsg_recv_vector->msg_hdr.msg_name;
+					dpri->remote_addr_size = 
+						mmsg_recv_vector->msg_hdr.msg_namelen;
+					mmsg_recv_vector->msg_hdr.msg_namelen =  sizeof (struct sockaddr_storage);
+				}
+			}
+			result = (struct sk_buff *)(* skb_recv_vector);
+			if (result) {
+				skb_trim(result, mmsg_recv_vector->msg_len - dpri->header_size);
+				result->protocol = (*lp->protocol)(result);
+				/* replace the buffer we just (ab)used */
+				(* skb_recv_vector) = uml_net_build_skb(lp->dev);
+				add_skbuffs(mmsg_recv_vector, skb_recv_vector, 1, lp->max_packet, 1);
+				return result;
+			} else {
+				printk("encountered failed atomic allocation @%i, skipping to next\n", dpri->recv_index);
+			}
+		} else {
+			if (mmsg_recv_vector->msg_hdr.msg_name) {
+				/* reset size */
+				mmsg_recv_vector->msg_hdr.msg_namelen =
+					sizeof (struct sockaddr_storage);
+			}
+			result = NULL;
+		}
+		skb_recv_vector ++;
+		mmsg_recv_vector ++;
+	}
+	return result;
+}
+
+static int uml_gre_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	int result;
+	struct uml_gre_data *dpri = (struct uml_gre_data *) &lp->user;
+	uint8_t  *buffer ;
+
+
+	int offset = dpri->offset;
+	
+	buffer = dpri->network_buffer;
+
+	if (dpri->mode & GRE_MODE_IP_VERSION)
+	{
+		/* IPv4 RAW mode: Account for the IP header that will be received */
+		offset += sizeof(struct iphdr);
+	}
+
+	result = uml_gre_user_recvmsg(
+			fd, 
+			buffer, offset,
+			skb->data, skb->dev->mtu + ETH_HEADER_OTHER,
+			dpri
+		);
+	if (result <= 0) {
+		return result;
+	} 
+	if (!(dpri->mode & GRE_MODE_IP_VERSION)) {
+	/* IPv4 RAW mode: Ignore the IP header */
+		buffer += sizeof(struct iphdr);
+	}
+
+	if ((result > offset) && (uml_gre_verify_header(buffer, skb, dpri))) {
+		return result - offset;
+	} else {
+		return 0;
+	}
+}
+
+static void uml_gre_form_header(uint8_t * header_buffer, 
+		struct sk_buff* skb, 
+		struct uml_gre_data *pri) 
+{
+	struct gre_minimal_header *header;
+
+	__wsum partial_sum;
+
+	if (!header_buffer) {
+		return;
+	}	
+
+	header = (struct gre_minimal_header *) header_buffer;
+
+	header->header = htons((pri->mode & 0xF)<<12); 
+	header->arptype = GRE_IRB; 
+
+	if (pri->mode & GRE_MODE_SEQUENCE) {
+		* ((uint32_t *)(header_buffer + pri->sequence_offset)) = htonl(++pri->sequence);
+	}
+
+	if (pri->mode & GRE_MODE_KEY) {
+		* ((uint32_t *)(header_buffer + pri->key_offset)) = pri->tx_key; /* we will keep 'em htonled */
+	}
+
+	/* TODO: The methodology here should be:
+	   	1. Report the driver as NETIF_F_HW_CSUM
+		2. We will get a start csum and an end csum and where to put it
+		3. Compute the csum, stash it
+		4. Write where we are told
+		5. Determine what else do we need to csum on either side of the HW_CSUM instructions
+		6. Adjust for the fact that we may have modified the packet as part of csum computation
+		7. Store the newly computed gre csum
+		
+		In the meantime we are just doing brute force on xmit
+	*/
+
+	if (pri->mode & GRE_MODE_CHECKSUM) {
+		* ((uint32_t *) (header_buffer + pri->checksum_offset)) = 0;
+		partial_sum = csum_partial(skb->data,skb->len, 0);
+ 		partial_sum = csum_partial(header_buffer, pri->offset, partial_sum);
+		* ((uint16_t *) (header_buffer + pri->checksum_offset)) 
+			=  csum_fold(partial_sum);
+	}
+}
+
+
+void gre_complete_init(void * dev_id, int max_depth) {
+
+	struct net_device *dev = dev_id;
+	struct uml_net_private *lp = netdev_priv(dev);
+	struct uml_gre_data *pri = (struct uml_gre_data *) &lp->user;
+	struct mmsg_queue_info * queue_info ; 
+
+	queue_info =  
+		kmalloc(sizeof(struct mmsg_queue_info), GFP_KERNEL);
+	if (queue_info) {
+		queue_info->fd = pri->fd;
+		queue_info->mmsg_send_vector = pri->mmsg_send_vector;
+		queue_info->skb_send_vector = pri->skb_send_vector;
+		queue_info->head = 0;
+		queue_info->tail = 0;
+		queue_info->queue_depth = 0;
+		queue_info->max_depth = max_depth;
+		spin_lock_init(&queue_info->head_lock); 
+		spin_lock_init(&queue_info->tail_lock); 
+	}
+	pri->send_queue_info = queue_info;
+}
+
+void gre_kern_destroy(struct uml_gre_data *pri) {
+
+	int ret = -1;
+	struct mmsg_queue_info * queue_info = pri->send_queue_info;
+	/* flush queue */
+	do {
+		ret = uml_net_flush_mmsg_queue(queue_info, -1);
+	} while (ret != 0);
+	pri->send_queue_info = NULL;
+	kfree(queue_info);
+}
+
+static int uml_gre_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	struct uml_gre_data *pri = (struct uml_gre_data *) &lp->user;
+	int result, queue_depth;
+	struct mmsghdr * mmsg_send_vector;
+	struct iovec * iov;
+	struct mmsg_queue_info * send_queue_info; 
+	struct sk_buff * mmsg_clone;
+	void ** skb_send_vector;
+	
+	send_queue_info = (struct mmsg_queue_info *) pri->send_queue_info;
+
+	spin_lock(&send_queue_info->tail_lock);
+
+	spin_lock(&send_queue_info->head_lock);
+	queue_depth = send_queue_info->queue_depth;
+	spin_unlock(&send_queue_info->head_lock);
+	
+	if (pri->remote_addr && (queue_depth < send_queue_info->max_depth)) {
+		mmsg_clone = skb_clone(skb, GFP_ATOMIC);
+		if (mmsg_clone) {
+
+			skb_send_vector = pri->skb_send_vector;
+			skb_send_vector +=  send_queue_info->tail;
+			(* skb_send_vector) = mmsg_clone;
+
+			mmsg_send_vector = pri->mmsg_send_vector;
+			mmsg_send_vector += send_queue_info->tail;
+			mmsg_send_vector->msg_hdr.msg_name = pri->remote_addr;
+			mmsg_send_vector->msg_hdr.msg_namelen = pri->remote_addr_size;
+
+			iov = mmsg_send_vector->msg_hdr.msg_iov;
+
+			if (iov) {
+
+				uml_gre_form_header(iov->iov_base, skb, pri);
+				
+				iov++;
+				
+				iov->iov_base = skb->data;
+				iov->iov_len = skb->len;
+
+				queue_depth = uml_net_advance_tail(send_queue_info, 1);
+			} else {
+				printk("no iov, cannot enqueue\n");
+			}
+		} else {
+			printk("cloning failed\n");
+		}
+	} 
+		
+	spin_unlock(&send_queue_info->tail_lock);
+
+	if (queue_depth > 0) {
+		result = uml_net_flush_mmsg_queue(
+			send_queue_info, queue_depth
+		); 
+	} 
+	return skb->len; /* not particularly correct */
+}
+
+static const struct net_kern_info uml_gre_kern_info = {
+	.options		= UML_NET_USE_SKB_READ,
+	.init			= uml_gre_init,
+	.protocol		= eth_protocol,
+	.read			= uml_gre_read,
+	.skb_read		= uml_gre_multiread,
+	.write			= uml_gre_write,
+};
+
+static int uml_gre_setup(char *str, char **mac_out, void *data)
+{
+	struct uml_gre_init *init = data;
+	char *remain;
+
+	*init = (
+		(struct uml_gre_init)
+		   { 
+			 .local_addr_string = "::1",
+			 .mode_string = "0",
+		   }
+			);
+
+	remain = split_if_spec(str, 
+			mac_out, 
+			&init->local_addr_string, 
+			&init->remote_addr_string, 
+			&init->rx_key_string, 
+			&init->tx_key_string, 
+			&init->mode_string, 
+			NULL
+		);
+	if (remain != NULL)
+		printk(KERN_WARNING " Strange interface spec \n");
+	return 1;
+}
+
+static struct transport uml_gre_transport = {
+	.list 		= LIST_HEAD_INIT(uml_gre_transport.list),
+	.name 		= "gre",
+	.setup  	= uml_gre_setup,
+	.user 		= &uml_gre_user_info,
+	.kern 		= &uml_gre_kern_info,
+	.private_size 	= sizeof(struct uml_gre_data),
+	.setup_size 	= sizeof(struct uml_gre_init),
+};
+
+static int register_uml_gre(void)
+{
+	register_transport(&uml_gre_transport);
+	return 0;
+}
+
+late_initcall(register_uml_gre);
diff --git a/arch/um/drivers/uml_gre_user.c b/arch/um/drivers/uml_gre_user.c
new file mode 100644
index 0000000..ec4dd72
--- /dev/null
+++ b/arch/um/drivers/uml_gre_user.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (C) 2012-2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <netinet/ether.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <arpa/inet.h>
+
+#include "uml_gre.h"
+#include "net_user.h"
+#include "os.h"
+#include "um_malloc.h"
+#include "user.h"
+
+#define VECTOR_SIZE 32
+#define PROTO_GRE 47
+
+static int gre_parse_key(char *src , void * dst) {
+	if ((src == NULL) || (sscanf(src, "%x", (unsigned int *) dst) != 1)) { 
+		printk(UM_KERN_ERR "cannot parse key!!!: %s\n", src);
+		return -1;
+	} 
+	* (( uint32_t *) dst) = htonl(* ((uint32_t* )dst));
+	return 0;
+}
+
+static void uml_gre_remove(void *data)
+{
+	struct uml_gre_data *pri = data;
+
+	gre_kern_destroy(pri); 
+	if (pri->fd > 0) {
+		close(pri->fd);
+	}
+	pri->fd = -1;
+	if (pri->skb_send_vector) {
+		/* this one should be empty - we flushed it so we just free it */
+		kfree(pri->skb_send_vector);
+		pri->skb_send_vector = NULL;
+	}
+	if (pri->mmsg_send_vector) {
+		destroy_mmsg_vector(pri->mmsg_send_vector, VECTOR_SIZE, 1);
+		pri->mmsg_send_vector = NULL;
+	}
+	if (pri->skb_recv_vector) {
+		destroy_skb_vector(pri->skb_recv_vector, VECTOR_SIZE);
+		pri->skb_recv_vector = NULL; 
+	}
+	if (pri->mmsg_recv_vector) {
+		destroy_mmsg_vector(pri->mmsg_recv_vector, VECTOR_SIZE, 1);
+		pri->mmsg_recv_vector = NULL; 
+	}
+	if (pri->network_buffer) {
+		kfree(pri->network_buffer);
+		pri->network_buffer = NULL; 
+	}
+}
+
+static int uml_gre_user_init(void *data, void *dev)
+{
+	struct uml_gre_data *pri = data;
+	int fd;
+
+	int sock_family;
+	int ret;
+	struct addrinfo hints;
+	struct addrinfo *result;
+	char service[NI_MAXSERV];
+	  
+	/* 
+	 
+	This may look ugly, but there is no way about it
+	  UML DIY threading is incompatible with getaddrinfo
+	  so all resolution has to be done using legacy functions
+	
+	  */
+	
+	struct mmsghdr * mmsghdr;
+
+	printk(UM_KERN_INFO "gre user init mode %i\n", pri->mode);
+
+	pri->offset = sizeof(struct gre_minimal_header);
+	pri->checksum_offset = pri->offset;
+	pri->key_offset = pri->offset;
+	pri->sequence_offset = pri->offset;
+
+	pri->fd = -1;
+
+	if (pri->mode & GRE_MODE_CHECKSUM) {
+		pri->offset += 4;
+		pri->key_offset += 4;
+		pri->sequence_offset += 4;
+	}
+
+	if (pri->mode & GRE_MODE_KEY) {
+		pri->offset += 4;
+		pri->sequence_offset +=4;
+		pri->tx_key = 0;
+		pri->rx_key = 0;
+		if (gre_parse_key(pri->tx_key_string,&pri->tx_key) !=0) {
+			return -1;
+		}
+		if (gre_parse_key(pri->rx_key_string,&pri->rx_key) !=0) {
+			return -1;
+		}
+	}
+
+	if (pri->mode & GRE_MODE_SEQUENCE) {
+		pri->offset += 4;
+	}
+
+	/* basic variable parsing */
+	
+	if (pri->remote_addr_string) {		
+		/* we now allocate it only if it we are not "listening" */
+		pri->remote_addr = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_KERNEL);
+	} else {
+		pri->remote_addr = NULL;
+	}
+
+	if (pri->mode & GRE_MODE_IP_VERSION) {
+		/* IPv6 */
+		sock_family = AF_INET6;
+	} else {
+		/* IPv4 */
+		sock_family = AF_INET;
+	}
+
+	printk(UM_KERN_ERR "uml_gre_user_init: preparing raw socket for mode %x\n ", pri->mode);
+
+	memset(&hints, 0, sizeof(hints));
+
+	hints.ai_flags = AI_PASSIVE;
+	hints.ai_family = sock_family;
+	hints.ai_socktype = SOCK_RAW;
+	hints.ai_protocol = PROTO_GRE;
+	
+	if ((fd = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol)) == -1) {
+		fd = -errno;
+		printk(UM_KERN_ERR "uml_gre_user_init: socket creation failed, "
+		 "errno = %d\n", -fd);
+		return fd;
+	}
+
+	pri->fd = fd;
+
+	memset(service, '\0', NI_MAXSERV);
+	ret = getaddrinfo(pri->local_addr_string, service, &hints, &result);
+	if ((ret != 0) || (result == NULL)) {
+		printk(UM_KERN_ERR "uml_gre_user_init: Unable to parse the local endpoint: %d\n", ret);
+		uml_gre_remove(pri);
+		return -1;
+	}
+	if (bind(fd, (struct sockaddr *)result->ai_addr, result->ai_addrlen)) {
+		printk("uml_gre_user_init:  could not bind socket: %d\n", errno);
+		freeaddrinfo(result);
+		uml_gre_remove(pri);
+		return -1;
+	}
+
+	printk("uml_gre_user_init: socket bound\n");
+	freeaddrinfo(result);
+
+	if (pri->remote_addr) {
+		memset(service, '\0', NI_MAXSERV);
+		memset(&hints, 0, sizeof(hints));
+
+		hints.ai_flags = AI_PASSIVE;
+		hints.ai_family = sock_family;
+		hints.ai_socktype = SOCK_RAW;
+		hints.ai_protocol = PROTO_GRE;
+
+		ret = getaddrinfo(pri->remote_addr_string, service, &hints, &result);
+
+		if ((ret != 0) || (result == NULL)) {
+			printk(UM_KERN_ERR "uml_gre_user_init: Unable to parse the remote endpoint: %d\n", ret);
+			uml_gre_remove(pri);
+			return -1;
+		}
+
+		memset(pri->remote_addr, '\0' , sizeof(struct sockaddr_storage));
+		memcpy(pri->remote_addr, result->ai_addr, result->ai_addrlen);
+		pri->remote_addr_size = result->ai_addrlen;
+		freeaddrinfo(result);
+	}
+		
+	/* vector IO init */
+
+	pri->vector_len = VECTOR_SIZE;
+	pri->recv_index = 0;
+	pri->recv_enqueued = 0;
+	pri->header_size = pri->offset /* fix for ipv4 raw */;
+
+	if (!(pri->mode & GRE_MODE_IP_VERSION)){
+		pri->header_size += sizeof(struct iphdr) /* fix for ipv4 raw */;
+	}
+
+	pri->skb_recv_vector = build_skbuf_vector(VECTOR_SIZE, dev);
+	if (! pri->skb_recv_vector) {
+		uml_gre_remove(pri);
+		return -1;
+	}
+	pri->mmsg_recv_vector = build_mmsg_vector(VECTOR_SIZE, 2);
+	if (! pri->mmsg_recv_vector) {
+		uml_gre_remove(pri);
+		return -1;
+	}
+	add_header_buffers(pri->mmsg_recv_vector, VECTOR_SIZE, pri->header_size); 
+	add_skbuffs(
+		pri->mmsg_recv_vector, 
+		pri->skb_recv_vector, 
+		VECTOR_SIZE, ETH_MAX_PACKET + ETH_HEADER_OTHER, 
+		1
+	);
+
+	pri->skb_send_vector = uml_kmalloc(VECTOR_SIZE * sizeof(void *), UM_GFP_KERNEL);
+	if (pri->skb_send_vector) {
+		memset(pri->skb_send_vector, 0, sizeof(void *) * VECTOR_SIZE);
+	} else {
+		uml_gre_remove(pri);
+		return -1;
+	}
+	pri->mmsg_send_vector = build_mmsg_vector(VECTOR_SIZE, 2);
+	if (! pri->mmsg_send_vector) {
+		uml_gre_remove(pri);
+		return -1;
+	}
+	add_header_buffers(pri->mmsg_send_vector, VECTOR_SIZE, pri->offset); 
+
+
+	pri->network_buffer = uml_kmalloc(pri->header_size, UM_GFP_KERNEL); /* enough for any header, regardless how stupid */
+
+	if (!pri->network_buffer) {
+		printk("uml_gre_user_init: could not allocate buffer\n");
+		close(fd);
+		return -1;
+	}
+
+	if (!pri->remote_addr) {
+		mmsghdr = (struct mmsghdr *) pri->mmsg_recv_vector;
+		mmsghdr->msg_hdr.msg_name = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_KERNEL);
+		if (mmsghdr->msg_hdr.msg_name) {
+			mmsghdr->msg_hdr.msg_namelen = sizeof(struct sockaddr_storage);
+		} else {
+			printk("uml_gre_user_init: Failed to allocate remote address name\n");
+		}
+	}
+	 
+	pri->dev = dev;
+
+	gre_complete_init(dev, VECTOR_SIZE); /* we really need error checking here */
+
+	if (pri->fd < 0) {
+		return pri->fd;
+	}
+
+	printk("uml_gre_user_init: init complete, fd %i\n", fd);
+ 
+	return 0;
+}
+
+static int uml_gre_open(void *data)
+{
+	struct uml_gre_data *pri = data;
+	return pri->fd;
+}
+
+
+int uml_gre_user_sendmsg(int fd, 
+			void *header, int headerlen, void *data, 
+			int datalen, struct uml_gre_data *pri)
+{
+	struct msghdr message;
+	struct iovec vec[2];
+	vec[0].iov_base = header;
+	vec[0].iov_len = headerlen;
+	vec[1].iov_base = data;
+	vec[1].iov_len = datalen;
+
+
+	message.msg_name = pri->remote_addr;
+	message.msg_namelen = pri->remote_addr_size;
+	message.msg_iov = (struct iovec *) &vec;
+	message.msg_iovlen = 2;
+	message.msg_control = NULL;
+	message.msg_controllen = 0;
+	message.msg_flags = MSG_DONTWAIT;
+
+
+	if (pri->remote_addr != NULL) {
+		return net_sendmessage(fd, &message, MSG_DONTWAIT);
+	} else {
+		return -1;
+	}
+}
+int uml_gre_user_recvmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_gre_data *pri)
+{
+	struct msghdr message;
+	struct iovec vec[2];
+	vec[0].iov_base = header;
+	vec[0].iov_len = headerlen;
+	vec[1].iov_base = data;
+	vec[1].iov_len = datalen;
+
+	if (!pri->remote_addr) {
+		pri->remote_addr = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_KERNEL);
+		if (pri->remote_addr) {
+			message.msg_name = pri->remote_addr;
+			message.msg_namelen = pri->remote_addr_size;
+		} else {
+			message.msg_name = NULL;
+			message.msg_namelen = 0;
+		}
+	} else {
+			message.msg_name = NULL;
+			message.msg_namelen = 0;
+	}
+	
+	message.msg_iov = (struct iovec *) &vec;
+	message.msg_iovlen = 2;
+	message.msg_control = NULL;
+	message.msg_controllen = 0;
+	message.msg_flags = MSG_DONTWAIT;
+
+	return net_recvmessage(fd, &message, MSG_DONTWAIT);
+}
+const struct net_user_info uml_gre_user_info = {
+	.init		= uml_gre_user_init,
+	.open		= uml_gre_open,
+	.close	 	= NULL,
+	.remove	 	= uml_gre_remove,
+	.add_address	= NULL,
+	.delete_address = NULL,
+	.mtu		= ETH_MAX_PACKET,
+	.max_packet	= ETH_MAX_PACKET + ETH_HEADER_OTHER + MAX_GRE_HEADER,
+};
-- 
1.7.10.4


------------------------------------------------------------------------------
Slashdot TV.  
Video for Nerds.  Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [uml-devel] [PATCH 06/10] RAW Ethernet transport for UML
  2014-08-29  7:05 [uml-devel] [PATCH 01/10] Epoll based interrupt controller anton.ivanov
                   ` (3 preceding siblings ...)
  2014-08-29  7:05 ` [uml-devel] [PATCH 05/10] GRE transport " anton.ivanov
@ 2014-08-29  7:05 ` anton.ivanov
  2014-08-29  7:05 ` [uml-devel] [PATCH 07/10] Performance and NUMA improvements for ubd anton.ivanov
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 10+ messages in thread
From: anton.ivanov @ 2014-08-29  7:05 UTC (permalink / raw)
  To: user-mode-linux-devel

From: Anton Ivanov <antivano@cisco.com>

This is an alternative to the well known pcap transport.

In the absense of special hardware support pcap is slow,
guaranteed to be slow and with significant penalties on
NUMA/SMP systems due to the timestamping of every packet.

This transport does not incur any of these timestamping
penalties. It reads and writes packets directly using
recvmmsg and sendmmsg calls.

Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
 arch/um/Kconfig.net            |   12 ++
 arch/um/drivers/Makefile       |    2 +
 arch/um/drivers/uml_raw.h      |   57 +++++++++
 arch/um/drivers/uml_raw_kern.c |  259 ++++++++++++++++++++++++++++++++++++++++
 arch/um/drivers/uml_raw_user.c |  166 +++++++++++++++++++++++++
 5 files changed, 496 insertions(+)
 create mode 100644 arch/um/drivers/uml_raw.h
 create mode 100644 arch/um/drivers/uml_raw_kern.c
 create mode 100644 arch/um/drivers/uml_raw_user.c

diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net
index 7c8ba68..e38f839 100644
--- a/arch/um/Kconfig.net
+++ b/arch/um/Kconfig.net
@@ -105,6 +105,18 @@ config UML_NET_GRE
         the applicable RFCs. The driver supports Soft GRE (wait for connect)
         as used in Cable systems, etc.
 
+config UML_NET_RAW
+	bool "RAW transport"
+	depends on UML_NET
+	help
+        This User-Mode Linux network transport allows UML to bind a raw 
+        Ethernet interface using a high-performance non-capture oriented
+        method to read and write traffic. The difference between this driver
+        and any form of PCAP is that this driver does not incur the cost 
+        of getting the timestamp for every packet read. This allows it to
+        reach higher performance levels (in the Gigabit range).
+        
+
 config UML_NET_DAEMON
 	bool "Daemon transport"
 	depends on UML_NET
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index 66127ee..9c9d821 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -11,6 +11,7 @@ slirp-objs := slirp_kern.o slirp_user.o
 daemon-objs := daemon_kern.o daemon_user.o
 uml_l2tpv3-objs := uml_l2tpv3_kern.o uml_l2tpv3_user.o
 uml_gre-objs := uml_gre_kern.o uml_gre_user.o
+uml_raw-objs := uml_raw_kern.o uml_raw_user.o
 umcast-objs := umcast_kern.o umcast_user.o
 net-objs := net_kern.o net_user.o net_extra_user.o net_extra_kern.o
 mconsole-objs := mconsole_kern.o mconsole_user.o
@@ -47,6 +48,7 @@ obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o
 obj-$(CONFIG_UML_NET_DAEMON) += daemon.o 
 obj-$(CONFIG_UML_NET_L2TPV3) += uml_l2tpv3.o 
 obj-$(CONFIG_UML_NET_GRE) += uml_gre.o 
+obj-$(CONFIG_UML_NET_RAW) += uml_raw.o 
 obj-$(CONFIG_UML_NET_VDE) += vde.o
 obj-$(CONFIG_UML_NET_MCAST) += umcast.o
 obj-$(CONFIG_UML_NET_PCAP) += pcap.o
diff --git a/arch/um/drivers/uml_raw.h b/arch/um/drivers/uml_raw.h
new file mode 100644
index 0000000..224205e
--- /dev/null
+++ b/arch/um/drivers/uml_raw.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __UML_RAW_H__
+#define __UML_RAW_H__
+
+#include "net_user.h"
+
+struct uml_raw_data {
+	char *host_iface;
+	int fd;
+	void *dev;
+        uint32_t uml_raw_flags;
+
+	/* packet mmap read */
+
+	uint8_t *scratch_buffer;  /* for dummy reads*/
+	uint8_t *multiread_buffer;
+	int ring_index;
+
+	/* multi-rx read */
+
+	void ** skb_recv_vector;
+	void * mmsg_recv_vector;
+	void ** skb_send_vector;
+	void * mmsg_send_vector;
+
+	uint32_t vector_len;
+	uint32_t recv_index;
+	uint32_t recv_enqueued;
+
+	void * send_queue_info;
+
+};
+
+extern const struct net_user_info uml_raw_user_info;
+
+extern int uml_raw_user_write(int fd, void *buf, int len,
+			     struct uml_raw_data *pri);
+
+extern void raw_complete_init(void * dev_id, int max_depth);
+extern void raw_kern_destroy(struct uml_raw_data *pri);
+
+#define UML_RAW_FLAG_TX_CHECKSUMS                0x00000001
+#define UML_RAW_FLAG_RX_CHECKSUMS                0x00000002
+
+
+#define UML_RAW_TP_BLOCK_SIZE 4096
+#define UML_RAW_TP_FRAME_SIZE 2048
+#define UML_RAW_TP_BLOCK_NR 32
+#define UML_RAW_TP_FRAME_NR 64
+
+
+#endif
diff --git a/arch/um/drivers/uml_raw_kern.c b/arch/um/drivers/uml_raw_kern.c
new file mode 100644
index 0000000..ea6dbdf
--- /dev/null
+++ b/arch/um/drivers/uml_raw_kern.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include "linux/init.h"
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/if_packet.h>
+#include "net_kern.h"
+#include "uml_raw.h"
+#include "linux/mutex.h"
+
+#define DRIVER_NAME "uml-raw"
+
+
+struct uml_raw_init {
+	char *host_iface;
+};
+
+static void uml_raw_get_drvinfo(struct net_device *dev,
+				struct ethtool_drvinfo *info)
+{
+	strcpy(info->driver, DRIVER_NAME);
+	strcpy(info->version, "42");
+}
+
+
+static const struct ethtool_ops uml_raw_ethtool_ops = {
+	.get_drvinfo			  = uml_raw_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+};
+
+
+static void uml_raw_init(struct net_device *dev, void *data)
+{
+	struct uml_net_private *pri;
+	struct uml_raw_data *dpri;
+	struct uml_raw_init *init = data;
+
+	pri = netdev_priv(dev);
+	dpri = (struct uml_raw_data *) pri->user;
+	dpri->host_iface = init->host_iface;
+	dpri->fd = -1;
+	dpri->dev = dev;
+	 
+	/* We will free this pointer. If it contains crap we're burned. */
+
+	printk("raw backend - host iface: %s",  dpri->host_iface);
+	printk("\n");
+	printk("enabling ethtool support\n");
+	dpri->uml_raw_flags = 0; /* we have everything turned off initially */
+	SET_ETHTOOL_OPS(dev, &uml_raw_ethtool_ops);
+
+}
+
+static struct sk_buff * uml_raw_multiread (struct uml_net_private * lp) {
+	struct uml_raw_data *dpri = (struct uml_raw_data *) &lp->user;
+	void ** skb_vector = dpri->skb_recv_vector;
+	struct mmsghdr * mmsg_vector = (struct mmsghdr *) dpri->mmsg_recv_vector;
+	struct sk_buff * result = NULL;
+	struct iovec * iov;
+	int ret;
+	 
+	if (dpri->recv_index >= dpri->recv_enqueued) {
+		dpri->recv_index = 0;
+		if (dpri->recv_enqueued) {
+			 rebuild_skbuf_vector(skb_vector, dpri->recv_enqueued, lp->dev);
+			 add_skbuffs(mmsg_vector, skb_vector, dpri->recv_enqueued, lp->max_packet, 0);
+		}
+		ret = net_recvmmsg(
+			 dpri->fd, dpri->mmsg_recv_vector, dpri->vector_len, 0, NULL);
+		if (ret >= 0) {
+			 dpri->recv_enqueued = ret;
+		} else {
+			 dpri->recv_enqueued = 0;
+			 return NULL;
+		}
+	}
+	skb_vector += dpri->recv_index;
+	mmsg_vector += dpri->recv_index;
+	while (dpri->recv_index < dpri->recv_enqueued) {
+		dpri->recv_index ++;
+		iov = mmsg_vector->msg_hdr.msg_iov;
+		if ((mmsg_vector->msg_len) && (iov)) {
+			result = (struct sk_buff *)(* skb_vector);
+			if (result) {
+				skb_trim(result, mmsg_vector->msg_len);
+				result->protocol = (*lp->protocol)(result);
+				return result;
+			} else {
+				printk("encountered failed atomic allocation, skipping to next\n");
+			}
+		} else {
+			uml_net_destroy_skb(* skb_vector ) ; /* otherwise we leak it */
+			result = NULL;
+		}
+		mmsg_vector++;
+		skb_vector++;
+	} 
+	return result;
+}
+
+
+
+static int uml_raw_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	int result;
+	struct uml_raw_data *dpri;
+	dpri = (struct uml_raw_data *) lp->user;
+
+	result = net_read(fd, skb_mac_header(skb),
+		skb->dev->mtu + ETH_HEADER_OTHER);
+
+	return result;
+}
+
+static int uml_raw_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	struct uml_raw_data *pri = (struct uml_raw_data *) &lp->user;
+	int result, queue_depth;
+	struct mmsghdr * mmsg_send_vector;
+	struct iovec * iov;
+	struct mmsg_queue_info * send_queue_info; 
+	struct sk_buff * mmsg_clone;
+	void ** skb_send_vector;
+	
+	send_queue_info = (struct mmsg_queue_info *) pri->send_queue_info;
+
+	spin_lock(&send_queue_info->tail_lock);
+
+	spin_lock(&send_queue_info->head_lock);
+	queue_depth = send_queue_info->queue_depth;
+	spin_unlock(&send_queue_info->head_lock);
+	
+	if (queue_depth < send_queue_info->max_depth) {
+		mmsg_clone = skb_clone(skb, GFP_ATOMIC);
+		if (mmsg_clone) {
+
+			skb_send_vector = pri->skb_send_vector;
+			skb_send_vector +=  send_queue_info->tail;
+			(* skb_send_vector) = mmsg_clone;
+
+			mmsg_send_vector = pri->mmsg_send_vector;
+			mmsg_send_vector += send_queue_info->tail;
+
+			iov = mmsg_send_vector->msg_hdr.msg_iov;
+
+			if (iov) {
+
+				iov->iov_base = skb->data;
+				iov->iov_len = skb->len;
+
+				queue_depth = uml_net_advance_tail(send_queue_info, 1);
+			} else {
+				printk("no iov, cannot enqueue\n");
+			}
+		} else {
+			printk("cloning failed\n");
+		}
+	} 
+		
+	spin_unlock(&send_queue_info->tail_lock);
+
+	if (queue_depth > 0) {
+		result = uml_net_flush_mmsg_queue(
+			send_queue_info, queue_depth
+		); 
+	} 
+	return skb->len; /* not particularly correct */
+}
+
+static const struct net_kern_info uml_raw_kern_info = {
+	.options		= UML_NET_USE_SKB_READ,
+	.init			= uml_raw_init,
+	.protocol		= eth_protocol,
+	.read			= uml_raw_read,
+	.skb_read		= uml_raw_multiread,
+	.write			= uml_raw_write
+};
+
+static int uml_raw_setup(char *str, char **mac_out, void *data)
+{
+	struct uml_raw_init *init = data;
+	char *remain;
+
+	*init = (
+		(struct uml_raw_init)
+			{ .host_iface = "eth0"}
+	);
+
+	remain = split_if_spec(str, mac_out, &init->host_iface, NULL);
+	if (remain != NULL)
+		printk(KERN_WARNING " Strange interface spec \n");
+
+	return 1;
+}
+
+
+void raw_kern_destroy(struct uml_raw_data *pri) {
+
+	int ret = -1;
+	struct mmsg_queue_info * queue_info = pri->send_queue_info;
+	/* flush queue */
+	do {
+		ret = uml_net_flush_mmsg_queue(queue_info, -1);
+	} while (ret != 0);
+	pri->send_queue_info = NULL;
+	kfree(queue_info);
+}
+
+
+void raw_complete_init(void * dev_id, int max_depth) {
+
+	struct net_device *dev = dev_id;
+	struct uml_net_private *lp = netdev_priv(dev);
+	struct uml_raw_data *pri = (struct uml_raw_data *) &lp->user;
+	struct mmsg_queue_info * queue_info ; 
+	int err;
+
+	queue_info =  
+		kmalloc(sizeof(struct mmsg_queue_info), GFP_KERNEL);
+	if (queue_info) {
+		queue_info->fd = pri->fd;
+		queue_info->mmsg_send_vector = pri->mmsg_send_vector;
+		queue_info->skb_send_vector = pri->skb_send_vector;
+		queue_info->head = 0;
+		queue_info->tail = 0;
+		queue_info->queue_depth = 0;
+		queue_info->max_depth = max_depth;
+		spin_lock_init(&queue_info->head_lock); 
+		spin_lock_init(&queue_info->tail_lock); 
+	}
+	pri->send_queue_info = queue_info;
+}
+
+
+
+static struct transport uml_raw_transport = {
+	.list 		= LIST_HEAD_INIT(uml_raw_transport.list),
+	.name 		= "raw",
+	.setup  	= uml_raw_setup,
+	.user 		= &uml_raw_user_info,
+	.kern 		= &uml_raw_kern_info,
+	.private_size 	= sizeof(struct uml_raw_data),
+	.setup_size 	= sizeof(struct uml_raw_init),
+};
+
+static int register_uml_raw(void)
+{
+	register_transport(&uml_raw_transport);
+	return 0;
+}
+
+late_initcall(register_uml_raw);
diff --git a/arch/um/drivers/uml_raw_user.c b/arch/um/drivers/uml_raw_user.c
new file mode 100644
index 0000000..ab31108
--- /dev/null
+++ b/arch/um/drivers/uml_raw_user.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <netinet/ether.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <sys/mman.h>
+
+
+#include "uml_raw.h"
+#include "net_user.h"
+#include "os.h"
+#include "um_malloc.h"
+#include "user.h"
+
+#define VECTOR_SIZE 32
+
+static void uml_raw_remove(void *data)
+{
+	struct uml_raw_data *pri = data;
+
+	raw_kern_destroy(pri); 
+	if (pri->fd > 0) {
+		close(pri->fd);
+	}
+	pri->fd = -1;
+	if (pri->skb_send_vector) {
+		/* this one should be empty - we flushed it so we just free it */
+		kfree(pri->skb_send_vector);
+		pri->skb_send_vector = NULL;
+	}
+	if (pri->mmsg_send_vector) {
+		destroy_mmsg_vector(pri->mmsg_send_vector, VECTOR_SIZE, 0);
+		pri->mmsg_send_vector = NULL;
+	}
+	if (pri->skb_recv_vector) {
+		destroy_skb_vector(pri->skb_recv_vector, VECTOR_SIZE);
+		pri->skb_recv_vector = NULL; 
+	}
+	if (pri->mmsg_recv_vector) {
+		destroy_mmsg_vector(pri->mmsg_recv_vector, VECTOR_SIZE, 0);
+		pri->mmsg_recv_vector = NULL; 
+	}
+}
+
+static int uml_raw_user_init(void *data, void *dev)
+{
+	struct uml_raw_data *pri = data;
+	struct ifreq ifr;
+	int fd;
+	struct sockaddr_ll sock;
+	int err;
+	struct tpacket_req tpacket;
+	 
+	pri->ring_index = 0;
+	 
+	if ((fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL))) == -1) {
+		 err = -errno;
+		 printk(UM_KERN_ERR "uml_raw_open : raw socket creation failed, "
+				 "errno = %d\n", -err);
+		 return err;
+	}
+	
+	pri->fd = fd;
+	pri->vector_len = VECTOR_SIZE;
+	pri->recv_index = 0;
+	pri->recv_enqueued = 0;
+	pri->skb_recv_vector = build_skbuf_vector(VECTOR_SIZE, dev);
+	if (! pri->skb_recv_vector) {
+		uml_raw_remove(pri);
+		return -1;
+	}
+	pri->mmsg_recv_vector = build_mmsg_vector(VECTOR_SIZE, 1);
+	if (! pri->mmsg_recv_vector) {
+		uml_raw_remove(pri);
+		return -1;
+	}
+	add_skbuffs(
+		 pri->mmsg_recv_vector, 
+		 pri->skb_recv_vector, 
+		 VECTOR_SIZE, ETH_MAX_PACKET + ETH_HEADER_OTHER, 
+		 0
+	);
+
+	pri->skb_send_vector = uml_kmalloc(VECTOR_SIZE * sizeof(void *), UM_GFP_KERNEL);
+	if (pri->skb_send_vector) {
+		memset(pri->skb_send_vector, 0, sizeof(void *) * VECTOR_SIZE);
+	} else {
+		uml_raw_remove(pri);
+		return -1;
+	}
+	pri->mmsg_send_vector = build_mmsg_vector(VECTOR_SIZE, 1);
+	if (! pri->mmsg_send_vector) {
+		uml_raw_remove(pri);
+		return -1;
+	}
+	memset(&ifr, 0, sizeof(ifr));
+	strncpy(&ifr.ifr_name, pri->host_iface, sizeof(ifr.ifr_name) - 1);
+	if(ioctl(fd, SIOCGIFINDEX, (void *) &ifr) < 0) {
+		err = -errno;
+		printk(UM_KERN_ERR "SIOCGIFINDEX, failed to get raw interface index for %s", pri->host_iface);
+		uml_raw_remove(pri);
+		return(-1);
+	}
+
+	sock.sll_family = AF_PACKET;
+	sock.sll_protocol = htons(ETH_P_ALL);
+	sock.sll_ifindex = ifr.ifr_ifindex;
+
+	printk(UM_KERN_INFO "uml_raw: binding raw on interface index: %i\n", ifr.ifr_ifindex);
+	if (bind(fd, (struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) {
+		printk(UM_KERN_ERR "uml_raw: failed to bind raw socket");
+		uml_raw_remove(pri);
+		return(-1);
+	}
+
+	pri->dev = dev;
+
+	raw_complete_init(dev, VECTOR_SIZE); /* we really need error checking here */
+
+	if (pri->fd < 0) {
+		return pri->fd;
+	}
+
+	return 0;
+}
+
+static int uml_raw_open(void *data)
+{
+	struct uml_raw_data *pri = data;
+	return pri->fd;
+}
+
+
+int uml_raw_user_write(int fd, void *buf, int len, struct uml_raw_data *pri)
+{
+	return net_write(fd, buf, len);
+}
+
+const struct net_user_info uml_raw_user_info = {
+	.init		= uml_raw_user_init,
+	.open		= uml_raw_open,
+	.close	 	= NULL,
+	.remove	 	= uml_raw_remove,
+	.add_address	= NULL,
+	.delete_address = NULL,
+	.mtu		= ETH_MAX_PACKET,
+	.max_packet	= ETH_MAX_PACKET + ETH_HEADER_OTHER,
+};
-- 
1.7.10.4


------------------------------------------------------------------------------
Slashdot TV.  
Video for Nerds.  Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [uml-devel] [PATCH 07/10] Performance and NUMA improvements for ubd
  2014-08-29  7:05 [uml-devel] [PATCH 01/10] Epoll based interrupt controller anton.ivanov
                   ` (4 preceding siblings ...)
  2014-08-29  7:05 ` [uml-devel] [PATCH 06/10] RAW Ethernet " anton.ivanov
@ 2014-08-29  7:05 ` anton.ivanov
  2014-08-29  7:05 ` [uml-devel] [PATCH 08/10] Minor performance optimization " anton.ivanov
  2014-08-29  7:05 ` [uml-devel] [PATCH 09/10] Better IPC for UBD anton.ivanov
  7 siblings, 0 replies; 10+ messages in thread
From: anton.ivanov @ 2014-08-29  7:05 UTC (permalink / raw)
  To: user-mode-linux-devel

From: Anton Ivanov <antivano@cisco.com>

The use of the seek()/read() and seek()/write() is a terminal
disease on NUMA. Intense use of this on shared files (f.e.
the master for a COW image) can cause anything up to and including
killing CPUs on unhandled NMIs.

This patch deals with this UML major issue (and one of UML biggest
performance pitfalls). As a result you can now run (subject to
correct pinning) 2000+ UMLs on a NUMA box without crashing it.

Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
 arch/um/drivers/ubd_kern.c  |   29 ++++++++---------------------
 arch/um/include/shared/os.h |    2 ++
 arch/um/os-Linux/file.c     |   18 ++++++++++++++++++
 3 files changed, 28 insertions(+), 21 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 1cc72ae5..35ba00b 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (C) 2012-2014 Cisco Systems
  * Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
  * Licensed under the GPL
  */
@@ -534,11 +535,7 @@ static int read_cow_bitmap(int fd, void *buf, int offset, int len)
 {
 	int err;
 
-	err = os_seek_file(fd, offset);
-	if (err < 0)
-		return err;
-
-	err = os_read_file(fd, buf, len);
+	err = os_pread_file(fd, buf, len, offset);
 	if (err < 0)
 		return err;
 
@@ -1375,14 +1372,10 @@ static int update_bitmap(struct io_thread_req *req)
 	if(req->cow_offset == -1)
 		return 0;
 
-	n = os_seek_file(req->fds[1], req->cow_offset);
-	if(n < 0){
-		printk("do_io - bitmap lseek failed : err = %d\n", -n);
-		return 1;
-	}
-
-	n = os_write_file(req->fds[1], &req->bitmap_words,
-			  sizeof(req->bitmap_words));
+	n = os_pwrite_file(req->fds[1], &req->bitmap_words,
+			  sizeof(req->bitmap_words),
+			  req->cow_offset
+			);
 	if(n != sizeof(req->bitmap_words)){
 		printk("do_io - bitmap update failed, err = %d fd = %d\n", -n,
 		       req->fds[1]);
@@ -1426,18 +1419,12 @@ static void do_io(struct io_thread_req *req)
 		len = (end - start) * req->sectorsize;
 		buf = &req->buffer[start * req->sectorsize];
 
-		err = os_seek_file(req->fds[bit], off);
-		if(err < 0){
-			printk("do_io - lseek failed : err = %d\n", -err);
-			req->error = 1;
-			return;
-		}
 		if(req->op == UBD_READ){
 			n = 0;
 			do {
 				buf = &buf[n];
 				len -= n;
-				n = os_read_file(req->fds[bit], buf, len);
+				n = os_pread_file(req->fds[bit], buf, len, off);
 				if (n < 0) {
 					printk("do_io - read failed, err = %d "
 					       "fd = %d\n", -n, req->fds[bit]);
@@ -1447,7 +1434,7 @@ static void do_io(struct io_thread_req *req)
 			} while((n < len) && (n != 0));
 			if (n < len) memset(&buf[n], 0, len - n);
 		} else {
-			n = os_write_file(req->fds[bit], buf, len);
+			n = os_pwrite_file(req->fds[bit], buf, len, off);
 			if(n != len){
 				printk("do_io - write failed err = %d "
 				       "fd = %d\n", -n, req->fds[bit]);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 17b4e9f..7f544f4 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -142,6 +142,8 @@ extern int os_seek_file(int fd, unsigned long long offset);
 extern int os_open_file(const char *file, struct openflags flags, int mode);
 extern int os_read_file(int fd, void *buf, int len);
 extern int os_write_file(int fd, const void *buf, int count);
+extern int os_pread_file(int fd, void *buf, int len, unsigned long long offset);
+extern int os_pwrite_file(int fd, const void *buf, int count, unsigned long long offset);
 extern int os_sync_file(int fd);
 extern int os_file_size(const char *file, unsigned long long *size_out);
 extern int os_file_modtime(const char *file, unsigned long *modtime);
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index 07a7501..64951fd 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -266,6 +266,24 @@ int os_write_file(int fd, const void *buf, int len)
 	return n;
 }
 
+int os_pread_file(int fd, void *buf, int len, unsigned long long offset)
+{
+	int n = pread(fd, buf, len, offset);
+
+	if (n < 0)
+		return -errno;
+	return n;
+}
+
+int os_pwrite_file(int fd, const void *buf, int len, unsigned long long offset)
+{
+	int n = pwrite(fd, (void *) buf, len, offset);
+
+	if (n < 0)
+		return -errno;
+	return n;
+}
+
 int os_sync_file(int fd)
 {
 	int n = fsync(fd);
-- 
1.7.10.4


------------------------------------------------------------------------------
Slashdot TV.  
Video for Nerds.  Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [uml-devel] [PATCH 08/10] Minor performance optimization for ubd
  2014-08-29  7:05 [uml-devel] [PATCH 01/10] Epoll based interrupt controller anton.ivanov
                   ` (5 preceding siblings ...)
  2014-08-29  7:05 ` [uml-devel] [PATCH 07/10] Performance and NUMA improvements for ubd anton.ivanov
@ 2014-08-29  7:05 ` anton.ivanov
  2014-08-29  7:05 ` [uml-devel] [PATCH 09/10] Better IPC for UBD anton.ivanov
  7 siblings, 0 replies; 10+ messages in thread
From: anton.ivanov @ 2014-08-29  7:05 UTC (permalink / raw)
  To: user-mode-linux-devel

From: Anton Ivanov <antivano@cisco.com>

Obvious performance optimization - it is not necessary
to read the requests one at a time in the IRQ handler

Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
 arch/um/drivers/ubd_kern.c |   29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 35ba00b..66d424a 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -443,6 +443,8 @@ static void do_ubd_request(struct request_queue * q);
 static int thread_fd = -1;
 static LIST_HEAD(restart);
 
+static struct io_thread_req  * ubd_request_list[MAX_SG];
+
 /* XXX - move this inside ubd_intr. */
 /* Called without dev->lock held, and only in interrupt context. */
 static void ubd_handler(void)
@@ -451,21 +453,34 @@ static void ubd_handler(void)
 	struct ubd *ubd;
 	struct list_head *list, *next_ele;
 	unsigned long flags;
-	int n;
+	int n, i;
+
+	/* 
+	 * obvious optimization - we do not need to read the reqs one at a time
+	 * we can read all pending reqs in one interrupt and handle them in bulk
+	 */
 
 	while(1){
-		n = os_read_file(thread_fd, &req,
-				 sizeof(struct io_thread_req *));
-		if(n != sizeof(req)){
+        do {
+		n = os_read_file(thread_fd, &ubd_request_list,
+				 sizeof(struct io_thread_req *) * MAX_SG);
+        } while (n == -EINTR);
+		if(n < 0){
 			if(n == -EAGAIN)
 				break;
 			printk(KERN_ERR "spurious interrupt in ubd_handler, "
 			       "err = %d\n", -n);
 			return;
+		} else if (n % sizeof(struct io_thread_req *) != 0)  {
+			printk(KERN_ERR "spurious interrupt in ubd_handler, "
+			       "err = %d\n", -n);
+			return;
+		}
+		for (i = 0; i < n / sizeof(struct io_thread_req *); i++) {
+			req = ubd_request_list[i];
+			blk_end_request(req->req, 0, req->length);
+			kfree(req);
 		}
-
-		blk_end_request(req->req, 0, req->length);
-		kfree(req);
 	}
 
 	list_for_each_safe(list, next_ele, &restart){
-- 
1.7.10.4


------------------------------------------------------------------------------
Slashdot TV.  
Video for Nerds.  Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [uml-devel] [PATCH 09/10] Better IPC for UBD
  2014-08-29  7:05 [uml-devel] [PATCH 01/10] Epoll based interrupt controller anton.ivanov
                   ` (6 preceding siblings ...)
  2014-08-29  7:05 ` [uml-devel] [PATCH 08/10] Minor performance optimization " anton.ivanov
@ 2014-08-29  7:05 ` anton.ivanov
  7 siblings, 0 replies; 10+ messages in thread
From: anton.ivanov @ 2014-08-29  7:05 UTC (permalink / raw)
  To: user-mode-linux-devel

From: Anton Ivanov <antivano@cisco.com>

socketpair() is a better IPC choice for lots of small requests
as it allows deeper (and configurable) queues than pipe()

As a result UBD will process nearly all of the requests submitted
to it instead of bouncing a significant percentage under load

Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
 arch/um/drivers/ubd_kern.c |    2 +-
 arch/um/drivers/ubd_user.c |    2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 66d424a..ae78211 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1268,7 +1268,7 @@ static bool submit_request(struct io_thread_req *io_req, struct ubd *dev)
 	int n = os_write_file(thread_fd, &io_req,
 			     sizeof(io_req));
 	if (n != sizeof(io_req)) {
-		if (n != -EAGAIN)
+		if ((n != -EAGAIN) && (n != -ENOBUFS))
 			printk("write to io thread failed, "
 			       "errno = %d\n", -n);
 		else if (list_empty(&dev->restart))
diff --git a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c
index e376f9b..117ff13 100644
--- a/arch/um/drivers/ubd_user.c
+++ b/arch/um/drivers/ubd_user.c
@@ -25,7 +25,7 @@ int start_io_thread(unsigned long sp, int *fd_out)
 {
 	int pid, fds[2], err;
 
-	err = os_pipe(fds, 1, 1);
+	err = socketpair(AF_UNIX, SOCK_STREAM, 0, (int *) &fds);
 	if(err < 0){
 		printk("start_io_thread - os_pipe failed, err = %d\n", -err);
 		goto out;
-- 
1.7.10.4


------------------------------------------------------------------------------
Slashdot TV.  
Video for Nerds.  Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [uml-devel] [PATCH 03/10] High performance networking subsystem
  2014-08-29  7:05 ` [uml-devel] [PATCH 03/10] High performance networking subsystem anton.ivanov
@ 2014-08-30  7:35   ` Anton Ivanov (antivano)
  0 siblings, 0 replies; 10+ messages in thread
From: Anton Ivanov (antivano) @ 2014-08-30  7:35 UTC (permalink / raw)
  To: anton.ivanov@kot-begemot.co.uk,
	user-mode-linux-devel@lists.sourceforge.net

Hi Richard, hi list,

some notes on this patchset: we have had the multipacket rx portion of 
this stable and in use for quite a while (nearly a year).

The tx portion is new and it looks like it has some issues which did not 
show up before I ported it to a more recent kernel (I would not be 
surprised if I introduced them when porting this from 3.3.y/OpenWRT to 
3.12.y/Stock).

If I do not figure out what exactly is going on with the tx by end of 
next week I am going to resubmit an older version of this patch (and its 
corresponding transports) which has only multi-packet rx (which is 
stable) and leave TX for a later incremental.

A.


On 29/08/14 08:05, anton.ivanov@kot-begemot.co.uk wrote:
> From: Anton Ivanov <antivano@cisco.com>
>
> Support for multi-packet vector IO - multiple packets
> read in one syscall and written in one syscall. Should work with
> legacy UML, thorough tested only for the epoll based IRQ controller
>
> Minimal host kernel version for RX - 2.6.32
> Minimal host kernel version for TX - 3.0
>
> Tested on Debian 7.0/Ubuntu 12.x LTS which have the relevant
> syscalls, but do not have the appropriate glibc routine for TX
> (this is why it is a direct syscall).
>
> Signed-off-by: Anton Ivanov <antivano@cisco.com>
> ---
>   arch/um/drivers/Makefile          |    2 +-
>   arch/um/drivers/net_kern.c        |   63 ++++++++++++++++++++++++-------------
>   arch/um/include/asm/irq.h         |   26 +++++++++------
>   arch/um/include/shared/net_kern.h |   24 ++++++++++++++
>   arch/um/include/shared/net_user.h |   24 ++++++++++++++
>   arch/um/kernel/irq.c              |    3 ++
>   6 files changed, 109 insertions(+), 33 deletions(-)
>
> diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
> index e7582e1..836baaf 100644
> --- a/arch/um/drivers/Makefile
> +++ b/arch/um/drivers/Makefile
> @@ -10,7 +10,7 @@ slip-objs := slip_kern.o slip_user.o
>   slirp-objs := slirp_kern.o slirp_user.o
>   daemon-objs := daemon_kern.o daemon_user.o
>   umcast-objs := umcast_kern.o umcast_user.o
> -net-objs := net_kern.o net_user.o
> +net-objs := net_kern.o net_user.o net_extra_user.o net_extra_kern.o
>   mconsole-objs := mconsole_kern.o mconsole_user.o
>   hostaudio-objs := hostaudio_kern.o
>   ubd-objs := ubd_kern.o ubd_user.o
> diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
> index 64d8426..1d253fa 100644
> --- a/arch/um/drivers/net_kern.c
> +++ b/arch/um/drivers/net_kern.c
> @@ -1,4 +1,5 @@
>   /*
> + * Copyright (C) 2012 - 2014 Cisco Systems
>    * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
>    * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
>    * James Leu (jleu@mindspring.net).
> @@ -29,6 +30,7 @@
>   
>   static DEFINE_SPINLOCK(opened_lock);
>   static LIST_HEAD(opened);
> +static int rr_counter = 0;
>   
>   /*
>    * The drop_skb is used when we can't allocate an skb.  The
> @@ -42,6 +44,7 @@ static DEFINE_SPINLOCK(drop_lock);
>   static struct sk_buff *drop_skb;
>   static int drop_max;
>   
> +
>   static int update_drop_skb(int max)
>   {
>   	struct sk_buff *new;
> @@ -77,24 +80,38 @@ static int uml_net_rx(struct net_device *dev)
>   	struct sk_buff *skb;
>   
>   	/* If we can't allocate memory, try again next round. */
> -	skb = dev_alloc_skb(lp->max_packet);
> -	if (skb == NULL) {
> -		drop_skb->dev = dev;
> -		/* Read a packet into drop_skb and don't do anything with it. */
> -		(*lp->read)(lp->fd, drop_skb, lp);
> -		dev->stats.rx_dropped++;
> +	if (lp->options & UML_NET_USE_SKB_READ) {
> +	    /* we expect a full formed, well behaved skb from zero copy drivers here */
> +	    skb = (*lp->skb_read)(lp);
> +	    if (skb == NULL) {
>   		return 0;
> -	}
> -
> -	skb->dev = dev;
> -	skb_put(skb, lp->max_packet);
> -	skb_reset_mac_header(skb);
> -	pkt_len = (*lp->read)(lp->fd, skb, lp);
> -
> -	if (pkt_len > 0) {
> +	    }
> +	    pkt_len = skb->len;
> +	} else {
> +	    skb = dev_alloc_skb(lp->max_packet + 32);
> +	    if (skb == NULL) {
> +		    drop_skb->dev = dev;
> +		    /* Read a packet into drop_skb and don't do anything with it. */
> +		    (*lp->read)(lp->fd, drop_skb, lp);
> +		    dev->stats.rx_dropped++;
> +		    return 0;
> +	    }
> +
> +	    skb_reserve(skb,32);
> +	    skb->dev = dev;
> +	    skb_put(skb, lp->max_packet);
> +	    skb_reset_mac_header(skb);
> +
> +	    // Mark that virtual devices cannot provide required checksum.
> +	    skb->ip_summed = CHECKSUM_NONE;
> +	    pkt_len = (*lp->read)(lp->fd, skb, lp);
> +	    if (pkt_len > 0) {
>   		skb_trim(skb, pkt_len);
>   		skb->protocol = (*lp->protocol)(skb);
> +	    }
> +	}
>   
> +	if (pkt_len > 0) {
>   		dev->stats.rx_bytes += skb->len;
>   		dev->stats.rx_packets++;
>   		netif_rx(skb);
> @@ -192,8 +209,9 @@ static int uml_net_close(struct net_device *dev)
>   	struct uml_net_private *lp = netdev_priv(dev);
>   
>   	netif_stop_queue(dev);
> +	deactivate_fd(lp->fd, dev->irq);
>   
> -	um_free_irq(dev->irq, dev);
> +	free_irq(dev->irq, dev);
>   	if (lp->close != NULL)
>   		(*lp->close)(lp->fd, &lp->user);
>   	lp->fd = -1;
> @@ -216,7 +234,6 @@ static int uml_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
>   	spin_lock_irqsave(&lp->lock, flags);
>   
>   	len = (*lp->write)(lp->fd, skb, lp);
> -	skb_tx_timestamp(skb);
>   
>   	if (len == skb->len) {
>   		dev->stats.tx_packets++;
> @@ -273,14 +290,13 @@ static void uml_net_poll_controller(struct net_device *dev)
>   static void uml_net_get_drvinfo(struct net_device *dev,
>   				struct ethtool_drvinfo *info)
>   {
> -	strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
> -	strlcpy(info->version, "42", sizeof(info->version));
> +	strcpy(info->driver, DRIVER_NAME);
> +	strcpy(info->version, "42");
>   }
>   
>   static const struct ethtool_ops uml_net_ethtool_ops = {
>   	.get_drvinfo	= uml_net_get_drvinfo,
>   	.get_link	= ethtool_op_get_link,
> -	.get_ts_info	= ethtool_op_get_ts_info,
>   };
>   
>   static void uml_net_user_timer_expire(unsigned long _conn)
> @@ -447,6 +463,7 @@ static void eth_configure(int n, void *init, char *mac,
>   	 * These just fill in a data structure, so there's no failure
>   	 * to be worried about.
>   	 */
> +	dev->ethtool_ops = &uml_net_ethtool_ops;
>   	(*transport->kern->init)(dev, init);
>   
>   	*lp = ((struct uml_net_private)
> @@ -459,7 +476,9 @@ static void eth_configure(int n, void *init, char *mac,
>   		  .open 		= transport->user->open,
>   		  .close 		= transport->user->close,
>   		  .remove 		= transport->user->remove,
> +		  .options 		= transport->kern->options,
>   		  .read 		= transport->kern->read,
> +		  .skb_read 		= transport->kern->skb_read,
>   		  .write 		= transport->kern->write,
>   		  .add_address 		= transport->user->add_address,
>   		  .delete_address  	= transport->user->delete_address });
> @@ -475,9 +494,9 @@ static void eth_configure(int n, void *init, char *mac,
>   
>   	dev->mtu = transport->user->mtu;
>   	dev->netdev_ops = &uml_netdev_ops;
> -	dev->ethtool_ops = &uml_net_ethtool_ops;
>   	dev->watchdog_timeo = (HZ >> 1);
> -	dev->irq = UM_ETH_IRQ;
> +	dev->irq = UM_ETH_BASE_IRQ + (rr_counter % UM_ETH_IRQ_RR);
> +	rr_counter++;
>   
>   	err = update_drop_skb(lp->max_packet);
>   	if (err)
> @@ -829,7 +848,7 @@ static void close_devices(void)
>   	spin_lock(&opened_lock);
>   	list_for_each(ele, &opened) {
>   		lp = list_entry(ele, struct uml_net_private, list);
> -		um_free_irq(lp->dev->irq, lp->dev);
> +		free_irq(lp->dev->irq, lp->dev);
>   		if ((lp->close != NULL) && (lp->fd >= 0))
>   			(*lp->close)(lp->fd, &lp->user);
>   		if (lp->remove != NULL)
> diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h
> index 4a2037f..be9128b 100644
> --- a/arch/um/include/asm/irq.h
> +++ b/arch/um/include/asm/irq.h
> @@ -1,21 +1,27 @@
> +
>   #ifndef __UM_IRQ_H
>   #define __UM_IRQ_H
>   
> +#define UM_ETH_IRQ_RR	        32
> +
>   #define TIMER_IRQ		0
>   #define UMN_IRQ			1
>   #define CONSOLE_IRQ		2
>   #define CONSOLE_WRITE_IRQ	3
>   #define UBD_IRQ			4
> -#define UM_ETH_IRQ		5
> -#define SSL_IRQ			6
> -#define SSL_WRITE_IRQ		7
> -#define ACCEPT_IRQ		8
> -#define MCONSOLE_IRQ		9
> -#define WINCH_IRQ		10
> -#define SIGIO_WRITE_IRQ 	11
> -#define TELNETD_IRQ 		12
> -#define XTERM_IRQ 		13
> -#define RANDOM_IRQ 		14
> +#define UM_ETH_BASE_IRQ		5
> +
> +#define UM_END_ETH_IRQ	        UM_ETH_BASE_IRQ + UM_ETH_IRQ_RR
> +
> +#define SSL_IRQ			UM_END_ETH_IRQ + 1
> +#define SSL_WRITE_IRQ		UM_END_ETH_IRQ + 2
> +#define ACCEPT_IRQ		UM_END_ETH_IRQ + 3
> +#define MCONSOLE_IRQ		UM_END_ETH_IRQ + 4
> +#define WINCH_IRQ		UM_END_ETH_IRQ + 5
> +#define SIGIO_WRITE_IRQ 	UM_END_ETH_IRQ + 6
> +#define TELNETD_IRQ 		UM_END_ETH_IRQ + 7
> +#define XTERM_IRQ 		UM_END_ETH_IRQ + 8
> +#define RANDOM_IRQ 		UM_END_ETH_IRQ + 9
>   
>   #define LAST_IRQ RANDOM_IRQ
>   #define NR_IRQS (LAST_IRQ + 1)
> diff --git a/arch/um/include/shared/net_kern.h b/arch/um/include/shared/net_kern.h
> index 012ac87..2229126 100644
> --- a/arch/um/include/shared/net_kern.h
> +++ b/arch/um/include/shared/net_kern.h
> @@ -1,4 +1,5 @@
>   /*
> + * Copyright (C) 2012 - 2014 Cisco Systems
>    * Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
>    * Licensed under the GPL
>    */
> @@ -13,6 +14,8 @@
>   #include <linux/list.h>
>   #include <linux/workqueue.h>
>   
> +#define UML_NET_USE_SKB_READ 1
> +
>   struct uml_net {
>   	struct list_head list;
>   	struct net_device *dev;
> @@ -28,6 +31,7 @@ struct uml_net_private {
>   
>   	struct work_struct work;
>   	int fd;
> +	unsigned int options;
>   	unsigned char mac[ETH_ALEN];
>   	int max_packet;
>   	unsigned short (*protocol)(struct sk_buff *);
> @@ -36,6 +40,7 @@ struct uml_net_private {
>   	void (*remove)(void *);
>   	int (*read)(int, struct sk_buff *skb, struct uml_net_private *);
>   	int (*write)(int, struct sk_buff *skb, struct uml_net_private *);
> +	struct sk_buff * (*skb_read)(struct uml_net_private *);
>   
>   	void (*add_address)(unsigned char *, unsigned char *, void *);
>   	void (*delete_address)(unsigned char *, unsigned char *, void *);
> @@ -47,6 +52,8 @@ struct net_kern_info {
>   	unsigned short (*protocol)(struct sk_buff *);
>   	int (*read)(int, struct sk_buff *skb, struct uml_net_private *);
>   	int (*write)(int, struct sk_buff *skb, struct uml_net_private *);
> +	struct sk_buff * (*skb_read)(struct uml_net_private *);
> +	unsigned int options;
>   };
>   
>   struct transport {
> @@ -59,11 +66,28 @@ struct transport {
>   	const int setup_size;
>   };
>   
> +struct mmsg_queue_info {
> +	int fd;
> +	struct mmsghdr * mmsg_send_vector;
> +	void ** skb_send_vector;
> +	int queue_depth, head, tail, max_depth;
> +	spinlock_t head_lock;
> +	spinlock_t tail_lock;
> +	unsigned int queue_fsm;
> +};
> +
>   extern struct net_device *ether_init(int);
>   extern unsigned short ether_protocol(struct sk_buff *);
>   extern int tap_setup_common(char *str, char *type, char **dev_name,
>   			    char **mac_out, char **gate_addr);
>   extern void register_transport(struct transport *new);
>   extern unsigned short eth_protocol(struct sk_buff *skb);
> +extern struct sk_buff *my_build_skb(void * head, void *data, unsigned int frag_size);
> +
> +extern void flush_pending_netio(void);
> +
> +extern int uml_net_advance_tail( struct mmsg_queue_info * queue_info, int advance);
> +extern int uml_net_advance_head( struct mmsg_queue_info * queue_info, int advance);
> +extern int uml_net_flush_mmsg_queue(struct mmsg_queue_info * queue_info, int queue_depth);
>   
>   #endif
> diff --git a/arch/um/include/shared/net_user.h b/arch/um/include/shared/net_user.h
> index 3dabbe1..4b46f37 100644
> --- a/arch/um/include/shared/net_user.h
> +++ b/arch/um/include/shared/net_user.h
> @@ -1,4 +1,5 @@
>   /*
> + * Copyright (C) 2012 - 2014 Cisco Systems
>    * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
>    * Licensed under the GPL
>    */
> @@ -38,10 +39,15 @@ extern void tap_check_ips(char *gate_addr, unsigned char *eth_addr);
>   extern void read_output(int fd, char *output_out, int len);
>   
>   extern int net_read(int fd, void *buf, int len);
> +extern int net_readv(int fd, void *iov, int iovcnt);
>   extern int net_recvfrom(int fd, void *buf, int len);
> +extern int net_recvfrom2(int fd, void *buf, int len, void *src_addr, int *addrlen);
>   extern int net_write(int fd, void *buf, int len);
> +extern int net_writev(int fd, void *iov, int iovcnt);
>   extern int net_send(int fd, void *buf, int len);
>   extern int net_sendto(int fd, void *buf, int len, void *to, int sock_len);
> +extern int net_sendmessage(int fd, void *msg, int flags);
> +extern int net_recvmessage(int fd, void *msg, int flags);
>   
>   extern void open_addr(unsigned char *addr, unsigned char *netmask, void *arg);
>   extern void close_addr(unsigned char *addr, unsigned char *netmask, void *arg);
> @@ -50,4 +56,22 @@ extern char *split_if_spec(char *str, ...);
>   
>   extern int dev_netmask(void *d, void *m);
>   
> +
> +extern void uml_net_destroy_skb(void * skb);
> +extern void * uml_net_build_skb (void * dev);
> +extern void * uml_net_skb_data (void * skb);
> +
> +extern void add_skbuffs(void * msgvec, void ** skbvec, int size, int skb_size, int offset);
> +extern void add_header_buffers(void * msgvec, int size, int header_size);
> +extern void * build_mmsg_vector(int size, int iovsize);
> +extern void rebuild_skbuf_vector(void ** skbvec, int size, void * dev);
> +extern void * build_skbuf_vector(int size, void * dev);
> +extern int net_recvmmsg(int fd, void *msgvec, unsigned int vlen,
> +		unsigned int flags, struct timespec *timeout);
> +extern int net_sendmmsg(int fd, void *msgvec, unsigned int vlen,
> +		unsigned int flags);
> +extern void repair_mmsg (void *msgvec, int iovsize, int header_size);
> +extern void destroy_skb_vector(void ** vector, int size);
> +extern void destroy_mmsg_vector(void * mmsgvector, int size, int free_iov_base);
> +
>   #endif
> diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
> index 5d7ee49e..f4c6fb1 100644
> --- a/arch/um/kernel/irq.c
> +++ b/arch/um/kernel/irq.c
> @@ -17,6 +17,7 @@
>   #include <as-layout.h>
>   #include <kern_util.h>
>   #include <os.h>
> +#include <net_kern.h>
>   
>   /*
>   *	We are on the "kernel side" so we cannot pick up the sys/epoll.h
> @@ -136,6 +137,8 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
>   		spin_unlock_irqrestore(&uml_sigio_lock, flags);
>   	}
>   
> +	flush_pending_netio();
> +
>   	/* This needs a better way - it slows down the event loop */
>   
>   	free_irqs();

------------------------------------------------------------------------------
Slashdot TV.  
Video for Nerds.  Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2014-08-30  7:35 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-08-29  7:05 [uml-devel] [PATCH 01/10] Epoll based interrupt controller anton.ivanov
2014-08-29  7:05 ` [uml-devel] [PATCH 02/10] Remove unnecessary 'reactivate' statements anton.ivanov
2014-08-29  7:05 ` [uml-devel] [PATCH 03/10] High performance networking subsystem anton.ivanov
2014-08-30  7:35   ` Anton Ivanov (antivano)
2014-08-29  7:05 ` [uml-devel] [PATCH 04/10] L2TPv3 Transport Driver for UML anton.ivanov
2014-08-29  7:05 ` [uml-devel] [PATCH 05/10] GRE transport " anton.ivanov
2014-08-29  7:05 ` [uml-devel] [PATCH 06/10] RAW Ethernet " anton.ivanov
2014-08-29  7:05 ` [uml-devel] [PATCH 07/10] Performance and NUMA improvements for ubd anton.ivanov
2014-08-29  7:05 ` [uml-devel] [PATCH 08/10] Minor performance optimization " anton.ivanov
2014-08-29  7:05 ` [uml-devel] [PATCH 09/10] Better IPC for UBD anton.ivanov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.