* [uml-devel] [PATCH v3 01/10] Epoll based interrupt controller
2014-09-04 19:00 [uml-devel] UML Performance improvement patchset anton.ivanov
@ 2014-09-04 19:00 ` anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 02/10] Remove unnecessary 'reactivate' statements anton.ivanov
` (9 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: anton.ivanov @ 2014-09-04 19:00 UTC (permalink / raw)
To: user-mode-linux-devel
From: Anton Ivanov <antivano@cisco.com>
1. Minimum kernel 2.5.99
2. No "walk the list" lookups for received IRQs - immediate identification
of the correct handler to invoke
3. Full set of IRQ semantics - edge, level, read, write
3.1. Write is now a *REAL* write - so if you (ab)use the
write to signify NONE (as in line.c) you will hang!!!
3.2. Read is fully backward compatible
4. Otherwise mostly compatible with original poll() based controller
5. Provides a significant performance improvement (up to 10x for
large device numbers) and lays the groundwork for the network and
timer improvements to follow
Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
arch/um/drivers/line.c | 3 +-
arch/um/include/shared/irq_user.h | 19 +-
arch/um/include/shared/os.h | 13 +-
arch/um/kernel/irq.c | 454 +++++++++++++++++++++++++------------
arch/um/os-Linux/irq.c | 145 +++++-------
5 files changed, 391 insertions(+), 243 deletions(-)
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 8035145..6c4511f 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -283,7 +284,7 @@ int line_setup_irq(int fd, int input, int output, struct line *line, void *data)
if (err)
return err;
if (output)
- err = um_request_irq(driver->write_irq, fd, IRQ_WRITE,
+ err = um_request_irq(driver->write_irq, fd, IRQ_NONE,
line_write_interrupt, IRQF_SHARED,
driver->write_irq_name, data);
return err;
diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h
index df56330..472282c 100644
--- a/arch/um/include/shared/irq_user.h
+++ b/arch/um/include/shared/irq_user.h
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -9,16 +10,18 @@
#include <sysdep/ptrace.h>
struct irq_fd {
- struct irq_fd *next;
- void *id;
- int fd;
- int type;
- int irq;
- int events;
- int current_events;
+ struct irq_fd *next;
+ struct irq_fd *leaf;
+ void *id;
+ int fd;
+ int type;
+ int irq;
+ int events;
};
-enum { IRQ_READ, IRQ_WRITE };
+#define IRQ_NONE 0
+#define IRQ_READ 1
+#define IRQ_WRITE 2
struct siginfo;
extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 021104d..17b4e9f 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -276,15 +277,17 @@ extern void halt_skas(void);
extern void reboot_skas(void);
/* irq.c */
-extern int os_waiting_for_events(struct irq_fd *active_fds);
-extern int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds);
+
+extern int os_setup_epoll(int maxevents);
+extern int os_waiting_for_events_epoll(void *kernel_events, int maxevents);
+extern int os_add_epoll_fd (int events, int fd, void * data);
+extern int os_mod_epoll_fd (int events, int fd, void * data);
+extern int os_del_epoll_fd (int fd);
+
extern void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2);
extern void os_free_irq_later(struct irq_fd *active_fds,
int irq, void *dev_id);
-extern int os_get_pollfd(int i);
-extern void os_set_pollfd(int i, int fd);
-extern void os_set_ioignore(void);
/* sigio.c */
extern int add_sigio_fd(int fd);
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 1d8505b..2869160 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
* Derived (i.e. mostly copied) from arch/i386/kernel/irq.c:
@@ -18,6 +19,61 @@
#include <os.h>
/*
+* We are on the "kernel side" so we cannot pick up the sys/epoll.h
+* So we lift out of it the applicable key definitions.
+*/
+
+
+enum EPOLL_EVENTS
+ {
+ EPOLLIN = 0x001,
+#define EPOLLIN EPOLLIN
+ EPOLLPRI = 0x002,
+#define EPOLLPRI EPOLLPRI
+ EPOLLOUT = 0x004,
+#define EPOLLOUT EPOLLOUT
+ EPOLLRDNORM = 0x040,
+#define EPOLLRDNORM EPOLLRDNORM
+ EPOLLRDBAND = 0x080,
+#define EPOLLRDBAND EPOLLRDBAND
+ EPOLLWRNORM = 0x100,
+#define EPOLLWRNORM EPOLLWRNORM
+ EPOLLWRBAND = 0x200,
+#define EPOLLWRBAND EPOLLWRBAND
+ EPOLLMSG = 0x400,
+#define EPOLLMSG EPOLLMSG
+ EPOLLERR = 0x008,
+#define EPOLLERR EPOLLERR
+ EPOLLHUP = 0x010,
+#define EPOLLHUP EPOLLHUP
+ EPOLLRDHUP = 0x2000,
+#define EPOLLRDHUP EPOLLRDHUP
+ EPOLLONESHOT = (1 << 30),
+#define EPOLLONESHOT EPOLLONESHOT
+ EPOLLET = (1 << 31)
+#define EPOLLET EPOLLET
+ };
+
+
+typedef union epoll_data
+{
+ void *ptr;
+ int fd;
+ uint32_t u32;
+ uint64_t u64;
+} epoll_data_t;
+
+struct epoll_event
+{
+ uint32_t events; /* Epoll events */
+ epoll_data_t data; /* User data variable */
+} __attribute__ ((__packed__));
+
+#define MAX_EPOLL_EVENTS 16
+
+static struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
+
+/*
* This list is accessed under irq_lock, except in sigio_handler,
* where it is safe from being modified. IRQ handlers won't change it -
* if an IRQ source has vanished, it will be freed by free_irqs just
@@ -26,46 +82,98 @@
* remove list elements, taking the irq_lock to do so.
*/
static struct irq_fd *active_fds = NULL;
-static struct irq_fd **last_irq_ptr = &active_fds;
extern void free_irqs(void);
+/*
+ the in_epoll_loop is not static on purpose - we will use this to
+ determine if we can do delayed queue flushes in devices. The idea is -
+ if we read 32 packets at a time using recvmmsg we need an
+ indication that we will be reading more so no point to send now
+ and flush the queue only once we are done with it
+*/
+
+DEFINE_SPINLOCK(uml_sigio_lock);
+
+int in_epoll_loop = 0;
+
+static DEFINE_SPINLOCK(irq_lock);
+static DEFINE_SPINLOCK(event_loop);
+
void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
{
struct irq_fd *irq_fd;
- int n;
+ unsigned long flags;
+
+ int n, i;
if (smp_sigio_handler())
return;
while (1) {
- n = os_waiting_for_events(active_fds);
+ spin_lock_irqsave(&uml_sigio_lock, flags);
+ in_epoll_loop = 1;
+ n = os_waiting_for_events_epoll(
+ &epoll_events, MAX_EPOLL_EVENTS
+ );
if (n <= 0) {
- if (n == -EINTR)
- continue;
- else break;
+ in_epoll_loop = 0;
+ spin_unlock_irqrestore(&uml_sigio_lock, flags);
+ break;
}
-
- for (irq_fd = active_fds; irq_fd != NULL;
- irq_fd = irq_fd->next) {
- if (irq_fd->current_events != 0) {
- irq_fd->current_events = 0;
- do_IRQ(irq_fd->irq, regs);
+ for (i = 0; i < n ; i++) {
+ for (
+ irq_fd = (struct irq_fd *)
+ epoll_events[i].data.ptr;
+ irq_fd != NULL;
+ irq_fd = irq_fd->leaf) {
+ if (epoll_events[i].events & irq_fd->events) {
+ do_IRQ(irq_fd->irq, regs);
+ }
}
}
+ in_epoll_loop = 0;
+ spin_unlock_irqrestore(&uml_sigio_lock, flags);
}
+ /* This needs a better way - it slows down the event loop */
+
free_irqs();
}
-
-static DEFINE_SPINLOCK(irq_lock);
+#define TRUNK_FORMAT "trunk %d\tfd %03d, events %03x, dev %p\n"
+#define LEAF_FORMAT "leaf %d\tfd %03d, events %03x, dev %p\n"
+
+static void dump_interrupt_map (void) {
+ struct irq_fd * irq, *leaf ;
+ printk("MAP:\n");
+ for (irq = active_fds; irq != NULL; irq = irq->next) {
+ printk(
+ TRUNK_FORMAT,
+ irq->irq, irq->fd, irq->events, irq->id
+ );
+ if (irq->leaf) {
+ for (
+ leaf = irq->leaf;
+ leaf != NULL;
+ leaf = leaf->leaf
+ ) {
+ printk(
+ LEAF_FORMAT,
+ leaf->irq,
+ leaf->fd,
+ leaf->events,
+ leaf->id
+ );
+ }
+ }
+ }
+}
static int activate_fd(int irq, int fd, int type, void *dev_id)
{
- struct pollfd *tmp_pfd;
- struct irq_fd *new_fd, *irq_fd;
+ struct irq_fd *new_fd, *irq_fd, *leaf ;
unsigned long flags;
- int events, err, n;
+ int events = 0, acc_events = 0, err, n, skip = 0;
err = os_set_fd_async(fd);
if (err < 0)
@@ -76,64 +184,56 @@ static int activate_fd(int irq, int fd, int type, void *dev_id)
if (new_fd == NULL)
goto out;
- if (type == IRQ_READ)
- events = UM_POLLIN | UM_POLLPRI;
- else events = UM_POLLOUT;
+ if (type & IRQ_READ)
+ events |= EPOLLIN | EPOLLPRI;
+ if (type & IRQ_WRITE)
+ events |= EPOLLOUT;
+
*new_fd = ((struct irq_fd) { .next = NULL,
- .id = dev_id,
- .fd = fd,
- .type = type,
- .irq = irq,
- .events = events,
- .current_events = 0 } );
+ .leaf = NULL,
+ .id = dev_id,
+ .fd = fd,
+ .type = type,
+ .irq = irq,
+ .events = events });
err = -EBUSY;
spin_lock_irqsave(&irq_lock, flags);
+
for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) {
- if ((irq_fd->fd == fd) && (irq_fd->type == type)) {
- printk(KERN_ERR "Registering fd %d twice\n", fd);
- printk(KERN_ERR "Irqs : %d, %d\n", irq_fd->irq, irq);
- printk(KERN_ERR "Ids : 0x%p, 0x%p\n", irq_fd->id,
- dev_id);
- goto out_unlock;
+ if (irq_fd->fd == fd) {
+ for (leaf = irq_fd; leaf != NULL; leaf = leaf->leaf) {
+ if (leaf->type == type) {
+ printk("Irqs : %d, %d\n", leaf->irq, irq);
+ printk("Ids : 0x%p, 0x%p\n", leaf->id, dev_id);
+ goto out_unlock;
+ }
+ acc_events |= leaf->events;
+ }
+ /* we insert it one-off-the-head - easiest
+ we also pass our "head" as the pointer to mod
+ so it walks correctly
+ */
+ skip = 1;
+ new_fd->leaf = irq_fd->leaf;
+ irq_fd->leaf = new_fd;
+ if ((new_fd->events | acc_events) != acc_events) {
+ n = os_mod_epoll_fd(new_fd->events | acc_events, fd, irq_fd);
+ }
}
}
-
- if (type == IRQ_WRITE)
- fd = -1;
-
- tmp_pfd = NULL;
- n = 0;
-
- while (1) {
- n = os_create_pollfd(fd, events, tmp_pfd, n);
- if (n == 0)
- break;
-
- /*
- * n > 0
- * It means we couldn't put new pollfd to current pollfds
- * and tmp_fds is NULL or too small for new pollfds array.
- * Needed size is equal to n as minimum.
- *
- * Here we have to drop the lock in order to call
- * kmalloc, which might sleep.
- * If something else came in and changed the pollfds array
- * so we will not be able to put new pollfd struct to pollfds
- * then we free the buffer tmp_fds and try again.
- */
- spin_unlock_irqrestore(&irq_lock, flags);
- kfree(tmp_pfd);
-
- tmp_pfd = kmalloc(n, GFP_KERNEL);
- if (tmp_pfd == NULL)
- goto out_kfree;
-
- spin_lock_irqsave(&irq_lock, flags);
+ if (! skip) {
+ /* proper IRQ registration */
+ new_fd->next = active_fds;
+ active_fds = new_fd;
+
+ if (new_fd->type != IRQ_NONE ) {
+ n = os_add_epoll_fd(new_fd->events, fd, new_fd);
+ } else {
+ n = 0;
+ }
}
- *last_irq_ptr = new_fd;
- last_irq_ptr = &new_fd->next;
spin_unlock_irqrestore(&irq_lock, flags);
@@ -141,122 +241,199 @@ static int activate_fd(int irq, int fd, int type, void *dev_id)
* This calls activate_fd, so it has to be outside the critical
* section.
*/
- maybe_sigio_broken(fd, (type == IRQ_READ));
+
+ maybe_sigio_broken(fd, (type != IRQ_NONE));
return 0;
out_unlock:
spin_unlock_irqrestore(&irq_lock, flags);
- out_kfree:
kfree(new_fd);
out:
return err;
}
-static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg)
+/* Must be called with irq_lock held */
+static struct irq_fd *find_irq_chain_by_fd(int fd)
{
- unsigned long flags;
+ struct irq_fd *irq;
- spin_lock_irqsave(&irq_lock, flags);
- os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr);
- spin_unlock_irqrestore(&irq_lock, flags);
+ for (irq = active_fds; irq != NULL; irq = irq->next) {
+ if (irq->fd == fd) {
+ return irq;
+ }
+ }
+ if (irq == NULL) {
+ printk(KERN_ERR
+ "find_irq_chain_by_fd doesn't have descriptor %d\n",
+ fd);
+ dump_interrupt_map();
+ }
+ return irq;
}
-struct irq_and_dev {
- int irq;
- void *dev;
-};
-
-static int same_irq_and_dev(struct irq_fd *irq, void *d)
+static struct irq_fd *find_irq_by_fd(int fd, int irqnum)
{
- struct irq_and_dev *data = d;
+ struct irq_fd *irq;
+
+ for (irq = find_irq_chain_by_fd(fd); irq != NULL; irq = irq->leaf) {
+ if (irq->irq == irqnum) return irq;
+ }
+ if (irq == NULL) {
+ printk(KERN_ERR "find_irq_by_fd doesn't have descriptor %d\n",
+ fd);
+ dump_interrupt_map();
+ }
+ return irq;
+}
- return ((irq->irq == data->irq) && (irq->id == data->dev));
+static void free_leaf_irq_by_irq_and_dev(unsigned int irq, void *dev, struct irq_fd * prev) {
+ /* this is called out of free_irq_by_irq_and_dev with a held lock */
+ struct irq_fd *leaf;
+ if (prev != NULL) {
+ leaf = prev->leaf;
+ } else {
+ return;
+ }
+ while (leaf != NULL) {
+ if ((leaf->irq == irq) && (leaf->id == dev)) {
+ if (leaf->events) {
+ os_del_epoll_fd(leaf->fd);
+ }
+ prev->leaf = leaf->leaf;
+ kfree(leaf);
+ } else {
+ prev = leaf;
+ }
+ leaf = prev->leaf;
+ }
}
-static void free_irq_by_irq_and_dev(unsigned int irq, void *dev)
+static int do_free_irq_by_irq_and_dev(unsigned int irq, void *dev)
{
- struct irq_and_dev data = ((struct irq_and_dev) { .irq = irq,
- .dev = dev });
+ unsigned long flags;
+ struct irq_fd *prev, * trunk;
+ spin_lock_irqsave(&irq_lock, flags);
- free_irq_by_cb(same_irq_and_dev, &data);
-}
+ trunk = active_fds;
+ prev = NULL;
-static int same_fd(struct irq_fd *irq, void *fd)
-{
- return (irq->fd == *((int *)fd));
-}
+ while (trunk != NULL) {
-void free_irq_by_fd(int fd)
-{
- free_irq_by_cb(same_fd, &fd);
-}
+ /* walk the branch and free irq descriptor if on branch */
-/* Must be called with irq_lock held */
-static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out)
-{
- struct irq_fd *irq;
- int i = 0;
- int fdi;
+ if (trunk->leaf != NULL) {
+ free_leaf_irq_by_irq_and_dev(irq, dev, trunk);
+ }
- for (irq = active_fds; irq != NULL; irq = irq->next) {
- if ((irq->fd == fd) && (irq->irq == irqnum))
- break;
- i++;
+ if ((trunk->irq == irq) && (trunk->id == dev)) {
+ /* delete irq descriptor off trunk */
+ if (trunk->leaf != NULL) {
+ /* leaf non-null, attach instead of
+ freed irq descriptor
+ */
+ if (prev != NULL) {
+ prev->next = trunk->leaf;
+ } else {
+ active_fds = trunk->leaf;
+ }
+ trunk->leaf->next = trunk->next;
+ if (trunk->events) {
+ os_del_epoll_fd(trunk->fd);
+ }
+ kfree(trunk);
+ } else {
+ if (prev != NULL) {
+ prev->next = trunk->next;
+ } else {
+ active_fds = trunk->next;
+ }
+ if (trunk->events) {
+ os_del_epoll_fd(trunk->fd);
+ }
+ kfree(trunk);
+ }
+ /* irq + dev should be unique, it is also easier
+ to restart than to juggle all the pointers after
+ making holes in the list
+ */
+ return 1;
+ }
+ prev = trunk;
+ trunk = trunk->next;
}
- if (irq == NULL) {
- printk(KERN_ERR "find_irq_by_fd doesn't have descriptor %d\n",
- fd);
- goto out;
+ spin_unlock_irqrestore(&irq_lock, flags);
+ return 0;
+}
+static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) {
+ while (do_free_irq_by_irq_and_dev(irq, dev) != 0) {
}
- fdi = os_get_pollfd(i);
- if ((fdi != -1) && (fdi != fd)) {
- printk(KERN_ERR "find_irq_by_fd - mismatch between active_fds "
- "and pollfds, fd %d vs %d, need %d\n", irq->fd,
- fdi, fd);
- irq = NULL;
- goto out;
+}
+
+void free_irq_by_fd(int fd)
+{
+ struct irq_fd *irq, * found;
+ unsigned long flags;
+ spin_lock_irqsave(&irq_lock, flags);
+ found = find_irq_chain_by_fd(fd);
+ if (found == NULL) {
+ spin_unlock_irqrestore(&irq_lock, flags);
+ return;
}
- *index_out = i;
- out:
- return irq;
+ os_del_epoll_fd(fd);
+ /* free the whole chain */
+ while (found != NULL) {
+ irq = found;
+ found = found->leaf;
+ kfree(irq);
+ }
+ spin_unlock_irqrestore(&irq_lock, flags);
+
}
void reactivate_fd(int fd, int irqnum)
{
- struct irq_fd *irq;
+ struct irq_fd *irq, * found;
unsigned long flags;
- int i;
+ int acc_events = 0;
spin_lock_irqsave(&irq_lock, flags);
- irq = find_irq_by_fd(fd, irqnum, &i);
- if (irq == NULL) {
+ found = find_irq_chain_by_fd(fd);
+ if (found == NULL) {
spin_unlock_irqrestore(&irq_lock, flags);
return;
}
- os_set_pollfd(i, irq->fd);
+ for (
+ irq = found;
+ irq != NULL;
+ irq = irq->leaf) {
+ acc_events |= irq->events;
+ }
+ if (os_add_epoll_fd(acc_events, fd, found) !=0) {
+ os_mod_epoll_fd(acc_events, fd, found);
+ }
spin_unlock_irqrestore(&irq_lock, flags);
-
add_sigio_fd(fd);
+
}
void deactivate_fd(int fd, int irqnum)
{
struct irq_fd *irq;
unsigned long flags;
- int i;
spin_lock_irqsave(&irq_lock, flags);
- irq = find_irq_by_fd(fd, irqnum, &i);
+ irq = find_irq_by_fd(fd, irqnum);
if (irq == NULL) {
spin_unlock_irqrestore(&irq_lock, flags);
return;
}
- os_set_pollfd(i, -1);
+ os_del_epoll_fd(irq->fd);
spin_unlock_irqrestore(&irq_lock, flags);
-
ignore_sigio_fd(fd);
+
+
}
EXPORT_SYMBOL(deactivate_fd);
@@ -272,12 +449,11 @@ int deactivate_all_fds(void)
int err;
for (irq = active_fds; irq != NULL; irq = irq->next) {
+ os_del_epoll_fd(irq->fd); /* ignore err, just do it */
err = os_clear_fd_async(irq->fd);
if (err)
return err;
}
- /* If there is a signal already queued, after unblocking ignore it */
- os_set_ioignore();
return 0;
}
@@ -311,13 +487,13 @@ int um_request_irq(unsigned int irq, int fd, int type,
{
int err;
- if (fd != -1) {
+ err = request_irq(irq, handler, irqflags, devname, dev_id);
+
+ if ((!err) && (fd != -1)) {
err = activate_fd(irq, fd, type, dev_id);
- if (err)
- return err;
}
- return request_irq(irq, handler, irqflags, devname, dev_id);
+ return err;
}
EXPORT_SYMBOL(um_request_irq);
@@ -355,9 +531,9 @@ void __init init_IRQ(void)
int i;
irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq);
-
- for (i = 1; i < NR_IRQS; i++)
+ for (i = 1; i < NR_IRQS - 1 ; i++)
irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq);
+ os_setup_epoll(MAX_EPOLL_EVENTS);
}
/*
@@ -385,11 +561,11 @@ void __init init_IRQ(void)
* thread_info.
*
* There are three cases -
- * The first interrupt on the stack - sets up the thread_info and
+ * The first interrupt on the stack - sets up the thread_info and
* handles the interrupt
- * A nested interrupt interrupting the copying of the thread_info -
+ * A nested interrupt interrupting the copying of the thread_info -
* can't handle the interrupt, as the stack is in an unknown state
- * A nested interrupt not interrupting the copying of the
+ * A nested interrupt not interrupting the copying of the
* thread_info - doesn't do any setup, just handles the interrupt
*
* The first job is to figure out whether we interrupted stack setup.
diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c
index b9afb74..837aa68 100644
--- a/arch/um/os-Linux/irq.c
+++ b/arch/um/os-Linux/irq.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -6,6 +7,7 @@
#include <stdlib.h>
#include <errno.h>
#include <poll.h>
+#include <sys/epoll.h>
#include <signal.h>
#include <string.h>
#include <irq_user.h>
@@ -16,117 +18,80 @@
* Locked by irq_lock in arch/um/kernel/irq.c. Changed by os_create_pollfd
* and os_free_irq_by_cb, which are called under irq_lock.
*/
-static struct pollfd *pollfds = NULL;
-static int pollfds_num = 0;
-static int pollfds_size = 0;
-int os_waiting_for_events(struct irq_fd *active_fds)
+/* epoll support */
+
+
+static int epollfd = -1;
+
+int os_setup_epoll(int maxevents) {
+ epollfd = epoll_create(maxevents);
+ return epollfd;
+}
+
+int os_waiting_for_events_epoll(void *kernel_events, int maxevents)
{
- struct irq_fd *irq_fd;
- int i, n, err;
+ int n, err;
- n = poll(pollfds, pollfds_num, 0);
+ n = epoll_wait(epollfd,
+ (struct epoll_event *) kernel_events, maxevents, 0);
if (n < 0) {
err = -errno;
if (errno != EINTR)
- printk(UM_KERN_ERR "os_waiting_for_events:"
- " poll returned %d, errno = %d\n", n, errno);
+ printk(
+ UM_KERN_ERR "os_waiting_for_events:"
+ " poll returned %d, error = %s\n", n,
+ strerror(errno)
+ );
return err;
}
- if (n == 0)
- return 0;
-
- irq_fd = active_fds;
-
- for (i = 0; i < pollfds_num; i++) {
- if (pollfds[i].revents != 0) {
- irq_fd->current_events = pollfds[i].revents;
- pollfds[i].fd = -1;
- }
- irq_fd = irq_fd->next;
- }
return n;
}
-int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds)
-{
- if (pollfds_num == pollfds_size) {
- if (size_tmpfds <= pollfds_size * sizeof(pollfds[0])) {
- /* return min size needed for new pollfds area */
- return (pollfds_size + 1) * sizeof(pollfds[0]);
- }
-
- if (pollfds != NULL) {
- memcpy(tmp_pfd, pollfds,
- sizeof(pollfds[0]) * pollfds_size);
- /* remove old pollfds */
- kfree(pollfds);
- }
- pollfds = tmp_pfd;
- pollfds_size++;
- } else
- kfree(tmp_pfd); /* remove not used tmp_pfd */
+int os_add_epoll_fd (int events, int fd, void * data) {
+ struct epoll_event event;
+ int result;
- pollfds[pollfds_num] = ((struct pollfd) { .fd = fd,
- .events = events,
- .revents = 0 });
- pollfds_num++;
-
- return 0;
+ event.data.ptr = data;
+ event.events = events | EPOLLET;
+ result = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event);
+ if ((result) && (errno == EEXIST)) {
+ result = os_mod_epoll_fd (events, fd, data);
+ }
+ if (result) {
+ printk("epollctl add err fd %d, %s\n", fd, strerror(errno));
+ }
+ return result;
}
-void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
- struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2)
-{
- struct irq_fd **prev;
- int i = 0;
-
- prev = &active_fds;
- while (*prev != NULL) {
- if ((*test)(*prev, arg)) {
- struct irq_fd *old_fd = *prev;
- if ((pollfds[i].fd != -1) &&
- (pollfds[i].fd != (*prev)->fd)) {
- printk(UM_KERN_ERR "os_free_irq_by_cb - "
- "mismatch between active_fds and "
- "pollfds, fd %d vs %d\n",
- (*prev)->fd, pollfds[i].fd);
- goto out;
- }
-
- pollfds_num--;
-
- /*
- * This moves the *whole* array after pollfds[i]
- * (though it doesn't spot as such)!
- */
- memmove(&pollfds[i], &pollfds[i + 1],
- (pollfds_num - i) * sizeof(pollfds[0]));
- if (*last_irq_ptr2 == &old_fd->next)
- *last_irq_ptr2 = prev;
-
- *prev = (*prev)->next;
- if (old_fd->type == IRQ_WRITE)
- ignore_sigio_fd(old_fd->fd);
- kfree(old_fd);
- continue;
- }
- prev = &(*prev)->next;
- i++;
+int os_mod_epoll_fd (int events, int fd, void * data) {
+ struct epoll_event event;
+ int result;
+ event.data.ptr = data;
+ event.events = events;
+ result = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event);
+ if (result) {
+ printk("epollctl mod err fd %d, %s\n", fd, strerror(errno));
}
- out:
- return;
+ return result;
}
-int os_get_pollfd(int i)
-{
- return pollfds[i].fd;
+int os_del_epoll_fd (int fd) {
+ struct epoll_event event;
+ int result;
+ result = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event);
+ if (result) {
+ printk("epollctl del err %s\n", strerror(errno));
+ }
+ return result;
}
-void os_set_pollfd(int i, int fd)
+void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
+ struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2)
{
- pollfds[i].fd = fd;
+ printk("Someone invoking obsolete deactivate_by_CB!!!\n");
+ return;
}
void os_set_ioignore(void)
--
1.7.10.4
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread* [uml-devel] [PATCH v3 02/10] Remove unnecessary 'reactivate' statements
2014-09-04 19:00 [uml-devel] UML Performance improvement patchset anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 01/10] Epoll based interrupt controller anton.ivanov
@ 2014-09-04 19:00 ` anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 03/10] High performance networking subsystem anton.ivanov
` (8 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: anton.ivanov @ 2014-09-04 19:00 UTC (permalink / raw)
To: user-mode-linux-devel
From: Anton Ivanov <antivano@cisco.com>
The epoll based controller has real (not emulated) edge and
level semantics, and the edge/level behaviour is handled by epoll.
There is no toggling of the poll set any more, thus the
reactivate_fd() calls are removed throughout the drivers.
Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
arch/um/drivers/chan_kern.c | 2 --
arch/um/drivers/line.c | 2 --
arch/um/drivers/mconsole_kern.c | 2 --
arch/um/drivers/net_kern.c | 2 --
arch/um/drivers/port_kern.c | 1 -
arch/um/drivers/random.c | 1 -
arch/um/drivers/ubd_kern.c | 1 -
7 files changed, 11 deletions(-)
diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c
index acbe6c6..db0ff51 100644
--- a/arch/um/drivers/chan_kern.c
+++ b/arch/um/drivers/chan_kern.c
@@ -564,8 +564,6 @@ void chan_interrupt(struct line *line, int irq)
tty_insert_flip_char(port, c, TTY_NORMAL);
} while (err > 0);
- if (err == 0)
- reactivate_fd(chan->fd, irq);
if (err == -EIO) {
if (chan->primary) {
tty_port_tty_hangup(&line->port, false);
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 6c4511f..1e8df84 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -663,8 +663,6 @@ static irqreturn_t winch_interrupt(int irq, void *data)
tty_kref_put(tty);
}
out:
- if (winch->fd != -1)
- reactivate_fd(winch->fd, WINCH_IRQ);
return IRQ_HANDLED;
}
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 3df3bd5..2b9bfa7 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -95,7 +95,6 @@ static irqreturn_t mconsole_interrupt(int irq, void *dev_id)
}
if (!list_empty(&mc_requests))
schedule_work(&mconsole_work);
- reactivate_fd(fd, MCONSOLE_IRQ);
return IRQ_HANDLED;
}
@@ -243,7 +242,6 @@ void mconsole_stop(struct mc_request *req)
(*req->cmd->handler)(req);
}
os_set_fd_block(req->originating_fd, 0);
- reactivate_fd(req->originating_fd, MCONSOLE_IRQ);
mconsole_reply(req, "", 0, 0);
}
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 39f1862..64d8426 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -137,8 +137,6 @@ static irqreturn_t uml_net_interrupt(int irq, void *dev_id)
schedule_work(&lp->work);
goto out;
}
- reactivate_fd(lp->fd, UM_ETH_IRQ);
-
out:
spin_unlock(&lp->lock);
return IRQ_HANDLED;
diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c
index 40ca5cc..b0e9ff3 100644
--- a/arch/um/drivers/port_kern.c
+++ b/arch/um/drivers/port_kern.c
@@ -137,7 +137,6 @@ static void port_work_proc(struct work_struct *unused)
if (!port->has_connection)
continue;
- reactivate_fd(port->fd, ACCEPT_IRQ);
while (port_accept(port))
;
port->has_connection = 0;
diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index 9e3a722..ec3d788 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -72,7 +72,6 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size,
return ret ? : -EAGAIN;
atomic_inc(&host_sleep_count);
- reactivate_fd(random_fd, RANDOM_IRQ);
add_sigio_fd(random_fd);
add_wait_queue(&host_read_wait, &wait);
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 3716e69..1cc72ae5 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -466,7 +466,6 @@ static void ubd_handler(void)
blk_end_request(req->req, 0, req->length);
kfree(req);
}
- reactivate_fd(thread_fd, UBD_IRQ);
list_for_each_safe(list, next_ele, &restart){
ubd = container_of(list, struct ubd, restart);
--
1.7.10.4
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread* [uml-devel] [PATCH v3 03/10] High performance networking subsystem
2014-09-04 19:00 [uml-devel] UML Performance improvement patchset anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 01/10] Epoll based interrupt controller anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 02/10] Remove unnecessary 'reactivate' statements anton.ivanov
@ 2014-09-04 19:00 ` anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 04/10] L2TPv3 Transport Driver for UML anton.ivanov
` (7 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: anton.ivanov @ 2014-09-04 19:00 UTC (permalink / raw)
To: user-mode-linux-devel
From: Anton Ivanov <antivano@cisco.com>
Support for multi-packet vector IO - multiple packets
read in one syscall and (optionally) written in one syscall.
Support for (optional) queueing on EAGAIN/ENOBUFS - applies
only to socket transports. Sorry TAP, -EYOULOSE - it will remain
slower than any socket transport for a very long time because
sendmmsg/recvmmsg is supported only for sockets, not for tap fds.
Should work with legacy UML, but thoroughly tested only with the
epoll based IRQ controller
Minimal host kernel version for RX - 2.6.32
Minimal host kernel version for TX - 3.0 - optional, config
option UML_NET_VECTOR_TX
Tested on Debian 7.0/Ubuntu 12.x LTS host which have the relevant
syscalls, but do not have the appropriate glibc routine for TX
(this is why it is a direct syscall).
Tested thoroughly with Debian and OpenWRT guests across a range of
kernels (3.2, 3.3, 3.4, 3.8, 3.12).
Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
arch/um/Kconfig.net | 9 ++
arch/um/drivers/Makefile | 2 +-
arch/um/drivers/net_extra_kern.c | 308 +++++++++++++++++++++++++++++++++++
arch/um/drivers/net_extra_user.c | 317 +++++++++++++++++++++++++++++++++++++
arch/um/drivers/net_kern.c | 63 +++++---
arch/um/include/asm/irq.h | 26 +--
arch/um/include/shared/net_kern.h | 31 ++++
arch/um/include/shared/net_user.h | 24 +++
arch/um/kernel/irq.c | 5 +
9 files changed, 752 insertions(+), 33 deletions(-)
create mode 100644 arch/um/drivers/net_extra_kern.c
create mode 100644 arch/um/drivers/net_extra_user.c
diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net
index 820a56f..e4a7cf2 100644
--- a/arch/um/Kconfig.net
+++ b/arch/um/Kconfig.net
@@ -21,6 +21,15 @@ config UML_NET
enable at least one of the following transport options to actually
make use of UML networking.
+config UML_NET_VECTOR_TX
+ bool "Vector transmit in network devices"
+ depends on UML_NET
+ help
+ Accelerate network IO by using sendmmsg() linux syscall. This option
+ requires the host running UML to run at least linux 3.0
+ Presently the acceleration is only for forwarding including firewall,
+ NAT, etc where it yields 25%+ improvement in packet rates and throughput
+
config UML_NET_ETHERTAP
bool "Ethertap transport"
depends on UML_NET
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index e7582e1..836baaf 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -10,7 +10,7 @@ slip-objs := slip_kern.o slip_user.o
slirp-objs := slirp_kern.o slirp_user.o
daemon-objs := daemon_kern.o daemon_user.o
umcast-objs := umcast_kern.o umcast_user.o
-net-objs := net_kern.o net_user.o
+net-objs := net_kern.o net_user.o net_extra_user.o net_extra_kern.o
mconsole-objs := mconsole_kern.o mconsole_user.o
hostaudio-objs := hostaudio_kern.o
ubd-objs := ubd_kern.o ubd_user.o
diff --git a/arch/um/drivers/net_extra_kern.c b/arch/um/drivers/net_extra_kern.c
new file mode 100644
index 0000000..5ee6f9b
--- /dev/null
+++ b/arch/um/drivers/net_extra_kern.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include <linux/bootmem.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/platform_device.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include "init.h"
+#include "irq_kern.h"
+#include "irq_user.h"
+#include "mconsole_kern.h"
+#include "net_kern.h"
+#include "net_user.h"
+
+#define DRIVER_NAME "uml-netdev"
+
+/*
+ These are wrappers around key kernel side functions so we can
+ invoke them from the user side of our schizophrenic self
+
+*/
+
+extern spinlock_t uml_sigio_lock;
+extern int in_epoll_loop;
+
+static DEFINE_SPINLOCK(net_queue_list);
+
+static struct mmsg_queue_info * pending_queue = NULL;
+
+void uml_net_destroy_skb(void * skb)
+{
+ if (skb) {
+ kfree_skb((struct sk_buff *) skb);
+ }
+}
+
+void * uml_net_build_skb (void * dev)
+{
+ struct uml_net_private *lp = netdev_priv((struct net_device *) dev);
+ struct sk_buff * skb;
+
+ skb = dev_alloc_skb(lp->max_packet + 32);
+ if (skb) {
+ /* add some tunneling space just in case, we usually do not need it as we use vector IO */
+ skb_reserve(skb,32);
+ skb->dev = dev;
+ skb_put(skb, lp->max_packet);
+ skb_reset_mac_header(skb);
+ skb->ip_summed = CHECKSUM_NONE;
+ } else {
+ printk("Failed Atomic SKB Allocation, will drop\n");
+ }
+ return skb;
+}
+
+void * uml_net_skb_data (void * skb) {
+ if (skb) {
+ return ((struct sk_buff *) skb)->data;
+ } else {
+ printk("hole in vector!!!\n");
+ return NULL;
+ }
+}
+
+int uml_net_advance_head( struct mmsg_queue_info * queue_info, int advance)
+{
+ int queue_depth;
+ queue_info->head =
+ (queue_info->head + advance)
+ % queue_info->max_depth;
+
+ /* caller is already holding the head_lock */
+
+ spin_lock(&queue_info->tail_lock);
+ queue_info->queue_depth -= advance;
+
+ /* we are at 0, use this to
+ * reset head and tail so we can use max size vectors
+ */
+ if (queue_info->queue_depth == 0) {
+ queue_info->head = 0;
+ queue_info->tail = 0;
+ }
+ queue_depth = queue_info->queue_depth;
+ spin_unlock(&queue_info->tail_lock);
+ return queue_depth;
+}
+
+/*
+* This is called by enqueuers which should hold the
+* tail lock already
+*/
+
+int uml_net_advance_tail( struct mmsg_queue_info * queue_info, int advance)
+{
+ int queue_depth;
+ queue_info->tail =
+ (queue_info->tail + advance)
+ % queue_info->max_depth;
+ spin_lock(&queue_info->head_lock);
+ queue_info->queue_depth += advance;
+ queue_depth = queue_info->queue_depth;
+ spin_unlock(&queue_info->head_lock);
+ return queue_depth;
+}
+
+/*
+* Generic vector enqueue with support for forming headers using transport
+* specific callback. Allows GRE, L2TPv3, RAW (and potentially when ported)
+* daemon to use a common enqueue procedure in vector mode
+*/
+
+int uml_net_enqueue (
+ struct mmsg_queue_info * queue_info,
+ struct sk_buff * skb,
+ struct uml_net_private *lp,
+ void (*form_header)(void * header, struct sk_buff * skb, struct uml_net_private * lp),
+ void * remote_addr,
+ int remote_addr_size)
+{
+
+ int queue_depth;
+ struct sk_buff * mmsg_clone;
+ struct mmsghdr * mmsg_send_vector;
+ void ** skb_send_vector;
+ struct iovec * iov;
+
+ if (!queue_info) {
+ /* someone passed us a NULL queue */
+ return 0;
+ }
+
+ spin_lock(&queue_info->tail_lock);
+ spin_lock(&queue_info->head_lock);
+ queue_depth = queue_info->queue_depth;
+ spin_unlock(&queue_info->head_lock);
+
+ if (queue_depth < queue_info->max_depth) {
+ mmsg_clone = skb_clone(skb, GFP_ATOMIC);
+ if (mmsg_clone) {
+
+ skb_send_vector = queue_info->skb_send_vector;
+ skb_send_vector += queue_info->tail;
+
+ (* skb_send_vector) = mmsg_clone;
+
+ mmsg_send_vector = queue_info->mmsg_send_vector;
+ mmsg_send_vector += queue_info->tail;
+
+ iov = mmsg_send_vector->msg_hdr.msg_iov;
+
+ if (iov) {
+ mmsg_send_vector->msg_hdr.msg_name = remote_addr;
+ mmsg_send_vector->msg_hdr.msg_namelen = remote_addr_size;
+ if (form_header != NULL) {
+ (* form_header)(iov->iov_base, skb, lp);
+ iov++;
+ }
+ iov->iov_base = skb->data;
+ iov->iov_len = skb->len;
+
+ queue_depth = uml_net_advance_tail(queue_info, 1);
+ } else {
+ printk("no iov, cannot enqueue\n");
+ }
+ } else {
+ printk("cloning failed\n");
+ }
+ }
+ spin_unlock(&queue_info->tail_lock);
+ return queue_depth;
+}
+
+static int send_mmsg_queue(struct mmsg_queue_info * queue_info, int queue_depth)
+{
+ int fd = queue_info->fd;
+ struct mmsghdr * send_from;
+ void ** skb_send_vector;
+ int result = 0, send_len, skb_index, allowed_drop = 0;
+
+ if (! queue_info) {
+ /* someone passed a null queue, should not occur */
+ return 0;
+ }
+
+ if (spin_trylock(&queue_info->head_lock)) {
+ if (spin_trylock(&queue_info->tail_lock)) {
+ /* update queue_depth to current value */
+ queue_depth = queue_info->queue_depth;
+ spin_unlock(&queue_info->tail_lock);
+ if (queue_depth > 0) {
+ send_len = queue_depth;
+ send_from = queue_info->mmsg_send_vector;
+ send_from += queue_info->head;
+ if (send_len + queue_info->head > queue_info->max_depth) {
+ send_len = queue_info->max_depth - queue_info->head;
+ }
+ if (send_len > 0) {
+ result = net_sendmmsg(
+ fd, send_from, send_len, 0
+ );
+ }
+ if (result < 0) {
+ printk("error %i in multisend\n", result);
+ result = send_len; /* drop the lot */
+ }
+ if (result > 0) {
+ if (result != send_len) {
+ /* we need to drop a few, exponentially increasing
+ * drop bucket in use
+ */
+ result += allowed_drop;
+ allowed_drop += allowed_drop * 2 + 1;
+ if (result > send_len) {
+ /* do not drop beyond requested size */
+ result = send_len;
+ }
+ } else {
+ /* clear drop bucket size */
+ allowed_drop = 0;
+ }
+ skb_send_vector = queue_info->skb_send_vector;
+ skb_send_vector += queue_info->head;
+ for (skb_index = 0; skb_index < send_len; skb_index++) {
+ uml_net_destroy_skb(* skb_send_vector);
+ (* skb_send_vector) = NULL; /* just in case */
+ skb_send_vector ++ ;
+ }
+ queue_depth = uml_net_advance_head(queue_info, result);
+ }
+ }
+ }
+ spin_unlock(&queue_info->head_lock);
+ }
+ return queue_depth;
+}
+
+int uml_net_flush_mmsg_queue(
+ struct mmsg_queue_info * queue_info, int queue_depth)
+{
+ int old_queue_depth;
+
+ if (queue_depth >= (queue_info->max_depth - 1)) {
+ /* queue full, flush some regardless */
+ queue_depth = send_mmsg_queue(queue_info, queue_depth);
+ }
+ if ((queue_depth > 0) && (spin_trylock(&uml_sigio_lock))) {
+ /* unconditional flush, non zero queue - not in epoll loop so not forwarding */
+ if (!(in_epoll_loop)) {
+ while (queue_depth > 0) {
+ queue_depth = send_mmsg_queue(queue_info, queue_depth);
+ }
+ }
+ spin_unlock(&uml_sigio_lock);
+ }
+
+ /* we are forwarding (most likely) - check if there is a pending queue, if there is a
+ * pending queue, flush it, then put the current queue as pending
+ */
+
+ spin_lock(&net_queue_list);
+ if ((pending_queue) && (pending_queue != queue_info)) {
+ old_queue_depth = send_mmsg_queue(pending_queue, 1);
+ while (old_queue_depth > 0) {
+ old_queue_depth =
+ send_mmsg_queue(pending_queue, old_queue_depth);
+ }
+ }
+ if (queue_depth) {
+ pending_queue = queue_info;
+ } else {
+ pending_queue = NULL;
+ }
+ spin_unlock(&net_queue_list);
+
+ return queue_depth;
+}
+
+/*
+* this is invoked out of the IRQ IO event loop to flush pending
+* packets on "current" interface
+*/
+
+void flush_pending_netio(void) {
+ int result;
+ spin_lock(&net_queue_list);
+ if (pending_queue) {
+ do {
+ result = send_mmsg_queue(pending_queue, 1);
+ } while (result > 0);
+ }
+ pending_queue = NULL;
+ spin_unlock(&net_queue_list);
+}
diff --git a/arch/um/drivers/net_extra_user.c b/arch/um/drivers/net_extra_user.c
new file mode 100644
index 0000000..1037899
--- /dev/null
+++ b/arch/um/drivers/net_extra_user.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Licensed under the GPL
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/wait.h>
+#include <asm/unistd.h>
+#include "net_user.h"
+#include "os.h"
+#include "um_malloc.h"
+
+/*
+* Principles of operation:
+*
+* EVERYTHING here is built to tolerate a failed memory allocation.
+* If either a header buffer or a data buffer (taken from skb->data)
+* is NULL the read will fail and the packet will be dropped. This
+* is the normal behaviour of recvmsg and recvmmsg functions - if a
+* particular iov_base == NULL and its corresponding iov_len is
+* 0 we truncate and/or drop the packet altogether.
+*
+* On the negative side this means that we have to do a few more
+* checks for NULL here and there. On the positive side this means
+* that the whole thing is more robust including under low
+* memory conditions.
+*
+* There is one special case which we need to handle as a result of
+* this - any header verification functions should return "broken
+* header" on hitting a NULL. This will in turn invoke the applicable
+* packet drop logic.
+*
+* Any changes should follow this overall design.
+*
+* Side effect - none of these need to use the shared (and mutexed)
+* drop skb. This is surplus to reqs, the normal recvm(m)msg drop
+* mechanics will drop it.
+*/
+
+int net_readv(int fd, void *iov, int iovcnt)
+{
+ int n;
+
+ CATCH_EINTR(n = readv(fd, iov, iovcnt));
+ if ((n < 0) && (errno == EAGAIN))
+ return 0;
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+int net_recvfrom2(int fd, void *buf, int len, void *src_addr, int *addrlen)
+{
+ int n;
+
+ CATCH_EINTR(n = recvfrom(fd, buf, len, 0, src_addr, addrlen));
+ if (n < 0) {
+ if (errno == EAGAIN)
+ return 0;
+ return -errno;
+ }
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+int net_writev(int fd, void *iov, int iovcnt)
+{
+ int n;
+
+ CATCH_EINTR(n = writev(fd, iov, iovcnt));
+
+ if ((n < 0) && ((errno == EAGAIN) || (errno == ENOBUFS)))
+ return 0;
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+int net_sendmessage(int fd, void *msg, int flags)
+{
+ int n;
+
+ CATCH_EINTR(n = sendmsg(fd, msg, flags));
+ if (n < 0) {
+ if ((errno == EAGAIN) || (errno == ENOBUFS))
+ return 0;
+ return -errno;
+ }
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+int net_recvmessage(int fd, void *msg, int flags)
+{
+ int n;
+
+ CATCH_EINTR(n = recvmsg(fd, msg, flags));
+ if (n < 0) {
+ if (errno == EAGAIN)
+ return 0;
+ return -errno;
+ }
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+int net_recvmmsg(int fd, void *msgvec, unsigned int vlen,
+ unsigned int flags, struct timespec *timeout)
+{
+ int n;
+
+ CATCH_EINTR(n = recvmmsg(fd, msgvec, vlen, flags, timeout));
+ if (n < 0) {
+ if (errno == EAGAIN)
+ return 0;
+ return -errno;
+ }
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+int net_sendmmsg(int fd, void *msgvec, unsigned int vlen,
+ unsigned int flags)
+{
+ int n;
+
+#ifdef HAS_SENDMMSG
+
+ /* has proper sendmmsg */
+
+ CATCH_EINTR(n = sendmmsg(fd, msgvec, vlen, flags));
+#else
+
+ /* no glibc wrapper for sendmmsg - Ubuntu LTS 12.04, Debian 7.x */
+
+ CATCH_EINTR(n = syscall(__NR_sendmmsg, fd, msgvec, vlen, flags));
+#endif
+ if (n < 0) {
+ if ((errno == EAGAIN) || (errno == ENOBUFS))
+ return 0;
+ return -errno;
+ }
+ else if (n == 0)
+ return -ENOTCONN;
+ return n;
+}
+
+void destroy_skb_vector(void ** vector, int size)
+{
+ int i;
+ void ** tofree = vector;
+
+ for (i=0;i<size;i++) {
+ if ( * vector) {
+ uml_net_destroy_skb(* vector);
+ }
+ vector ++;
+ }
+ kfree(tofree);
+}
+
+void destroy_mmsg_vector(void * mmsgvector, int size, int free_iov_base)
+{
+ struct mmsghdr * vector = (struct mmsghdr *) mmsgvector;
+ struct iovec * iov;
+ int i;
+ if (vector) {
+ for (i = 0; i < size; i++) {
+ iov = vector->msg_hdr.msg_iov;
+ if (iov) {
+ if (free_iov_base) {
+ kfree(iov->iov_base);
+ }
+ kfree(iov);
+ }
+ vector ++;
+ }
+ kfree(mmsgvector);
+ } else {
+ printk("NULL mmsg vector in destroy, should not occur\n");
+ }
+}
+
+void * build_skbuf_vector(int size, void * dev)
+{
+ int i;
+ void **result, **vector;
+ result = uml_kmalloc(size * sizeof(void *), UM_GFP_KERNEL);
+ vector = result;
+ if (vector) {
+ for (i = 0; i < size; i++) {
+ * vector = uml_net_build_skb(dev);
+ vector++;
+ }
+ }
+ return result;
+}
+
+void rebuild_skbuf_vector(void ** skbvec, int size, void * dev)
+{
+ int i;
+ if (skbvec) {
+ for (i = 0; i < size; i++) {
+ * skbvec = uml_net_build_skb(dev);
+ skbvec++;
+ }
+ }
+}
+
+void repair_mmsg (void *vec, int iovsize, int header_size)
+{
+ struct mmsghdr * msgvec = (struct mmsghdr *) vec;
+ struct iovec * iov;
+ if (! msgvec->msg_hdr.msg_iov) {
+ msgvec->msg_hdr.msg_iov = uml_kmalloc(sizeof(struct iovec) * iovsize, UM_GFP_KERNEL);
+ }
+ iov = msgvec->msg_hdr.msg_iov;
+ if (iov) {
+ if (! iov->iov_base) {
+ iov->iov_base=uml_kmalloc(header_size, UM_GFP_KERNEL);
+ }
+ if (iov->iov_base) {
+ /* put correct header size just in case - we may have had a short frame */
+ iov->iov_len = header_size;
+ } else {
+ printk("failed to allocate a header buffer, will cause a packet drop later\n");
+ iov->iov_len = 0;
+ }
+ }
+}
+
+void * build_mmsg_vector(int size, int iovsize)
+{
+ int i;
+ struct mmsghdr *msgvec, *result;
+ struct iovec * iov;
+
+ result = uml_kmalloc(sizeof(struct mmsghdr) * size, UM_GFP_KERNEL);
+ msgvec = result;
+ if (msgvec) {
+ memset(msgvec, '\0', sizeof(struct mmsghdr) * size);
+ for ( i = 0; i < size; i++) {
+ iov = uml_kmalloc(sizeof(struct iovec) * iovsize, UM_GFP_KERNEL);
+ msgvec->msg_hdr.msg_iov=iov;
+ if (iov) {
+ memset(iov, '\0', sizeof(struct iovec) * iovsize);
+ msgvec->msg_hdr.msg_iovlen=iovsize;
+ } else {
+ printk("failed to allocate iov\n");
+ msgvec->msg_hdr.msg_iovlen=0; /* silent drop on receive, no xmit */
+ }
+ msgvec++;
+ }
+ }
+ return result;
+}
+
+void add_header_buffers(void * msgvec, int size, int header_size)
+{
+ int i;
+ struct iovec * iov;
+ struct mmsghdr * mmsgvec = (struct mmsghdr *) msgvec;
+ for ( i = 0; i < size; i++) {
+ iov = mmsgvec->msg_hdr.msg_iov;
+ if (iov) {
+ iov->iov_base=uml_kmalloc(header_size, UM_GFP_KERNEL);
+ if (iov->iov_base) {
+ iov->iov_len = header_size;
+ } else {
+ printk("failed to allocate a header buffer, will cause a packet drop later\n");
+ iov->iov_len = 0;
+ }
+ }
+ mmsgvec++;
+ }
+}
+
+/* NOTE - this is only for offset = 0 or 1, other cases are unhandled!!! */
+
+void add_skbuffs(void * msgvec, void ** skbvec, int size, int skb_size, int offset) {
+ int i;
+ struct iovec * iov;
+ struct mmsghdr * mmsgvec = (struct mmsghdr *) msgvec;
+ for ( i = 0; i < size; i++) {
+ /*
+ This heavily relies on all IOVs being present, if the initial allocation
+ fails it must clean up and switch to "normal" per-packet receive instead
+ Later allocations of skbufs can fail - this will result in short reads
+ and skips
+
+ */
+ iov = mmsgvec->msg_hdr.msg_iov;
+ if (iov) {
+ iov += offset;
+ iov->iov_base=uml_net_skb_data(* skbvec);
+ if (iov->iov_base) {
+ iov->iov_len = skb_size;
+ } else {
+ printk("NULL SKB will drop\n");
+ iov->iov_len = 0;
+ }
+ } else {
+ printk("NULL IOV will drop\n");
+ }
+ mmsgvec++;
+ skbvec++;
+ }
+}
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 64d8426..2889804 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
* James Leu (jleu@mindspring.net).
@@ -29,6 +30,7 @@
static DEFINE_SPINLOCK(opened_lock);
static LIST_HEAD(opened);
+static int rr_counter = 0;
/*
* The drop_skb is used when we can't allocate an skb. The
@@ -42,6 +44,7 @@ static DEFINE_SPINLOCK(drop_lock);
static struct sk_buff *drop_skb;
static int drop_max;
+
static int update_drop_skb(int max)
{
struct sk_buff *new;
@@ -77,24 +80,38 @@ static int uml_net_rx(struct net_device *dev)
struct sk_buff *skb;
/* If we can't allocate memory, try again next round. */
- skb = dev_alloc_skb(lp->max_packet);
- if (skb == NULL) {
- drop_skb->dev = dev;
- /* Read a packet into drop_skb and don't do anything with it. */
- (*lp->read)(lp->fd, drop_skb, lp);
- dev->stats.rx_dropped++;
+ if (lp->options & UML_NET_USE_SKB_READ) {
+ /* we expect a full formed, well behaved skb from zero copy drivers here */
+ skb = (*lp->skb_read)(lp);
+ if (skb == NULL) {
return 0;
- }
-
- skb->dev = dev;
- skb_put(skb, lp->max_packet);
- skb_reset_mac_header(skb);
- pkt_len = (*lp->read)(lp->fd, skb, lp);
-
- if (pkt_len > 0) {
+ }
+ pkt_len = skb->len;
+ } else {
+ skb = dev_alloc_skb(lp->max_packet + 32);
+ if (skb == NULL) {
+ drop_skb->dev = dev;
+ /* Read a packet into drop_skb and don't do anything with it. */
+ (*lp->read)(lp->fd, drop_skb, lp);
+ dev->stats.rx_dropped++;
+ return 0;
+ }
+
+ skb_reserve(skb,32);
+ skb->dev = dev;
+ skb_put(skb, lp->max_packet);
+ skb_reset_mac_header(skb);
+
+ // Mark that virtual devices cannot provide required checksum.
+ skb->ip_summed = CHECKSUM_NONE;
+ pkt_len = (*lp->read)(lp->fd, skb, lp);
+ if (pkt_len > 0) {
skb_trim(skb, pkt_len);
skb->protocol = (*lp->protocol)(skb);
+ }
+ }
+ if (pkt_len > 0) {
dev->stats.rx_bytes += skb->len;
dev->stats.rx_packets++;
netif_rx(skb);
@@ -192,8 +209,9 @@ static int uml_net_close(struct net_device *dev)
struct uml_net_private *lp = netdev_priv(dev);
netif_stop_queue(dev);
+ deactivate_fd(lp->fd, dev->irq);
- um_free_irq(dev->irq, dev);
+ free_irq(dev->irq, dev);
if (lp->close != NULL)
(*lp->close)(lp->fd, &lp->user);
lp->fd = -1;
@@ -216,7 +234,6 @@ static int uml_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
spin_lock_irqsave(&lp->lock, flags);
len = (*lp->write)(lp->fd, skb, lp);
- skb_tx_timestamp(skb);
if (len == skb->len) {
dev->stats.tx_packets++;
@@ -273,14 +290,13 @@ static void uml_net_poll_controller(struct net_device *dev)
static void uml_net_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *info)
{
- strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
- strlcpy(info->version, "42", sizeof(info->version));
+ strcpy(info->driver, DRIVER_NAME);
+ strcpy(info->version, "42");
}
static const struct ethtool_ops uml_net_ethtool_ops = {
.get_drvinfo = uml_net_get_drvinfo,
.get_link = ethtool_op_get_link,
- .get_ts_info = ethtool_op_get_ts_info,
};
static void uml_net_user_timer_expire(unsigned long _conn)
@@ -447,6 +463,7 @@ static void eth_configure(int n, void *init, char *mac,
* These just fill in a data structure, so there's no failure
* to be worried about.
*/
+ dev->ethtool_ops = &uml_net_ethtool_ops;
(*transport->kern->init)(dev, init);
*lp = ((struct uml_net_private)
@@ -459,7 +476,9 @@ static void eth_configure(int n, void *init, char *mac,
.open = transport->user->open,
.close = transport->user->close,
.remove = transport->user->remove,
+ .options = transport->kern->options,
.read = transport->kern->read,
+ .skb_read = transport->kern->skb_read,
.write = transport->kern->write,
.add_address = transport->user->add_address,
.delete_address = transport->user->delete_address });
@@ -475,9 +494,9 @@ static void eth_configure(int n, void *init, char *mac,
dev->mtu = transport->user->mtu;
dev->netdev_ops = &uml_netdev_ops;
- dev->ethtool_ops = &uml_net_ethtool_ops;
dev->watchdog_timeo = (HZ >> 1);
- dev->irq = UM_ETH_IRQ;
+ dev->irq = UM_ETH_BASE_IRQ + (rr_counter % UM_ETH_IRQ_RR);
+ rr_counter++;
err = update_drop_skb(lp->max_packet);
if (err)
@@ -829,7 +848,7 @@ static void close_devices(void)
spin_lock(&opened_lock);
list_for_each(ele, &opened) {
lp = list_entry(ele, struct uml_net_private, list);
- um_free_irq(lp->dev->irq, lp->dev);
+ free_irq(lp->dev->irq, lp->dev);
if ((lp->close != NULL) && (lp->fd >= 0))
(*lp->close)(lp->fd, &lp->user);
if (lp->remove != NULL)
diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h
index 4a2037f..be9128b 100644
--- a/arch/um/include/asm/irq.h
+++ b/arch/um/include/asm/irq.h
@@ -1,21 +1,27 @@
+
#ifndef __UM_IRQ_H
#define __UM_IRQ_H
+#define UM_ETH_IRQ_RR 32
+
#define TIMER_IRQ 0
#define UMN_IRQ 1
#define CONSOLE_IRQ 2
#define CONSOLE_WRITE_IRQ 3
#define UBD_IRQ 4
-#define UM_ETH_IRQ 5
-#define SSL_IRQ 6
-#define SSL_WRITE_IRQ 7
-#define ACCEPT_IRQ 8
-#define MCONSOLE_IRQ 9
-#define WINCH_IRQ 10
-#define SIGIO_WRITE_IRQ 11
-#define TELNETD_IRQ 12
-#define XTERM_IRQ 13
-#define RANDOM_IRQ 14
+#define UM_ETH_BASE_IRQ 5
+
+#define UM_END_ETH_IRQ UM_ETH_BASE_IRQ + UM_ETH_IRQ_RR
+
+#define SSL_IRQ UM_END_ETH_IRQ + 1
+#define SSL_WRITE_IRQ UM_END_ETH_IRQ + 2
+#define ACCEPT_IRQ UM_END_ETH_IRQ + 3
+#define MCONSOLE_IRQ UM_END_ETH_IRQ + 4
+#define WINCH_IRQ UM_END_ETH_IRQ + 5
+#define SIGIO_WRITE_IRQ UM_END_ETH_IRQ + 6
+#define TELNETD_IRQ UM_END_ETH_IRQ + 7
+#define XTERM_IRQ UM_END_ETH_IRQ + 8
+#define RANDOM_IRQ UM_END_ETH_IRQ + 9
#define LAST_IRQ RANDOM_IRQ
#define NR_IRQS (LAST_IRQ + 1)
diff --git a/arch/um/include/shared/net_kern.h b/arch/um/include/shared/net_kern.h
index 012ac87..1e64658 100644
--- a/arch/um/include/shared/net_kern.h
+++ b/arch/um/include/shared/net_kern.h
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2002 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -13,6 +14,8 @@
#include <linux/list.h>
#include <linux/workqueue.h>
+#define UML_NET_USE_SKB_READ 1
+
struct uml_net {
struct list_head list;
struct net_device *dev;
@@ -28,6 +31,7 @@ struct uml_net_private {
struct work_struct work;
int fd;
+ unsigned int options;
unsigned char mac[ETH_ALEN];
int max_packet;
unsigned short (*protocol)(struct sk_buff *);
@@ -36,6 +40,7 @@ struct uml_net_private {
void (*remove)(void *);
int (*read)(int, struct sk_buff *skb, struct uml_net_private *);
int (*write)(int, struct sk_buff *skb, struct uml_net_private *);
+ struct sk_buff * (*skb_read)(struct uml_net_private *);
void (*add_address)(unsigned char *, unsigned char *, void *);
void (*delete_address)(unsigned char *, unsigned char *, void *);
@@ -47,6 +52,8 @@ struct net_kern_info {
unsigned short (*protocol)(struct sk_buff *);
int (*read)(int, struct sk_buff *skb, struct uml_net_private *);
int (*write)(int, struct sk_buff *skb, struct uml_net_private *);
+ struct sk_buff * (*skb_read)(struct uml_net_private *);
+ unsigned int options;
};
struct transport {
@@ -59,11 +66,35 @@ struct transport {
const int setup_size;
};
+struct mmsg_queue_info {
+ int fd;
+ struct mmsghdr * mmsg_send_vector;
+ void ** skb_send_vector;
+ int queue_depth, head, tail, max_depth;
+ spinlock_t head_lock;
+ spinlock_t tail_lock;
+};
+
extern struct net_device *ether_init(int);
extern unsigned short ether_protocol(struct sk_buff *);
extern int tap_setup_common(char *str, char *type, char **dev_name,
char **mac_out, char **gate_addr);
extern void register_transport(struct transport *new);
extern unsigned short eth_protocol(struct sk_buff *skb);
+extern struct sk_buff *my_build_skb(void * head, void *data, unsigned int frag_size);
+
+extern void flush_pending_netio(void);
+
+extern int uml_net_advance_tail( struct mmsg_queue_info * queue_info, int advance);
+extern int uml_net_advance_head( struct mmsg_queue_info * queue_info, int advance);
+extern int uml_net_flush_mmsg_queue(struct mmsg_queue_info * queue_info, int queue_depth);
+
+extern int uml_net_enqueue (
+ struct mmsg_queue_info * queue_info,
+ struct sk_buff * skb,
+ struct uml_net_private *lp,
+ void (*form_header)(void * header, struct sk_buff * skb, struct uml_net_private * lp),
+ void * remote_addr,
+ int remote_addr_size);
#endif
diff --git a/arch/um/include/shared/net_user.h b/arch/um/include/shared/net_user.h
index 3dabbe1..4b46f37 100644
--- a/arch/um/include/shared/net_user.h
+++ b/arch/um/include/shared/net_user.h
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -38,10 +39,15 @@ extern void tap_check_ips(char *gate_addr, unsigned char *eth_addr);
extern void read_output(int fd, char *output_out, int len);
extern int net_read(int fd, void *buf, int len);
+extern int net_readv(int fd, void *iov, int iovcnt);
extern int net_recvfrom(int fd, void *buf, int len);
+extern int net_recvfrom2(int fd, void *buf, int len, void *src_addr, int *addrlen);
extern int net_write(int fd, void *buf, int len);
+extern int net_writev(int fd, void *iov, int iovcnt);
extern int net_send(int fd, void *buf, int len);
extern int net_sendto(int fd, void *buf, int len, void *to, int sock_len);
+extern int net_sendmessage(int fd, void *msg, int flags);
+extern int net_recvmessage(int fd, void *msg, int flags);
extern void open_addr(unsigned char *addr, unsigned char *netmask, void *arg);
extern void close_addr(unsigned char *addr, unsigned char *netmask, void *arg);
@@ -50,4 +56,22 @@ extern char *split_if_spec(char *str, ...);
extern int dev_netmask(void *d, void *m);
+
+extern void uml_net_destroy_skb(void * skb);
+extern void * uml_net_build_skb (void * dev);
+extern void * uml_net_skb_data (void * skb);
+
+extern void add_skbuffs(void * msgvec, void ** skbvec, int size, int skb_size, int offset);
+extern void add_header_buffers(void * msgvec, int size, int header_size);
+extern void * build_mmsg_vector(int size, int iovsize);
+extern void rebuild_skbuf_vector(void ** skbvec, int size, void * dev);
+extern void * build_skbuf_vector(int size, void * dev);
+extern int net_recvmmsg(int fd, void *msgvec, unsigned int vlen,
+ unsigned int flags, struct timespec *timeout);
+extern int net_sendmmsg(int fd, void *msgvec, unsigned int vlen,
+ unsigned int flags);
+extern void repair_mmsg (void *msgvec, int iovsize, int header_size);
+extern void destroy_skb_vector(void ** vector, int size);
+extern void destroy_mmsg_vector(void * mmsgvector, int size, int free_iov_base);
+
#endif
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 2869160..a67a551 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -17,6 +17,7 @@
#include <as-layout.h>
#include <kern_util.h>
#include <os.h>
+#include <net_kern.h>
/*
* We are on the "kernel side" so we cannot pick up the sys/epoll.h
@@ -136,6 +137,10 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
spin_unlock_irqrestore(&uml_sigio_lock, flags);
}
+#ifdef CONFIG_UML_NET_VECTOR_TX
+ flush_pending_netio();
+#endif
+
/* This needs a better way - it slows down the event loop */
free_irqs();
--
1.7.10.4
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread* [uml-devel] [PATCH v3 04/10] L2TPv3 Transport Driver for UML
2014-09-04 19:00 [uml-devel] UML Performance improvement patchset anton.ivanov
` (2 preceding siblings ...)
2014-09-04 19:00 ` [uml-devel] [PATCH v3 03/10] High performance networking subsystem anton.ivanov
@ 2014-09-04 19:00 ` anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 05/10] GRE transport " anton.ivanov
` (6 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: anton.ivanov @ 2014-09-04 19:00 UTC (permalink / raw)
To: user-mode-linux-devel
From: Anton Ivanov <antivano@cisco.com>
This transport allows a UML to connect to another UML local
or remote, the Linux host or any other network device running
the industry standard Ethernet over L2TPv3 protocol as per
RFC 3931 (and successors).
The transport supports a common set of features with the kernel
implementation as well as the Cisco contributed L2TPv3 transport
for QEMU/KVM. In all cases this is static tunnels only, no L2TPv3
control plane.
Additionally, the transport supports the so-called "soft"
termination where it can listen for an incoming connection
which does not require the remote endpoint to be specified
at configuration time.
Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
arch/um/Kconfig.net | 10 +
arch/um/drivers/Makefile | 2 +
arch/um/drivers/uml_l2tpv3.h | 111 ++++++++++
arch/um/drivers/uml_l2tpv3_kern.c | 434 +++++++++++++++++++++++++++++++++++++
arch/um/drivers/uml_l2tpv3_user.c | 409 ++++++++++++++++++++++++++++++++++
5 files changed, 966 insertions(+)
create mode 100644 arch/um/drivers/uml_l2tpv3.h
create mode 100644 arch/um/drivers/uml_l2tpv3_kern.c
create mode 100644 arch/um/drivers/uml_l2tpv3_user.c
diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net
index e4a7cf2..d84a1ee 100644
--- a/arch/um/Kconfig.net
+++ b/arch/um/Kconfig.net
@@ -93,6 +93,16 @@ config UML_NET_SLIP
UMLs on a single host). You may choose more than one without
conflict. If you don't need UML networking, say N.
+config UML_NET_L2TPV3
+ bool "L2TPV3 transport"
+ depends on UML_NET
+ help
+ This User-Mode Linux network transport allows one or more running
+ UMLs on single or multiple hosts to communicate with each other,
+ the host as well as other remote or local network devices supporting
+ the industry standard Ethernet over L2TPv3 protocol as described in
+ the applicable RFCs
+
config UML_NET_DAEMON
bool "Daemon transport"
depends on UML_NET
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index 836baaf..e2dcd85 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -9,6 +9,7 @@
slip-objs := slip_kern.o slip_user.o
slirp-objs := slirp_kern.o slirp_user.o
daemon-objs := daemon_kern.o daemon_user.o
+uml_l2tpv3-objs := uml_l2tpv3_kern.o uml_l2tpv3_user.o
umcast-objs := umcast_kern.o umcast_user.o
net-objs := net_kern.o net_user.o net_extra_user.o net_extra_kern.o
mconsole-objs := mconsole_kern.o mconsole_user.o
@@ -43,6 +44,7 @@ obj-$(CONFIG_STDERR_CONSOLE) += stderr_console.o
obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o
obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o
obj-$(CONFIG_UML_NET_DAEMON) += daemon.o
+obj-$(CONFIG_UML_NET_L2TPV3) += uml_l2tpv3.o
obj-$(CONFIG_UML_NET_VDE) += vde.o
obj-$(CONFIG_UML_NET_MCAST) += umcast.o
obj-$(CONFIG_UML_NET_PCAP) += pcap.o
diff --git a/arch/um/drivers/uml_l2tpv3.h b/arch/um/drivers/uml_l2tpv3.h
new file mode 100644
index 0000000..6351590
--- /dev/null
+++ b/arch/um/drivers/uml_l2tpv3.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __UML_L2TPV3_H__
+#define __UML_L2TPV3_H__
+
+#include "net_user.h"
+
+
+#define NEW_MODE_IP_VERSION 1 /* on for v6, off for v4 */
+#define NEW_MODE_UDP 2 /* on for udp, off for raw ip */
+#define NEW_MODE_COOKIE 4 /* cookie present */
+#define NEW_MODE_COOKIE_SIZE 8 /* on for 64 bit */
+#define NEW_MODE_NO_COUNTER 16 /* draft-constan - no counter */
+
+#define L2TPV3_HEADER 16
+
+
+struct temphtonl {
+ uint32_t low;
+ uint32_t high;
+};
+
+
+struct uml_l2tpv3_data {
+
+ /* destination (if known) */
+
+ void *remote_addr;
+ int remote_addr_size;
+
+ /* passed to us by init */
+
+ char *remote_addr_string;
+ char *local_addr_string;
+ char *local_service;
+ char *remote_service;
+ char *local_session_string;
+ char *remote_session_string;
+
+ uint32_t local_session;
+ uint32_t remote_session;
+ char *rx_cookie_string;
+ char *tx_cookie_string;
+ uint64_t rx_cookie;
+ uint64_t tx_cookie;
+
+
+
+ int fd;
+ void *dev;
+
+ uint32_t uml_l2tpv3_flags;
+ uint32_t mode;
+ uint32_t new_mode; /* listening, sending, etc */
+ uint32_t counter;
+
+ /* Precomputed offsets */
+
+ uint32_t offset; /* main offset == header offset */
+ uint32_t cookie_offset;
+ uint32_t counter_offset;
+ uint32_t session_offset;
+
+ /* vector IO RX */
+
+ void ** skb_recv_vector;
+ void * mmsg_recv_vector;
+
+ /* high speed vector io data */
+
+ uint32_t vector_len;
+ uint32_t recv_index;
+ uint32_t recv_enqueued;
+
+ void ** skb_send_vector;
+ void * mmsg_send_vector;
+ void * send_queue_info;
+
+/* buffer to form headers in one-at-a time packet write mode */
+ uint8_t *network_buffer;
+
+ /* normally same as offset, add size of
+ * struct ipv4 header in ipv4 raw - API stupidities
+ */
+ uint32_t header_size;
+
+};
+
+
+extern const struct net_user_info uml_l2tpv3_user_info;
+
+extern int uml_l2tpv3_user_sendmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_l2tpv3_data *pri);
+
+extern int uml_l2tpv3_user_recvmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_l2tpv3_data *pri);
+
+/* initialize mutexes and queueing */
+
+extern void l2tpv3_complete_init(void * dev_id, int max_depth);
+
+/* flush queue and destroy kernel side structures */
+
+extern void l2tpv3_kern_destroy(struct uml_l2tpv3_data *pri);
+
+#define UML_L2TPV3_FLAG_TX_CHECKSUMS 0x00000001
+#define UML_L2TPV3_FLAG_RX_CHECKSUMS 0x00000002
+
+#endif
diff --git a/arch/um/drivers/uml_l2tpv3_kern.c b/arch/um/drivers/uml_l2tpv3_kern.c
new file mode 100644
index 0000000..c24b1b5
--- /dev/null
+++ b/arch/um/drivers/uml_l2tpv3_kern.c
@@ -0,0 +1,434 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include "linux/init.h"
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/ip.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net_kern.h>
+#include <irq_kern.h>
+#include <irq_user.h>
+#include "uml_l2tpv3.h"
+
+#define DRIVER_NAME "uml-l2tpv3"
+
+/*
+ * Raw, still unparsed configuration strings for one l2tpv3 interface.
+ * uml_l2tpv3_setup() fills this in from the interface specification and
+ * uml_l2tpv3_init() copies the pointers into struct uml_l2tpv3_data.
+ */
+struct uml_l2tpv3_init {
+ char *local_addr_string;
+ char *remote_addr_string;
+ char *local_service;
+ char *remote_service;
+ char *rx_cookie_string;
+ char *tx_cookie_string;
+ char *local_session_string;
+ char *remote_session_string;
+ char *mode_string; /* hexadecimal mode bits, parsed with "%x" in uml_l2tpv3_init() */
+};
+
+/*
+ * ethtool get_drvinfo callback: report driver name and version.
+ *
+ * The destination fields are fixed-size arrays inside struct
+ * ethtool_drvinfo, so the copies are bounded by the field size instead
+ * of relying on the source strings staying short.
+ */
+static void uml_l2tpv3_get_drvinfo(struct net_device *dev,
+				struct ethtool_drvinfo *info)
+{
+	strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
+	strlcpy(info->version, "42", sizeof(info->version));
+}
+
+
+/* ethtool operations for the l2tpv3 transport; installed by uml_l2tpv3_init() */
+static const struct ethtool_ops uml_l2tpv3_ethtool_ops =
+{
+ .get_drvinfo = uml_l2tpv3_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+};
+
+/*
+ * Kernel side initialisation of an l2tpv3 interface.
+ *
+ * dev  - the net_device being set up
+ * data - struct uml_l2tpv3_init filled in by uml_l2tpv3_setup()
+ *
+ * Copies the configuration strings into the transport private data,
+ * pre-parses the mode bits and installs the ethtool operations.  All
+ * other strings are parsed later by the userspace side init.
+ */
+static void uml_l2tpv3_init(struct net_device *dev, void *data)
+{
+	struct uml_net_private *pri;
+	struct uml_l2tpv3_data *dpri;
+	struct uml_l2tpv3_init *init = data;
+
+	pri = netdev_priv(dev);
+	dpri = (struct uml_l2tpv3_data *) pri->user;
+
+	/*
+	 * these are as is, we keep them for future reference
+	 * and parse them in userspace
+	 */
+
+	dpri->local_addr_string = init->local_addr_string;
+	dpri->remote_addr_string = init->remote_addr_string;
+	dpri->local_service = init->local_service;
+	dpri->remote_service = init->remote_service;
+	dpri->rx_cookie_string = init->rx_cookie_string;
+	dpri->tx_cookie_string = init->tx_cookie_string;
+	dpri->local_session_string = init->local_session_string;
+	dpri->remote_session_string = init->remote_session_string;
+
+	/*
+	 * The mode is the only value pre-parsed here.  A missing or
+	 * malformed string must not leave new_mode undefined, so it is
+	 * forced to 0 whenever sscanf() does not convert exactly one value.
+	 */
+
+	dpri->new_mode = 0;
+	if ((init->mode_string == NULL) ||
+	    (sscanf(init->mode_string, "%x", &dpri->new_mode) != 1)) {
+		dpri->new_mode = 0;
+		printk("warning: failed to parse l2tpv3 mode %s\n",
+			init->mode_string ? init->mode_string : "(null)");
+	}
+
+	printk("l2tpv3 mode %x\n", dpri->new_mode);
+	dpri->fd = -1;
+	dpri->dev = dev;
+	printk("l2tpv3 backend - %s:%s<->%s:%s, rxcookie: %s, txcookie:%s, local_session: %s, peer_session: %s\n",
+		dpri->local_addr_string,
+		dpri->local_service,
+		dpri->remote_addr_string,
+		dpri->remote_service,
+		dpri->rx_cookie_string,
+		dpri->tx_cookie_string,
+		dpri->local_session_string,
+		dpri->remote_session_string
+	);
+	dpri->uml_l2tpv3_flags = 0; /* we have everything turned off initially */
+	SET_ETHTOOL_OPS(dev, &uml_l2tpv3_ethtool_ops);
+}
+
+/*
+ * Check the session id and, when cookies are enabled, the cookie of a
+ * received L2TPv3 header against the configured values.
+ *
+ * buffer - start of the received header; in raw IPv4 mode (neither the
+ *          IPv6 nor the UDP mode bit set) it still begins with the IP
+ *          header, which is skipped here
+ * dpri   - transport private data holding the offsets and expected values
+ *
+ * Returns 1 when the header is acceptable, 0 otherwise.
+ */
+static int uml_l2tpv3_verify_header(uint8_t * buffer, struct uml_l2tpv3_data *dpri )
+{
+	const int is_v6 = (dpri->new_mode & NEW_MODE_IP_VERSION) != 0;
+	const int is_udp = (dpri->new_mode & NEW_MODE_UDP) != 0;
+	uint8_t *l2tp = buffer;
+	int cookie_ok;
+
+	if (!is_v6 && !is_udp)
+		l2tp += sizeof(struct iphdr); /* fix for ipv4 raw */
+
+	if (*(uint32_t *)(l2tp + dpri->session_offset) != dpri->remote_session) {
+		printk("Unknown Sesion id\n");
+		return 0;
+	}
+
+	/* no cookie configured - the session id match is sufficient */
+	if (!(dpri->new_mode & NEW_MODE_COOKIE))
+		return 1;
+
+	if (dpri->new_mode & NEW_MODE_COOKIE_SIZE) {
+		/* 64 bit cookie */
+		cookie_ok = (*(uint64_t *)(l2tp + dpri->cookie_offset) ==
+				dpri->rx_cookie);
+	} else {
+		/* 32 bit cookie, kept in the first four bytes of rx_cookie */
+		cookie_ok = (*(uint32_t *)(l2tp + dpri->cookie_offset) ==
+				* (uint32_t *) &dpri->rx_cookie);
+	}
+
+	if (!cookie_ok) {
+		printk("unknown cookie id\n");
+		return 0; /* we need to return 0, otherwise barfus */
+	}
+	return 1;
+}
+
+/*
+ * Vector receive path: hand out, one at a time, the packets obtained by
+ * a multi-message receive.
+ *
+ * When every previously received buffer has been consumed the used skbs
+ * are replaced, the message vector is rebuilt and a new batch is
+ * requested with net_recvmmsg().  The first packet in the batch that is
+ * longer than the header and passes uml_l2tpv3_verify_header() is
+ * trimmed to its payload length and returned.  Packets failing
+ * verification are destroyed.
+ *
+ * In listening mode (no remote address yet) the source address of the
+ * first accepted packet is adopted as the remote address.
+ *
+ * Returns the next valid skb, or NULL when there is nothing to deliver
+ * (empty batch, receive error, or only invalid packets left).
+ */
+static struct sk_buff * uml_l2tpv3_multiread (struct uml_net_private * lp) {
+	struct uml_l2tpv3_data *dpri = (struct uml_l2tpv3_data *) &lp->user;
+	void ** skb_recv_vector = dpri->skb_recv_vector;
+	struct mmsghdr * mmsg_recv_vector = (struct mmsghdr *) dpri->mmsg_recv_vector;
+	/*
+	 * Must start out as NULL: when the receive returns zero packets
+	 * the loop below never runs.
+	 */
+	struct sk_buff * result = NULL;
+	struct iovec * iov;
+	int ret;
+
+	/* Are we done processing the enqueued buffers */
+
+	if (dpri->recv_index >= dpri->recv_enqueued) {
+		/* Do we need to refresh the buffer list */
+		if (dpri->recv_enqueued) {
+			/* Replace dpri->recv_enqueued skbuffs */
+			rebuild_skbuf_vector(skb_recv_vector, dpri->recv_enqueued, lp->dev);
+			/* Rebuild message vector */
+			add_skbuffs(dpri->mmsg_recv_vector, skb_recv_vector, dpri->recv_enqueued, lp->max_packet, 1);
+		}
+		ret = net_recvmmsg(
+			dpri->fd, dpri->mmsg_recv_vector, dpri->vector_len, 0,NULL);
+		if (ret >= 0) {
+			dpri->recv_enqueued = ret;
+		} else {
+			printk("Error in multi-packet receive %d\n", ret);
+			return NULL;
+		}
+		dpri->recv_index = 0;
+	}
+
+	/* walk the not yet consumed part of the batch */
+
+	skb_recv_vector += dpri->recv_index;
+	mmsg_recv_vector += dpri->recv_index;
+	while (dpri->recv_index < dpri->recv_enqueued) {
+		dpri->recv_index ++;
+		iov = mmsg_recv_vector->msg_hdr.msg_iov;
+		if (
+			(iov) &&
+			(mmsg_recv_vector->msg_len > dpri->header_size) &&
+			(uml_l2tpv3_verify_header(iov->iov_base, dpri))
+		) {
+			if ((!dpri->remote_addr) && (mmsg_recv_vector->msg_hdr.msg_name)) {
+				/* take ownership of the learned peer address */
+				dpri->remote_addr = mmsg_recv_vector->msg_hdr.msg_name;
+				dpri->remote_addr_size = mmsg_recv_vector->msg_hdr.msg_namelen;
+				mmsg_recv_vector->msg_hdr.msg_name = NULL;
+				mmsg_recv_vector->msg_hdr.msg_namelen = 0;
+			}
+			result = (struct sk_buff *)(* skb_recv_vector);
+			if (result) {
+				skb_trim(result, mmsg_recv_vector->msg_len - dpri->header_size);
+				result->protocol = (*lp->protocol)(result);
+				return result;
+			} else {
+				printk("encountered failed atomic allocation, skipping to next\n");
+			}
+		} else {
+			uml_net_destroy_skb(* skb_recv_vector ) ; /* otherwise we leak it */
+		}
+		skb_recv_vector ++;
+		mmsg_recv_vector ++;
+	}
+	/* nothing deliverable in what is left of this batch */
+	return NULL;
+}
+
+/*
+ * Single packet receive path.
+ *
+ * fd  - socket to read from
+ * skb - destination for the payload
+ * lp  - generic UML network private data, carrying the transport data
+ *
+ * The header is received into network_buffer and the payload into the
+ * skb.  In raw IPv4 mode the header also contains the IP header, so the
+ * header length is enlarged accordingly.
+ *
+ * Returns the payload length for a valid packet, 0 for a packet that is
+ * too short or fails header verification, or the value returned by
+ * uml_l2tpv3_user_recvmsg() when that is <= 0.
+ */
+static int uml_l2tpv3_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	struct uml_l2tpv3_data *dpri = (struct uml_l2tpv3_data *) &lp->user;
+	uint8_t *buffer = dpri->network_buffer;
+	int offset = dpri->offset;
+	int result;
+
+	if (!(dpri->new_mode & NEW_MODE_UDP) && !(dpri->new_mode & NEW_MODE_IP_VERSION))
+	{
+		/* IPv4 RAW mode: Account for the IP header that will be received */
+		offset += sizeof(struct iphdr);
+	}
+
+	result = uml_l2tpv3_user_recvmsg(
+		fd,
+		buffer, offset,
+		skb->data, skb->dev->mtu + ETH_HEADER_OTHER,
+		dpri
+	);
+	if (result <= 0) {
+		return result;
+	}
+
+	/*
+	 * The buffer is passed to the verifier from its very start:
+	 * uml_l2tpv3_verify_header() skips the IP header itself in raw
+	 * IPv4 mode.  Skipping it here as well would make the verifier
+	 * look one IP header too far, beyond the end of network_buffer.
+	 */
+	if ((result > offset) && (uml_l2tpv3_verify_header(buffer, dpri))) {
+		if ((dpri->uml_l2tpv3_flags & UML_L2TPV3_FLAG_RX_CHECKSUMS) != 0) {
+			skb->ip_summed = CHECKSUM_UNNECESSARY;
+		}
+		return result - offset;
+	} else {
+		return 0;
+	}
+}
+
+/*
+ * Write an outgoing L2TPv3 header into buffer.
+ *
+ * Depending on the mode bits this stores the UDP mode leading word, the
+ * local session id, a 32 or 64 bit transmit cookie and the (incremented)
+ * packet counter at their precomputed offsets.
+ */
+static void uml_l2tpv3_form_header(uint8_t * buffer, struct uml_l2tpv3_data *pri) {
+	const uint32_t mode = pri->new_mode;
+	uint8_t *cookie = buffer + pri->cookie_offset;
+
+	if (mode & NEW_MODE_UDP)
+		*(uint32_t *) buffer = htonl(0x30000);
+
+	*(uint32_t *) (buffer + pri->session_offset) = pri->local_session;
+
+	if (mode & NEW_MODE_COOKIE) {
+		if (mode & NEW_MODE_COOKIE_SIZE)
+			*(uint64_t *) cookie = pri->tx_cookie;
+		else	/* 32 bit cookie lives in the first four bytes of tx_cookie */
+			*(uint32_t *) cookie = * ((uint32_t *) &pri->tx_cookie);
+	}
+
+	if (!(mode & NEW_MODE_NO_COUNTER))
+		*(uint32_t *) (buffer + pri->counter_offset) = htonl(++pri->counter);
+}
+
+/*
+ * Allocate and initialise the transmit queue control structure.
+ *
+ * dev_id    - the net_device of the interface
+ * max_depth - maximum queue depth stored in the control structure
+ *
+ * On allocation failure send_queue_info is left NULL; the caller is
+ * expected to check for that.
+ */
+void l2tpv3_complete_init(void * dev_id, int max_depth) {
+	struct net_device *netdev = dev_id;
+	struct uml_net_private *lp = netdev_priv(netdev);
+	struct uml_l2tpv3_data *pri = (struct uml_l2tpv3_data *) &lp->user;
+	struct mmsg_queue_info *q;
+
+	q = kmalloc(sizeof(struct mmsg_queue_info), GFP_KERNEL);
+	if (q == NULL) {
+		pri->send_queue_info = NULL;
+		return;
+	}
+
+	q->fd = pri->fd;
+	q->mmsg_send_vector = pri->mmsg_send_vector;
+	q->skb_send_vector = pri->skb_send_vector;
+	q->head = 0;
+	q->tail = 0;
+	q->queue_depth = 0;
+	q->max_depth = max_depth;
+	spin_lock_init(&q->head_lock);
+	spin_lock_init(&q->tail_lock);
+
+	/* publish only once fully initialised */
+	pri->send_queue_info = q;
+}
+
+
+/*
+ * Flush the transmit queue and free the queue control structure.
+ *
+ * This is reached from uml_l2tpv3_remove(), which the userspace init
+ * also calls on its error paths before l2tpv3_complete_init() has run,
+ * so a missing queue must be tolerated.
+ */
+void l2tpv3_kern_destroy(struct uml_l2tpv3_data *pri) {
+
+	int ret = -1;
+	struct mmsg_queue_info * queue_info = pri->send_queue_info;
+
+	if (queue_info == NULL) {
+		/* queue was never created (or is already gone) */
+		return;
+	}
+	/*
+	 * flush queue
+	 * NOTE(review): this loops until the flush helper returns 0;
+	 * confirm it cannot keep returning a non-zero error forever.
+	 */
+	do {
+		ret = uml_net_flush_mmsg_queue(queue_info, 1);
+	} while (ret != 0);
+	pri->send_queue_info = NULL;
+	kfree(queue_info);
+}
+
+
+/*
+ * Header callback with the signature expected by uml_net_enqueue():
+ * forwards to uml_l2tpv3_form_header() using the transport data of lp.
+ * The skb argument is not used.
+ */
+static void unified_form_header (void * header, struct sk_buff * skb, struct uml_net_private * lp) {
+	uml_l2tpv3_form_header(header, (struct uml_l2tpv3_data *) &lp->user);
+}
+
+/*
+ * Vector transmit path: queue the skb and flush the queue.
+ *
+ * Nothing is queued while the remote address is still unknown.  The
+ * return value is always skb->len, whether or not anything was sent.
+ */
+static int uml_l2tpv3_multiwrite(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	struct uml_l2tpv3_data *pri = (struct uml_l2tpv3_data *) &lp->user;
+	struct mmsg_queue_info *queue;
+	int queue_depth;
+
+	if (!pri->remote_addr)
+		return skb->len; /* not particularly correct */
+
+	queue = (struct mmsg_queue_info *) pri->send_queue_info;
+	queue_depth = uml_net_enqueue(queue, skb, lp, unified_form_header,
+				pri->remote_addr, pri->remote_addr_size);
+	uml_net_flush_mmsg_queue(queue, queue_depth);
+
+	return skb->len; /* not particularly correct */
+}
+
+/*
+ * Single packet transmit path.
+ *
+ * Forms the L2TPv3 header in network_buffer and sends header plus skb
+ * payload with uml_l2tpv3_user_sendmsg().
+ *
+ * Returns the number of payload bytes sent when more than the header
+ * went out, otherwise the unmodified result of the send (including
+ * negative error values).
+ */
+static int uml_l2tpv3_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	struct uml_l2tpv3_data *pri = (struct uml_l2tpv3_data *) &lp->user;
+	uint8_t *buffer = pri->network_buffer;
+	/*
+	 * Signed copy of the header length: comparing the int result
+	 * with the unsigned field directly would convert a negative
+	 * error code to a huge unsigned value, take the "success"
+	 * branch and return error - offset.
+	 */
+	int offset = pri->offset;
+	int result;
+
+	uml_l2tpv3_form_header(buffer, pri);
+
+	result = uml_l2tpv3_user_sendmsg(
+		fd,
+		buffer, offset,
+		skb->data, skb->len,
+		pri
+	);
+
+	if (result > offset) {
+		return result - offset;
+	} else {
+		return result; /* not particularly correct */
+	}
+}
+
+/*
+ * Kernel side operations of the l2tpv3 transport.
+ * The write method is selected at build time: with
+ * CONFIG_UML_NET_VECTOR_TX the queued uml_l2tpv3_multiwrite() is used,
+ * otherwise the one-packet-at-a-time uml_l2tpv3_write().
+ */
+static const struct net_kern_info uml_l2tpv3_kern_info = {
+ .options = UML_NET_USE_SKB_READ,
+ .init = uml_l2tpv3_init,
+ .protocol = eth_protocol,
+ .read = uml_l2tpv3_read,
+ .skb_read = uml_l2tpv3_multiread,
+#ifdef CONFIG_UML_NET_VECTOR_TX
+ .write = uml_l2tpv3_multiwrite,
+#else
+ .write = uml_l2tpv3_write,
+#endif
+};
+
+
+
+/*
+ * Transport setup callback: fill *data (struct uml_l2tpv3_init) with
+ * defaults and then let split_if_spec() override them from the
+ * interface specification string.
+ *
+ * The remote address has no default and stays NULL unless given.
+ * A non-NULL remainder from split_if_spec() only produces a warning;
+ * the function always returns 1.
+ */
+static int uml_l2tpv3_setup(char *str, char **mac_out, void *data)
+{
+	static const struct uml_l2tpv3_init defaults = {
+		.local_addr_string	= "::1",
+		.local_service		= "1701",
+		.remote_service		= "1702",
+		.rx_cookie_string	= "0xdeadbeefdeadbeef",
+		.tx_cookie_string	= "0xdeadbeefdeadbeef",
+		.local_session_string	= "0xFFFFFFFF",
+		.remote_session_string	= "0xFFFFFFFF",
+		.mode_string		= "0",
+	};
+	struct uml_l2tpv3_init *init = data;
+	char *leftover;
+
+	*init = defaults;
+
+	leftover = split_if_spec(str,
+			mac_out,
+			&init->local_addr_string,
+			&init->local_service,
+			&init->remote_addr_string,
+			&init->remote_service,
+			&init->rx_cookie_string,
+			&init->tx_cookie_string,
+			&init->local_session_string,
+			&init->remote_session_string,
+			&init->mode_string,
+			NULL);
+	if (leftover != NULL)
+		printk(KERN_WARNING " Strange interface spec \n");
+	return 1;
+}
+
+/*
+ * Transport descriptor registered with the UML network core under the
+ * name "l2tpv3".
+ */
+static struct transport uml_l2tpv3_transport = {
+	.list = LIST_HEAD_INIT(uml_l2tpv3_transport.list),
+	.name = "l2tpv3",
+	.setup = uml_l2tpv3_setup,
+	.user = &uml_l2tpv3_user_info,
+	.kern = &uml_l2tpv3_kern_info,
+	.private_size = sizeof(struct uml_l2tpv3_data),
+	.setup_size = sizeof(struct uml_l2tpv3_init),
+};
+
+/* Initcall: register the l2tpv3 transport with the UML network core. */
+static int register_uml_l2tpv3(void)
+{
+	register_transport(&uml_l2tpv3_transport);
+	return 0;
+}
+
+late_initcall(register_uml_l2tpv3);
diff --git a/arch/um/drivers/uml_l2tpv3_user.c b/arch/um/drivers/uml_l2tpv3_user.c
new file mode 100644
index 0000000..37d6f9d
--- /dev/null
+++ b/arch/um/drivers/uml_l2tpv3_user.c
@@ -0,0 +1,409 @@
+/*
+ * Copyright (C) 2012-2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <netinet/ether.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <arpa/inet.h>
+
+#include <net_user.h>
+#include <os.h>
+#include <um_malloc.h>
+#include <user.h>
+#include "uml_l2tpv3.h"
+
+#define VECTOR_SIZE 32
+
+/*
+ * Parse a hexadecimal 32 bit value (session id or short cookie) from
+ * src and store it at dst converted with htonl().
+ *
+ * Only the first four bytes at dst are written.
+ * Returns 0 on success, -1 when src is NULL or cannot be parsed.
+ */
+int l2tpv3_parse_cookie32(char *src , void * dst)
+{
+	uint32_t *out = dst;
+	int parsed_ok;
+
+	parsed_ok = (src != NULL) &&
+		(sscanf(src, "%x", (unsigned int *) dst) == 1);
+	if (!parsed_ok) {
+		printk(UM_KERN_ERR "cannot parse cookie!!!: %s\n", src);
+		return -1;
+	}
+
+	*out = htonl(*out);
+	return 0;
+}
+
+/*
+ * Parse a hexadecimal 64 bit cookie from src and store it at dst after
+ * converting both 32 bit halves with htonl().
+ *
+ * The value is scanned into a struct temphtonl (declared elsewhere;
+ * presumably two 32 bit halves named low and high - confirm against
+ * its definition).  The halves are exchanged when the byte order
+ * probe below finds the first byte of an int to be its low byte.
+ *
+ * Returns 0 on success, -1 when src is NULL or cannot be parsed.
+ */
+int l2tpv3_parse_cookie64(char *src , void * dst)
+{
+	struct temphtonl temph;
+	uint32_t temp;
+	const int num = 42;
+	if (
+		(src == NULL) ||
+		/* "%llx" stores an unsigned long long, so the pointer must say so */
+		(sscanf(src, "%llx", (unsigned long long *) &temph) != 1)
+	) {
+		printk(UM_KERN_ERR "cannot parse cookie!!!: %s\n", src);
+		return -1;
+	}
+	if(*(char *)&num == 42) {
+		/* why oh why there is no htonll */
+		temp = htonl(temph.high);
+		temph.high = htonl(temph.low);
+		temph.low = temp;
+	} else {
+		temph.low = htonl(temph.low);
+		temph.high = htonl(temph.high);
+	}
+	memcpy(dst, &temph, sizeof (uint64_t));
+	return 0;
+}
+
+/*
+ * Release everything owned by an l2tpv3 transport instance: the kernel
+ * side queue, the socket, the send/receive vectors, the header buffer
+ * and the remote address.
+ *
+ * Every pointer is reset after being freed, so calling this more than
+ * once is harmless.  The init function relies on that for its error
+ * paths.
+ */
+static void uml_l2tpv3_remove(void *data)
+{
+	struct uml_l2tpv3_data *pri = data;
+
+	l2tpv3_kern_destroy(pri);
+	if (pri->fd > 0) {
+		close(pri->fd);
+	}
+	pri->fd = -1;
+	if (pri->skb_send_vector) {
+		/* this one should be empty - we flushed it so we just free it */
+		kfree(pri->skb_send_vector);
+		pri->skb_send_vector = NULL;
+	}
+	if (pri->mmsg_send_vector) {
+		destroy_mmsg_vector(pri->mmsg_send_vector, VECTOR_SIZE, 1);
+		pri->mmsg_send_vector = NULL;
+	}
+	if (pri->network_buffer) {
+		kfree(pri->network_buffer);
+		pri->network_buffer = NULL;
+	}
+	if (pri->skb_recv_vector) {
+		destroy_skb_vector(pri->skb_recv_vector, VECTOR_SIZE);
+		pri->skb_recv_vector = NULL;
+	}
+	if (pri->mmsg_recv_vector) {
+		destroy_mmsg_vector(pri->mmsg_recv_vector, VECTOR_SIZE, 1);
+		pri->mmsg_recv_vector = NULL;
+	}
+	if (pri->remote_addr) {
+		/*
+		 * allocated at init time, or learned from the first
+		 * received packet - either way it is ours to free
+		 */
+		kfree(pri->remote_addr);
+		pri->remote_addr = NULL;
+		pri->remote_addr_size = 0;
+	}
+}
+
+/*
+ * Userspace side initialisation of an l2tpv3 interface.
+ *
+ * data - struct uml_l2tpv3_data, strings and mode already filled in
+ * dev  - opaque device handle, passed on to the skb helpers
+ *
+ * Steps:
+ *  - compute the header field offsets from the mode bits
+ *  - parse session ids and cookies
+ *  - create the UDP or raw (protocol 0x73) socket and bind it to the
+ *    local endpoint
+ *  - resolve the remote endpoint when one was configured, otherwise
+ *    prepare to learn it from the first received packet
+ *  - allocate the receive/send vectors and the single packet buffer
+ *  - let the kernel side create the transmit queue
+ *
+ * Returns 0 on success and a negative value on failure.  Every failure
+ * path releases what has been set up so far via uml_l2tpv3_remove().
+ */
+static int uml_l2tpv3_user_init(void *data, void *dev)
+{
+	struct uml_l2tpv3_data *pri = data;
+	int fd;
+	int sock_family, sock_type, sock_proto;
+	int ret;
+	struct addrinfo hints;
+	struct addrinfo *result;
+	char service[NI_MAXSERV];
+	struct mmsghdr * mmsghdr;
+
+	pri->offset = 4;
+	pri->session_offset = 0;
+	pri->cookie_offset = 4;
+	pri->counter_offset = 4;
+
+	pri->fd = -1;
+
+	/* basic variable parsing */
+
+	pri->local_session = 0;
+	if (l2tpv3_parse_cookie32(pri->local_session_string,&pri->local_session) !=0) {
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	pri->remote_session = 0;
+	if (l2tpv3_parse_cookie32(pri->remote_session_string,&pri->remote_session) !=0) {
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	if (pri->new_mode & NEW_MODE_COOKIE) {
+		if (pri->new_mode & NEW_MODE_COOKIE_SIZE) {
+			/* 64 bit cookie */
+			pri->offset += 8;
+			pri->counter_offset += 8;
+			if (l2tpv3_parse_cookie64(pri->tx_cookie_string,&pri->tx_cookie) !=0) {
+				uml_l2tpv3_remove(pri);
+				return -1;
+			}
+			if (l2tpv3_parse_cookie64(pri->rx_cookie_string,&pri->rx_cookie) !=0) {
+				uml_l2tpv3_remove(pri);
+				return -1;
+			}
+		} else {
+			/* 32 bit cookie */
+			pri->offset += 4;
+			pri->counter_offset +=4;
+			pri->tx_cookie = 0;
+			if (l2tpv3_parse_cookie32(pri->tx_cookie_string,&pri->tx_cookie) !=0) {
+				uml_l2tpv3_remove(pri);
+				return -1;
+			}
+			pri->rx_cookie = 0;
+			if (l2tpv3_parse_cookie32(pri->rx_cookie_string,&pri->rx_cookie) !=0) {
+				uml_l2tpv3_remove(pri);
+				return -1;
+			}
+		}
+	}
+	if (pri->remote_addr_string) {
+		/* we now allocate it only if it we are not "listening" */
+		pri->remote_addr = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_KERNEL);
+		if (!pri->remote_addr) {
+			/*
+			 * do not silently degrade to listening mode when a
+			 * remote endpoint was explicitly configured
+			 */
+			printk(UM_KERN_ERR "uml_l2tpv3_user_init: could not allocate remote address\n");
+			uml_l2tpv3_remove(pri);
+			return -1;
+		}
+	} else {
+		pri->remote_addr = NULL;
+	}
+
+	if (pri->new_mode & NEW_MODE_IP_VERSION) {
+		/* IPv6 */
+		sock_family = AF_INET6;
+	} else {
+		/* IPv4 */
+		sock_family = AF_INET;
+	}
+	if (pri->new_mode & NEW_MODE_UDP) {
+		printk(UM_KERN_ERR "uml_l2tpv3_user_init: preparing udp socket for mode %x\n ", pri->new_mode);
+		sock_type = SOCK_DGRAM;
+		sock_proto = 0;
+		/* space for header. In UDP mode, the
+		 * egress packet also includes the
+		 * 'Ver' and 'Reserved' fields.
+		 */
+
+		pri->offset += 4;
+		pri->counter_offset += 4;
+		pri->session_offset += 4;
+		pri->cookie_offset += 4;
+	} else {
+		printk(UM_KERN_ERR "uml_l2tpv3_user_init: preparing raw socket for mode %x\n ", pri->new_mode);
+		sock_type = SOCK_RAW;
+		sock_proto = 0x73;
+	}
+
+	if (!(pri->new_mode & NEW_MODE_NO_COUNTER)) {
+		pri->offset += 4;
+	}
+
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_flags = AI_PASSIVE;
+	hints.ai_family = sock_family;
+	hints.ai_socktype = sock_type;
+	hints.ai_protocol = sock_proto;
+
+	if ((fd = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol)) == -1) {
+		fd = -errno;
+		printk(UM_KERN_ERR "uml_l2tpv3_user_init: socket creation failed, "
+			"errno = %d\n", -fd);
+		uml_l2tpv3_remove(pri);
+		return fd;
+	} else {
+		pri->fd = fd;
+	}
+
+	/* Get the details of the local endpoint, and bind it. */
+	memset(service, '\0', NI_MAXSERV);
+	if (pri->new_mode & NEW_MODE_UDP) {
+		strncpy(service, pri->local_service, NI_MAXSERV - 1);
+		service[NI_MAXSERV - 1] = '\0';
+	}
+
+	ret = getaddrinfo(pri->local_addr_string, service, &hints, &result);
+
+	if ((ret != 0) || (result == NULL)) {
+		printk(UM_KERN_ERR "uml_l2tpv3_user_init: Unable to parse the local endpoint: %d\n", ret);
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	if (bind(fd, (struct sockaddr *)result->ai_addr, result->ai_addrlen)) {
+		printk("uml_l2tpv3_user_init: could not bind socket: %d\n", errno);
+		freeaddrinfo(result);
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	printk("uml_l2tpv3_user_init: socket bound\n");
+	freeaddrinfo(result);
+
+	if (pri->remote_addr != NULL) {
+		/* Get the details of the remote endpoint. */
+		memset(service, '\0', NI_MAXSERV);
+		if (pri->new_mode & NEW_MODE_UDP) {
+			strncpy(service, pri->remote_service, NI_MAXSERV - 1);
+			service[NI_MAXSERV - 1] = '\0';
+		}
+		memset(&hints, 0, sizeof(hints));
+		hints.ai_flags = AI_PASSIVE;
+		hints.ai_family = sock_family;
+		hints.ai_socktype = sock_type;
+		hints.ai_protocol = sock_proto;
+		ret = getaddrinfo(pri->remote_addr_string, service, &hints, &result);
+		if ((ret != 0) || (result == NULL)) {
+			printk(UM_KERN_ERR "uml_l2tpv3_user_init: Unable to parse the remote endpoint: %d\n", ret);
+			uml_l2tpv3_remove(pri);
+			return -1;
+		}
+		memset(pri->remote_addr, '\0' , sizeof(struct sockaddr_storage));
+		memcpy(pri->remote_addr, result->ai_addr, result->ai_addrlen);
+		pri->remote_addr_size = result->ai_addrlen;
+		freeaddrinfo(result);
+	}
+
+	/* vector IO init */
+
+	pri->vector_len = VECTOR_SIZE;
+	pri->recv_index = 0;
+	pri->recv_enqueued = 0;
+	pri->header_size = pri->offset /* fix for ipv4 raw */;
+
+	if ((!(pri->new_mode & NEW_MODE_IP_VERSION)) && (!(pri->new_mode & NEW_MODE_UDP))){
+		pri->header_size += sizeof(struct iphdr) /* fix for ipv4 raw */;
+	}
+
+	pri->skb_recv_vector = build_skbuf_vector(VECTOR_SIZE, dev);
+	pri->mmsg_recv_vector = build_mmsg_vector(VECTOR_SIZE, 2);
+	if ((!pri->skb_recv_vector) || (!pri->mmsg_recv_vector)) {
+		/* same NULL-on-failure check the send vectors get below */
+		printk("uml_l2tpv3_user_init: could not allocate receive vectors\n");
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	add_header_buffers(pri->mmsg_recv_vector, VECTOR_SIZE, pri->header_size);
+	add_skbuffs(
+		pri->mmsg_recv_vector,
+		pri->skb_recv_vector,
+		VECTOR_SIZE, ETH_MAX_PACKET + ETH_HEADER_OTHER,
+		1
+	);
+	pri->skb_send_vector = uml_kmalloc(VECTOR_SIZE * sizeof(void *), UM_GFP_KERNEL);
+	if (pri->skb_send_vector) {
+		memset(pri->skb_send_vector, 0, sizeof(void *) * VECTOR_SIZE);
+	} else {
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	pri->mmsg_send_vector = build_mmsg_vector(VECTOR_SIZE, 2);
+	if (! pri->mmsg_send_vector) {
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+	/* note - we do not need to do the ipv4 header size correction here */
+	add_header_buffers(pri->mmsg_send_vector, VECTOR_SIZE, pri->offset);
+	/* used only in single packet modes, should be obsoleted one day */
+	pri->network_buffer = uml_kmalloc(pri->header_size, UM_GFP_KERNEL);
+	if (!pri->network_buffer) {
+		printk("uml_l2tpv3_user_init: could not allocate buffer\n");
+		/* release socket and vectors like every other failure path */
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+
+	if (!pri->remote_addr) {
+		/*
+		 * listening mode: give the first message header room for
+		 * the peer address so it can be learned on receive
+		 */
+		mmsghdr = (struct mmsghdr *) pri->mmsg_recv_vector;
+		mmsghdr->msg_hdr.msg_name = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_KERNEL);
+		if (mmsghdr->msg_hdr.msg_name) {
+			mmsghdr->msg_hdr.msg_namelen = sizeof(struct sockaddr_storage);
+		} else {
+			printk("uml_l2tpv3_user_init: Failed to allocate remote address name\n");
+		}
+	}
+	pri->dev = dev;
+
+	/* init kernel side structures that are opaque to userspace -
+	 * locks, timers, state machine, etc
+	 */
+
+	l2tpv3_complete_init(dev, VECTOR_SIZE); /* we really need error checking here */
+
+	if (!pri->send_queue_info) {
+		printk("uml_l2tpv3:queue control allocation failed\n");
+		uml_l2tpv3_remove(pri);
+		return -1;
+	}
+
+	if (pri->fd < 0) {
+		return pri->fd;
+	}
+	return 0;
+}
+
+/*
+ * Transport open callback: the socket is already created by the init
+ * function, so this only hands back its descriptor.
+ */
+static int uml_l2tpv3_open(void *data)
+{
+	return ((struct uml_l2tpv3_data *) data)->fd;
+}
+
+
+/*
+ * Send one packet as a two element scatter/gather message (header
+ * followed by payload) to the remote address.
+ *
+ * Returns the result of net_sendmessage(), or -1 when no remote
+ * address is known yet.
+ */
+int uml_l2tpv3_user_sendmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_l2tpv3_data *pri)
+{
+	struct iovec parts[2];
+	struct msghdr msg;
+
+	/* nobody to send to yet */
+	if (pri->remote_addr == NULL)
+		return -1;
+
+	parts[0].iov_base = header;
+	parts[0].iov_len = headerlen;
+	parts[1].iov_base = data;
+	parts[1].iov_len = datalen;
+
+	msg.msg_name = pri->remote_addr;
+	msg.msg_namelen = pri->remote_addr_size;
+	msg.msg_iov = parts;
+	msg.msg_iovlen = 2;
+	msg.msg_control = NULL;
+	msg.msg_controllen = 0;
+	msg.msg_flags = MSG_DONTWAIT;
+
+	return net_sendmessage(fd, &msg, MSG_DONTWAIT);
+}
+/*
+ * Receive one packet as a two element scatter/gather message: header
+ * into the first buffer, payload into the second.
+ *
+ * While no remote address is known (listening mode) the sender address
+ * of the packet is requested as well.  It becomes the remote address
+ * only if the receive succeeds and an address was actually returned;
+ * otherwise the temporary buffer is released again.
+ *
+ * Returns the result of net_recvmessage().
+ */
+int uml_l2tpv3_user_recvmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_l2tpv3_data *pri)
+{
+	struct msghdr message;
+	struct iovec vec[2];
+	void *learned = NULL;
+	int ret;
+
+	vec[0].iov_base = header;
+	vec[0].iov_len = headerlen;
+	vec[1].iov_base = data;
+	vec[1].iov_len = datalen;
+
+	message.msg_name = NULL;
+	message.msg_namelen = 0;
+
+	if (!pri->remote_addr) {
+		learned = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_ATOMIC);
+		if (learned) {
+			memset(learned, 0, sizeof(struct sockaddr_storage));
+			message.msg_name = learned;
+			/*
+			 * msg_namelen is the capacity of the buffer on input.
+			 * remote_addr_size is not usable for that: nothing
+			 * has set it yet in listening mode.
+			 */
+			message.msg_namelen = sizeof(struct sockaddr_storage);
+		}
+	}
+
+	message.msg_iov = (struct iovec *) &vec;
+	message.msg_iovlen = 2;
+	message.msg_control = NULL;
+	message.msg_controllen = 0;
+	message.msg_flags = MSG_DONTWAIT;
+
+	ret = net_recvmessage(fd, &message, MSG_DONTWAIT);
+
+	if (learned) {
+		/*
+		 * NOTE(review): assumes net_recvmessage() passes &message
+		 * straight to recvmsg() so msg_namelen holds the returned
+		 * address length - confirm against its implementation.
+		 */
+		if ((ret > 0) && (message.msg_namelen > 0)) {
+			pri->remote_addr = learned;
+			pri->remote_addr_size = message.msg_namelen;
+		} else {
+			kfree(learned);
+		}
+	}
+	return ret;
+}
+/*
+ * Userspace side operations of the l2tpv3 transport.
+ * No close or address callbacks are provided; max_packet leaves room
+ * for the L2TPv3 header on top of a full Ethernet frame.
+ */
+const struct net_user_info uml_l2tpv3_user_info = {
+ .init = uml_l2tpv3_user_init,
+ .open = uml_l2tpv3_open,
+ .close = NULL,
+ .remove = uml_l2tpv3_remove,
+ .add_address = NULL,
+ .delete_address = NULL,
+ .mtu = ETH_MAX_PACKET,
+ .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER + L2TPV3_HEADER,
+};
--
1.7.10.4
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread* [uml-devel] [PATCH v3 05/10] GRE transport for UML
2014-09-04 19:00 [uml-devel] UML Performance improvement patchset anton.ivanov
` (3 preceding siblings ...)
2014-09-04 19:00 ` [uml-devel] [PATCH v3 04/10] L2TPv3 Transport Driver for UML anton.ivanov
@ 2014-09-04 19:00 ` anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 06/10] RAW Ethernet " anton.ivanov
` (5 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: anton.ivanov @ 2014-09-04 19:00 UTC (permalink / raw)
To: user-mode-linux-devel
From: Anton Ivanov <antivano@cisco.com>
This transport allows a UML to connect to another UML local
or remote, the Linux host or any other network device running
the industry standard Ethernet over GRE protocol. The transport
supports all features of RFC 2784.
The transport supports a common set of features with the kernel
implementation. Checksum offload is supported on RX, TODO on TX.
Additionally, the transport supports the so called "soft"
termination where it can listen for an incoming connection
which does not require the remote endpoint to be specified
at configuration time.
Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
arch/um/Kconfig.net | 11 +
arch/um/drivers/Makefile | 2 +
arch/um/drivers/uml_gre.h | 87 ++++++++
arch/um/drivers/uml_gre_kern.c | 446 ++++++++++++++++++++++++++++++++++++++++
arch/um/drivers/uml_gre_user.c | 347 +++++++++++++++++++++++++++++++
5 files changed, 893 insertions(+)
create mode 100644 arch/um/drivers/uml_gre.h
create mode 100644 arch/um/drivers/uml_gre_kern.c
create mode 100644 arch/um/drivers/uml_gre_user.c
diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net
index d84a1ee..e372c06 100644
--- a/arch/um/Kconfig.net
+++ b/arch/um/Kconfig.net
@@ -103,6 +103,17 @@ config UML_NET_L2TPV3
the industry standard Ethernet over L2TPv3 protocol as described in
the applicable RFCs
+config UML_NET_GRE
+ bool "GRE transport"
+ depends on UML_NET
+ help
+ This User-Mode Linux network transport allows one or more running
+ UMLs on single or multiple hosts to communicate with each other,
+ the host as well as other remote or local network devices supporting
+ the industry standard Ethernet over GRE protocol as described in
+ the applicable RFCs. The driver supports Soft GRE (wait for connect)
+ as used in Cable systems, etc.
+
config UML_NET_DAEMON
bool "Daemon transport"
depends on UML_NET
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index e2dcd85..c5427e1 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -10,6 +10,7 @@ slip-objs := slip_kern.o slip_user.o
slirp-objs := slirp_kern.o slirp_user.o
daemon-objs := daemon_kern.o daemon_user.o
uml_l2tpv3-objs := uml_l2tpv3_kern.o uml_l2tpv3_user.o
+uml_gre-objs := uml_gre_kern.o uml_gre_user.o
umcast-objs := umcast_kern.o umcast_user.o
net-objs := net_kern.o net_user.o net_extra_user.o net_extra_kern.o
mconsole-objs := mconsole_kern.o mconsole_user.o
@@ -45,6 +46,7 @@ obj-$(CONFIG_UML_NET_SLIP) += slip.o slip_common.o
obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o
obj-$(CONFIG_UML_NET_DAEMON) += daemon.o
obj-$(CONFIG_UML_NET_L2TPV3) += uml_l2tpv3.o
+obj-$(CONFIG_UML_NET_GRE) += uml_gre.o
obj-$(CONFIG_UML_NET_VDE) += vde.o
obj-$(CONFIG_UML_NET_MCAST) += umcast.o
obj-$(CONFIG_UML_NET_PCAP) += pcap.o
diff --git a/arch/um/drivers/uml_gre.h b/arch/um/drivers/uml_gre.h
new file mode 100644
index 0000000..1889842
--- /dev/null
+++ b/arch/um/drivers/uml_gre.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __UML_GRE_H__
+#define __UML_GRE_H__
+
+#include "net_user.h"
+
+/* header bits */
+
+#define GRE_MODE_CHECKSUM 8 /* checksum */
+#define GRE_MODE_RESERVED 4 /* unused */
+#define GRE_MODE_KEY 2 /* KEY present */
+#define GRE_MODE_SEQUENCE 1 /* sequence number present - NOTE(review): comment used to say "no sequence"; confirm against the users of this bit */
+
+/* flags (internal use) */
+
+#define GRE_MODE_IP_VERSION 16 /* on for v6, off for v4 */
+
+
+/* legacy modes */
+
+
+#define MAX_GRE_HEADER 16
+
+
+/*
+ * Per interface private data of the GRE transport.
+ * The *_string members hold the configuration exactly as given; they are
+ * copied in by uml_gre_init().
+ */
+struct uml_gre_data {
+ void *remote_addr;
+ int remote_addr_size;
+ char *remote_addr_string;
+ char *local_addr_string;
+ char *rx_key_string;
+ char *tx_key_string;
+ uint32_t rx_key;
+ uint32_t tx_key;
+ uint8_t *network_buffer;
+ int fd; /* set to -1 by uml_gre_init() */
+ void *dev;
+
+ uint32_t sequence;
+
+ /* verbatim header bits + control bits */
+
+ uint32_t mode; /* GRE_MODE_* bits, parsed from the hexadecimal mode string */
+
+ /* Precomputed offsets */
+
+ uint32_t offset; /* main offset == header offset */
+ uint32_t protocol_offset;
+ uint32_t checksum_offset;
+ uint32_t key_offset;
+ uint32_t sequence_offset;
+
+ void ** skb_recv_vector;
+ void * mmsg_recv_vector;
+
+ void ** skb_send_vector;
+ void * mmsg_send_vector;
+ void * send_queue_info;
+
+ uint32_t vector_len;
+ uint32_t recv_index;
+ uint32_t recv_enqueued;
+ /* normally same as offset, add size of struct ipv4 header in ipv4 raw - API stupidities */
+ uint32_t header_size;
+
+};
+
+/*
+ * Two 16 bit words named header and arptype.
+ * NOTE(review): presumably the fixed leading part of a GRE header (flag
+ * bits, then protocol type) - confirm against the code using it.
+ */
+struct gre_minimal_header {
+ uint16_t header;
+ uint16_t arptype;
+};
+
+
+extern const struct net_user_info uml_gre_user_info;
+
+extern int uml_gre_user_sendmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_gre_data *pri);
+
+extern int uml_gre_user_recvmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_gre_data *pri);
+
+extern void gre_complete_init(void * dev_id, int max_depth);
+extern void gre_kern_destroy(struct uml_gre_data *pri);
+
+#endif
diff --git a/arch/um/drivers/uml_gre_kern.c b/arch/um/drivers/uml_gre_kern.c
new file mode 100644
index 0000000..ee5d732
--- /dev/null
+++ b/arch/um/drivers/uml_gre_kern.c
@@ -0,0 +1,446 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include "linux/init.h"
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/ip.h>
+#include "net_kern.h"
+#include "uml_gre.h"
+
+#define DRIVER_NAME "uml-gre"
+
+#define GRE_IRB htons(0x6558)
+#define ETHER_HEADER_SIZE 14
+
+struct uml_gre_init { /* option strings filled in by uml_gre_setup() */
+ char *local_addr_string;
+ char *remote_addr_string;
+ char *rx_key_string; /* parsed as hex */
+ char *tx_key_string; /* parsed as hex */
+ char *mode_string; /* parsed as hex, GRE_MODE_* bits */
+};
+
+/* ethtool get_drvinfo callback: report driver name and version with bounded copies. */
+static void uml_gre_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+	strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
+	strlcpy(info->version, "42", sizeof(info->version));
+}
+
+
+
+static const struct ethtool_ops uml_gre_ethtool_ops = /* installed on the device by uml_gre_init() */
+{
+ .get_drvinfo = uml_gre_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+};
+
+
+
+/* Kernel-side init: stash the option strings and parse the mode bits. */
+static void uml_gre_init(struct net_device *dev, void *data)
+{
+	struct uml_net_private *pri = netdev_priv(dev);
+	struct uml_gre_data *dpri = (struct uml_gre_data *) pri->user;
+	struct uml_gre_init *init = data;
+
+	/*
+	 * These are kept as is for future reference; the addresses and
+	 * keys are parsed later by the userspace side of the driver.
+	 */
+	dpri->local_addr_string = init->local_addr_string;
+	dpri->remote_addr_string = init->remote_addr_string;
+	dpri->rx_key_string = init->rx_key_string;
+	dpri->tx_key_string = init->tx_key_string;
+
+	/*
+	 * mode is a hex string. Default to 0 first so that a missing or
+	 * unparsable value cannot leave dpri->mode unset.
+	 */
+	dpri->mode = 0;
+	if (init->mode_string != NULL)
+		sscanf(init->mode_string, "%x", &dpri->mode);
+
+	dpri->fd = -1;
+	dpri->dev = dev;
+	printk("gre backend - %s<->%s, rx_key: %s tx_key: %s, mode %i\n",
+		dpri->local_addr_string,
+		dpri->remote_addr_string,
+		dpri->rx_key_string,
+		dpri->tx_key_string,
+		dpri->mode
+		);
+	/* hook up the ethtool callbacks (driver info, link state) */
+	SET_ETHTOOL_OPS(dev, &uml_gre_ethtool_ops);
+}
+
+static int uml_gre_verify_header(uint8_t *header_buffer,
+ struct sk_buff *skb,
+ struct uml_gre_data *dpri)
+{
+ struct gre_minimal_header * header;
+ uint16_t old_checksum;
+ uint32_t data_sum;
+ uint32_t and_ether_sum;
+
+ /* Returns 1 to accept the frame, 0 to drop it (rx_dropped is bumped). This is
+ never called with a NULL SKB, the SKB must be trimmed to correct size prior to calling */
+ /* on IPv4 header_buffer starts with the IP header; step over it to reach the GRE header */
+ if (!(dpri->mode & GRE_MODE_IP_VERSION)) {
+ header_buffer += sizeof(struct iphdr) /* fix for ipv4 raw */;
+ }
+
+ header = (struct gre_minimal_header *) header_buffer;
+
+ if (
+ (header->header == htons((dpri->mode & 0xF) << 12)) &&
+ (header->arptype == GRE_IRB)
+ ) {
+ /* header bits and type match, check key if present */
+ if (dpri->mode & GRE_MODE_KEY) {
+ if (*((uint32_t *)(header_buffer + dpri->key_offset)) != dpri->rx_key) {
+ /* key mismatch, drop frame */
+ skb->dev->stats.rx_dropped++;
+ return 0;
+ }
+ }
+ /*
+ We compute the checksum if there is GRE checksum
+ and supply it to the kernel as "checksum offload" in a
+ CHECKSUM_COMPLETE form so it can be used for any protocol
+ */
+
+ if (dpri->mode & GRE_MODE_CHECKSUM) {
+ old_checksum = * ((uint16_t *) (header_buffer + dpri->checksum_offset));
+ * ((uint32_t *) (header_buffer + dpri->checksum_offset)) = 0;
+ /* the 32-bit store above zeroes the checksum and the two bytes after it */
+ /* this will break with VLAN tags */
+
+ data_sum = csum_partial(skb->data + ETHER_HEADER_SIZE, skb->len - ETHER_HEADER_SIZE, 0);
+ and_ether_sum = csum_partial(skb->data, ETHER_HEADER_SIZE, data_sum);
+ /* sum payload, Ethernet header and GRE header, fold, and compare with the received value */
+ if (old_checksum != csum_fold(csum_partial(header_buffer, dpri->offset, and_ether_sum))) {
+ skb->dev->stats.rx_dropped++;
+ return 0;
+ } else {
+ skb->csum=data_sum;
+ skb->ip_summed = CHECKSUM_COMPLETE;
+ }
+ }
+ return 1;
+ } else {
+ skb->dev->stats.rx_dropped++;
+ }
+ return 0;
+}
+
+/*
+ * Vector receive: return the next valid frame of the current recvmmsg batch,
+ * fetching a new batch once the old one is used up; NULL if none is left.
+ */
+static struct sk_buff * uml_gre_multiread (struct uml_net_private * lp) {
+	struct uml_gre_data *dpri = (struct uml_gre_data *) &lp->user;
+	void ** skb_recv_vector = dpri->skb_recv_vector;
+	struct mmsghdr * mmsg_recv_vector = (struct mmsghdr *) dpri->mmsg_recv_vector;
+	struct sk_buff * result;
+	struct iovec * iov;
+	unsigned int full_len;
+	int ret;
+
+	/* Are we done processing the enqueued buffers? If so, fetch more. */
+	if (dpri->recv_index >= dpri->recv_enqueued) {
+		ret = net_recvmmsg(
+			dpri->fd, mmsg_recv_vector, dpri->vector_len, 0,NULL);
+		if (ret >= 0) {
+			dpri->recv_enqueued = ret;
+		} else {
+			printk("Error in multi-packet receive %d\n", ret);
+			return NULL;
+		}
+		dpri->recv_index = 0;
+	}
+
+	skb_recv_vector += dpri->recv_index;
+	mmsg_recv_vector += dpri->recv_index;
+	while (dpri->recv_index < dpri->recv_enqueued) {
+		dpri->recv_index ++;
+		iov = mmsg_recv_vector->msg_hdr.msg_iov;
+		/*
+		 * Fetch the slot's skb first: the header check dereferences it.
+		 */
+		result = (struct sk_buff *)(* skb_recv_vector);
+		if (!result) {
+			printk("encountered failed atomic allocation @%i, skipping to next\n", dpri->recv_index);
+		} else if ((iov) && (mmsg_recv_vector->msg_len > dpri->header_size)) {
+			/* the check expects the skb trimmed to the payload size */
+			full_len = result->len;
+			skb_trim(result, mmsg_recv_vector->msg_len - dpri->header_size);
+			if (uml_gre_verify_header(iov->iov_base, result, dpri)) {
+				if ((!dpri->remote_addr) && (mmsg_recv_vector->msg_hdr.msg_name)) {
+					/* no peer configured yet: adopt the sender's address */
+					dpri->remote_addr = mmsg_recv_vector->msg_hdr.msg_name;
+					dpri->remote_addr_size =
+						mmsg_recv_vector->msg_hdr.msg_namelen;
+					mmsg_recv_vector->msg_hdr.msg_namelen = sizeof (struct sockaddr_storage);
+				}
+				result->protocol = (*lp->protocol)(result);
+				/* replace the buffer we just (ab)used */
+				(* skb_recv_vector) = uml_net_build_skb(lp->dev);
+				add_skbuffs(mmsg_recv_vector, skb_recv_vector, 1, lp->max_packet, 1);
+				return result;
+			}
+			/* rejected: undo the trim so the slot is reused at full size */
+			skb_put(result, full_len - result->len);
+		}
+		if (mmsg_recv_vector->msg_hdr.msg_name) {
+			/* reset size */
+			mmsg_recv_vector->msg_hdr.msg_namelen =
+				sizeof (struct sockaddr_storage);
+		}
+		skb_recv_vector ++;
+		mmsg_recv_vector ++;
+	}
+	return NULL;
+}
+
+/*
+ * Single-packet receive. Returns the payload length placed in skb->data,
+ * 0 if the frame was rejected, or the recvmsg result if that was <= 0.
+ */
+static int uml_gre_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	struct uml_gre_data *dpri = (struct uml_gre_data *) &lp->user;
+	uint8_t *buffer = dpri->network_buffer;
+	int result;
+	/*
+	 * header_size is the GRE header plus, on IPv4 only, the IP header
+	 * that the raw socket hands back; network_buffer is allocated to
+	 * exactly that size. Using it keeps the IPv4/IPv6 decision in one
+	 * place (GRE_MODE_IP_VERSION set means IPv6: no IP header).
+	 */
+	int offset = dpri->header_size;
+
+	result = uml_gre_user_recvmsg(
+		fd,
+		buffer, offset,
+		skb->data, skb->dev->mtu + ETH_HEADER_OTHER,
+		dpri
+		);
+	if (result <= 0) {
+		return result;
+	}
+
+	/*
+	 * uml_gre_verify_header() steps over the IPv4 header itself, so it
+	 * gets the start of the buffer; advancing here as well would make it
+	 * look past the GRE header and beyond the end of network_buffer.
+	 */
+	if ((result > offset) && (uml_gre_verify_header(buffer, skb, dpri))) {
+		return result - offset;
+	}
+	return 0;
+}
+
+/* Build the outgoing GRE header in header_buffer as selected by pri->mode. */
+static void uml_gre_form_header(uint8_t * header_buffer,
+	struct sk_buff* skb,
+	struct uml_gre_data *pri)
+{
+	struct gre_minimal_header *gre = (struct gre_minimal_header *) header_buffer;
+	uint32_t *field;
+	__wsum sum;
+
+	if (header_buffer == NULL)
+		return;
+
+	/* flag nibble in the top four bits, then the payload type */
+	gre->header = htons((pri->mode & 0xF)<<12);
+	gre->arptype = GRE_IRB;
+
+	if (pri->mode & GRE_MODE_SEQUENCE) {
+		field = (uint32_t *)(header_buffer + pri->sequence_offset);
+		*field = htonl(++pri->sequence);
+	}
+	if (pri->mode & GRE_MODE_KEY) {
+		field = (uint32_t *)(header_buffer + pri->key_offset);
+		*field = pri->tx_key; /* we will keep 'em htonled */
+	}
+
+	/* TODO: The methodology here should be:
+	 * 1. Report the driver as NETIF_F_HW_CSUM
+	 * 2. We will get a start csum and an end csum and where to put it
+	 * 3. Compute the csum, stash it
+	 * 4. Write where we are told
+	 * 5. Determine what else do we need to csum on either side of the HW_CSUM instructions
+	 * 6. Adjust for the fact that we may have modified the packet as part of csum computation
+	 * 7. Store the newly computed gre csum
+	 * In the meantime we are just doing brute force on xmit
+	 */
+	if (!(pri->mode & GRE_MODE_CHECKSUM))
+		return;
+	/* clear the whole 32-bit checksum word before summing payload and header */
+	field = (uint32_t *) (header_buffer + pri->checksum_offset);
+	*field = 0;
+	sum = csum_partial(skb->data,skb->len, 0);
+	sum = csum_partial(header_buffer, pri->offset, sum);
+	*((uint16_t *) field) = csum_fold(sum);
+}
+
+/* Allocate and set up the send queue state for the device dev_id. */
+void gre_complete_init(void * dev_id, int max_depth) {
+	struct net_device *dev = dev_id;
+	struct uml_net_private *lp = netdev_priv(dev);
+	struct uml_gre_data *pri = (struct uml_gre_data *) &lp->user;
+	struct mmsg_queue_info * queue_info ;
+
+	queue_info = kmalloc(sizeof(struct mmsg_queue_info), GFP_KERNEL);
+	if (queue_info) {
+		queue_info->fd = pri->fd;
+		queue_info->mmsg_send_vector = pri->mmsg_send_vector;
+		queue_info->skb_send_vector = pri->skb_send_vector;
+		queue_info->head = 0;
+		queue_info->tail = 0;
+		queue_info->queue_depth = 0;
+		queue_info->max_depth = max_depth;
+		spin_lock_init(&queue_info->head_lock);
+		spin_lock_init(&queue_info->tail_lock);
+	} else {
+		printk(KERN_ERR "gre_complete_init: could not allocate the send queue\n");
+	}
+	pri->send_queue_info = queue_info; /* stays NULL if the allocation failed */
+}
+
+/* Flush and free the send queue; does nothing when send_queue_info is NULL. */
+void gre_kern_destroy(struct uml_gre_data *pri) {
+	struct mmsg_queue_info * queue_info = pri->send_queue_info;
+	if (!queue_info)
+		return; /* e.g. teardown before gre_complete_init() ran, or its kmalloc failed */
+	/* flush queue: repeat until the helper returns 0 */
+	while (uml_net_flush_mmsg_queue(queue_info, 1) != 0)
+		;
+	pri->send_queue_info = NULL;
+	kfree(queue_info);
+}
+
+/* Adapter: lets uml_gre_form_header() be passed to uml_net_enqueue() as its header callback. */
+static void unified_form_header (void * header, struct sk_buff * skb, struct uml_net_private * lp) {
+	uml_gre_form_header(header, skb, (struct uml_gre_data *) &lp->user);
+}
+
+
+/* Vector transmit: queue the frame for the known peer and kick the send queue. */
+static int uml_gre_multiwrite(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	struct uml_gre_data *pri = (struct uml_gre_data *) &lp->user;
+	struct mmsg_queue_info *queue = pri->send_queue_info;
+	/* read the length now: the skb is handed to the queue below */
+	int len = skb->len;
+	int queue_depth;
+
+	/* no peer yet, or the queue allocation failed: nothing to send on */
+	if ((!pri->remote_addr) || (!queue))
+		return len; /* not particularly correct */
+
+	queue_depth = uml_net_enqueue (
+		queue,
+		skb,
+		lp,
+		unified_form_header,
+		pri->remote_addr,
+		pri->remote_addr_size
+		);
+
+	uml_net_flush_mmsg_queue(queue, queue_depth);
+	return len; /* not particularly correct */
+}
+
+/* Single-packet transmit: build the GRE header in network_buffer, then send header + frame. */
+static int uml_gre_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	struct uml_gre_data *pri = (struct uml_gre_data *) &lp->user;
+	uint8_t *buffer = pri->network_buffer;
+	/* signed copy: comparing an int with the uint32_t field would treat error returns as huge values */
+	int offset = pri->offset;
+	int result;
+
+	uml_gre_form_header(buffer, skb, pri);
+
+	result = uml_gre_user_sendmsg(
+		fd,
+		buffer, offset,
+		skb->data, skb->len,
+		pri
+		);
+
+	if (result > offset) {
+		return result - offset;
+	} else {
+		return result; /* not particularly correct */
+	}
+}
+
+static const struct net_kern_info uml_gre_kern_info = { /* kernel-side callbacks of the gre transport */
+ .options = UML_NET_USE_SKB_READ,
+ .init = uml_gre_init,
+ .protocol = eth_protocol,
+ .read = uml_gre_read, /* one recvmsg per frame */
+ .skb_read = uml_gre_multiread, /* batched receive via net_recvmmsg() */
+#ifdef CONFIG_UML_NET_VECTOR_TX
+ .write = uml_gre_multiwrite, /* queued transmit */
+#else
+ .write = uml_gre_write, /* one sendmsg per frame */
+#endif
+
+};
+
+static int uml_gre_setup(char *str, char **mac_out, void *data)
+{ /* fills *data (a struct uml_gre_init) from the interface spec; always returns 1 */
+ struct uml_gre_init *init = data;
+ char *remain;
+ /* defaults; members not named in the compound literal start out NULL */
+ *init = (
+ (struct uml_gre_init)
+ {
+ .local_addr_string = "::1",
+ .mode_string = "0",
+ }
+ );
+ /* NOTE(review): the default local address is IPv6 but default mode 0 selects IPv4 - confirm */
+ remain = split_if_spec(str,
+ mac_out,
+ &init->local_addr_string,
+ &init->remote_addr_string,
+ &init->rx_key_string,
+ &init->tx_key_string,
+ &init->mode_string,
+ NULL
+ );
+ if (remain != NULL)
+ printk(KERN_WARNING " Strange interface spec \n");
+ return 1;
+}
+
+static struct transport uml_gre_transport = { /* registration record for the "gre" transport */
+	.list = LIST_HEAD_INIT(uml_gre_transport.list),
+	.name = "gre",
+	.setup = uml_gre_setup,
+	.user = &uml_gre_user_info,
+	.kern = &uml_gre_kern_info,
+	.private_size = sizeof(struct uml_gre_data),
+	.setup_size = sizeof(struct uml_gre_init),
+};
+
+static int register_uml_gre(void)
+{
+	register_transport(&uml_gre_transport); /* pass the "gre" record to register_transport() */
+	return 0;
+}
+
+late_initcall(register_uml_gre);
diff --git a/arch/um/drivers/uml_gre_user.c b/arch/um/drivers/uml_gre_user.c
new file mode 100644
index 0000000..cf5dd5e
--- /dev/null
+++ b/arch/um/drivers/uml_gre_user.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (C) 2012-2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <netinet/ether.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <arpa/inet.h>
+
+#include "uml_gre.h"
+#include "net_user.h"
+#include "os.h"
+#include "um_malloc.h"
+#include "user.h"
+
+#define VECTOR_SIZE 32
+#define PROTO_GRE 47
+
+static int gre_parse_key(char *src , void * dst) {
+ if ((src == NULL) || (sscanf(src, "%x", (unsigned int *) dst) != 1)) {
+ printk(UM_KERN_ERR "cannot parse key!!!: %s\n", src);
+ return -1;
+ }
+ * (( uint32_t *) dst) = htonl(* ((uint32_t* )dst));
+ return 0;
+}
+
+static void uml_gre_remove(void *data)
+{
+ struct uml_gre_data *pri = data;
+
+
+ gre_kern_destroy(pri);
+
+ if (pri->fd > 0) {
+ close(pri->fd);
+ }
+ pri->fd = -1;
+ if (pri->skb_send_vector) {
+ /* this one should be empty - we flushed it so we just free it */
+ kfree(pri->skb_send_vector);
+ pri->skb_send_vector = NULL;
+ }
+ if (pri->mmsg_send_vector) {
+ destroy_mmsg_vector(pri->mmsg_send_vector, VECTOR_SIZE, 1);
+ pri->mmsg_send_vector = NULL;
+ }
+ if (pri->skb_recv_vector) {
+ destroy_skb_vector(pri->skb_recv_vector, VECTOR_SIZE);
+ pri->skb_recv_vector = NULL;
+ }
+ if (pri->mmsg_recv_vector) {
+ destroy_mmsg_vector(pri->mmsg_recv_vector, VECTOR_SIZE, 1);
+ pri->mmsg_recv_vector = NULL;
+ }
+ if (pri->network_buffer) {
+ kfree(pri->network_buffer);
+ pri->network_buffer = NULL;
+ }
+}
+
+/* Userspace-side init: parse keys and addresses, open and bind the raw GRE socket, set up the vectors. */
+static int uml_gre_user_init(void *data, void *dev)
+{
+	struct uml_gre_data *pri = data;
+	int fd;
+	int sock_family;
+	int ret;
+	struct addrinfo hints;
+	struct addrinfo *result;
+	char service[NI_MAXSERV];
+	struct mmsghdr * mmsghdr;
+
+	printk(UM_KERN_INFO "gre user init mode %i\n", pri->mode);
+
+	pri->offset = sizeof(struct gre_minimal_header);
+	pri->checksum_offset = pri->offset;
+	pri->key_offset = pri->offset;
+	pri->sequence_offset = pri->offset;
+
+	pri->fd = -1;
+
+	if (pri->mode & GRE_MODE_CHECKSUM) {
+		pri->offset += 4;
+		pri->key_offset += 4;
+		pri->sequence_offset += 4;
+	}
+
+	if (pri->mode & GRE_MODE_KEY) {
+		pri->offset += 4;
+		pri->sequence_offset +=4;
+		pri->tx_key = 0;
+		pri->rx_key = 0;
+		if (gre_parse_key(pri->tx_key_string,&pri->tx_key) !=0) {
+			return -1;
+		}
+		if (gre_parse_key(pri->rx_key_string,&pri->rx_key) !=0) {
+			return -1;
+		}
+	}
+
+	if (pri->mode & GRE_MODE_SEQUENCE) {
+		pri->offset += 4;
+	}
+
+	/* basic variable parsing */
+	pri->remote_addr = NULL;
+	if (pri->remote_addr_string) {
+		/* we now allocate it only if it we are not "listening" */
+		pri->remote_addr = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_KERNEL);
+		if (!pri->remote_addr)
+			return -ENOMEM; /* do not silently fall back to listening mode */
+	}
+
+	if (pri->mode & GRE_MODE_IP_VERSION) {
+		/* IPv6 */
+		sock_family = AF_INET6;
+	} else {
+		/* IPv4 */
+		sock_family = AF_INET;
+	}
+
+	printk(UM_KERN_ERR "uml_gre_user_init: preparing raw socket for mode %x\n ", pri->mode);
+
+	memset(&hints, 0, sizeof(hints));
+
+	hints.ai_flags = AI_PASSIVE;
+	hints.ai_family = sock_family;
+	hints.ai_socktype = SOCK_RAW;
+	hints.ai_protocol = PROTO_GRE;
+
+	if ((fd = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol)) == -1) {
+		fd = -errno;
+		printk(UM_KERN_ERR "uml_gre_user_init: socket creation failed, "
+			"errno = %d\n", -fd);
+		return fd;
+	}
+
+	pri->fd = fd;
+
+	memset(service, '\0', NI_MAXSERV);
+	ret = getaddrinfo(pri->local_addr_string, service, &hints, &result);
+	if ((ret != 0) || (result == NULL)) {
+		printk(UM_KERN_ERR "uml_gre_user_init: Unable to parse the local endpoint: %d\n", ret);
+		uml_gre_remove(pri);
+		return -1;
+	}
+	if (bind(fd, (struct sockaddr *)result->ai_addr, result->ai_addrlen)) {
+		printk("uml_gre_user_init: could not bind socket: %d\n", errno);
+		freeaddrinfo(result);
+		uml_gre_remove(pri);
+		return -1;
+	}
+
+	printk("uml_gre_user_init: socket bound\n");
+	freeaddrinfo(result);
+
+	if (pri->remote_addr) {
+		memset(service, '\0', NI_MAXSERV);
+		memset(&hints, 0, sizeof(hints));
+
+		hints.ai_flags = AI_PASSIVE;
+		hints.ai_family = sock_family;
+		hints.ai_socktype = SOCK_RAW;
+		hints.ai_protocol = PROTO_GRE;
+
+		ret = getaddrinfo(pri->remote_addr_string, service, &hints, &result);
+
+		if ((ret != 0) || (result == NULL)) {
+			printk(UM_KERN_ERR "uml_gre_user_init: Unable to parse the remote endpoint: %d\n", ret);
+			uml_gre_remove(pri);
+			return -1;
+		}
+
+		memset(pri->remote_addr, '\0' , sizeof(struct sockaddr_storage));
+		memcpy(pri->remote_addr, result->ai_addr, result->ai_addrlen);
+		pri->remote_addr_size = result->ai_addrlen;
+		freeaddrinfo(result);
+	}
+
+	/* vector IO init */
+
+	pri->vector_len = VECTOR_SIZE;
+	pri->recv_index = 0;
+	pri->recv_enqueued = 0;
+	pri->header_size = pri->offset /* fix for ipv4 raw */;
+
+	if (!(pri->mode & GRE_MODE_IP_VERSION)){
+		pri->header_size += sizeof(struct iphdr) /* fix for ipv4 raw */;
+	}
+
+	pri->skb_recv_vector = build_skbuf_vector(VECTOR_SIZE, dev);
+	if (! pri->skb_recv_vector) {
+		uml_gre_remove(pri);
+		return -1;
+	}
+	pri->mmsg_recv_vector = build_mmsg_vector(VECTOR_SIZE, 2);
+	if (! pri->mmsg_recv_vector) {
+		uml_gre_remove(pri);
+		return -1;
+	}
+	add_header_buffers(pri->mmsg_recv_vector, VECTOR_SIZE, pri->header_size);
+	add_skbuffs(
+		pri->mmsg_recv_vector,
+		pri->skb_recv_vector,
+		VECTOR_SIZE, ETH_MAX_PACKET + ETH_HEADER_OTHER,
+		1
+		);
+
+	pri->skb_send_vector = uml_kmalloc(VECTOR_SIZE * sizeof(void *), UM_GFP_KERNEL);
+	if (pri->skb_send_vector) {
+		memset(pri->skb_send_vector, 0, sizeof(void *) * VECTOR_SIZE);
+	} else {
+		uml_gre_remove(pri);
+		return -1;
+	}
+	pri->mmsg_send_vector = build_mmsg_vector(VECTOR_SIZE, 2);
+	if (! pri->mmsg_send_vector) {
+		uml_gre_remove(pri);
+		return -1;
+	}
+	add_header_buffers(pri->mmsg_send_vector, VECTOR_SIZE, pri->offset);
+
+	pri->network_buffer = uml_kmalloc(pri->header_size, UM_GFP_KERNEL); /* enough for any header, regardless how stupid */
+
+	if (!pri->network_buffer) {
+		printk("uml_gre_user_init: could not allocate buffer\n");
+		uml_gre_remove(pri); /* same teardown as the other error paths; also resets pri->fd */
+		return -1;
+	}
+
+	if (!pri->remote_addr) {
+		mmsghdr = (struct mmsghdr *) pri->mmsg_recv_vector;
+		mmsghdr->msg_hdr.msg_name = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_KERNEL);
+		if (mmsghdr->msg_hdr.msg_name) {
+			mmsghdr->msg_hdr.msg_namelen = sizeof(struct sockaddr_storage);
+		} else {
+			printk("uml_gre_user_init: Failed to allocate remote address name\n");
+		}
+	}
+
+	pri->dev = dev;
+
+	gre_complete_init(dev, VECTOR_SIZE); /* we really need error checking here */
+
+	if (pri->fd < 0) {
+		return pri->fd;
+	}
+
+	printk("uml_gre_user_init: init complete, fd %i\n", fd);
+
+	return 0;
+}
+
+static int uml_gre_open(void *data)
+{
+	/* the socket is created by uml_gre_user_init(); just report its descriptor */
+	return ((struct uml_gre_data *) data)->fd;
+}
+
+
+/* Send header + data to pri->remote_addr as one non-blocking message; -1 if no peer is known. */
+int uml_gre_user_sendmsg(int fd,
+	void *header, int headerlen, void *data,
+	int datalen, struct uml_gre_data *pri)
+{
+	struct iovec vec[2];
+	struct msghdr message;
+
+	if (pri->remote_addr == NULL)
+		return -1;
+
+	/* two-element gather list: GRE header first, then the frame */
+	vec[0].iov_base = header;
+	vec[0].iov_len = headerlen;
+	vec[1].iov_base = data;
+	vec[1].iov_len = datalen;
+
+	message.msg_name = pri->remote_addr;
+	message.msg_namelen = pri->remote_addr_size;
+	message.msg_iov = (struct iovec *) &vec;
+	message.msg_iovlen = 2;
+	message.msg_control = NULL;
+	message.msg_controllen = 0;
+	message.msg_flags = MSG_DONTWAIT;
+
+	return net_sendmessage(fd, &message, MSG_DONTWAIT);
+}
+/* Receive one message split into header and data; while no peer is known, learn it from the sender. */
+int uml_gre_user_recvmsg(int fd, void *header, int headerlen, void *data, int datalen, struct uml_gre_data *pri)
+{
+	struct msghdr message;
+	struct iovec vec[2];
+	void *sender = NULL;
+	int ret;
+
+	vec[0].iov_base = header;
+	vec[0].iov_len = headerlen;
+	vec[1].iov_base = data;
+	vec[1].iov_len = datalen;
+
+	if (!pri->remote_addr)
+		sender = uml_kmalloc(sizeof(struct sockaddr_storage), UM_GFP_KERNEL);
+	message.msg_name = sender;
+	message.msg_namelen = sender ? sizeof(struct sockaddr_storage) : 0; /* buffer size on entry */
+	message.msg_iov = (struct iovec *) &vec;
+	message.msg_iovlen = 2;
+	message.msg_control = NULL;
+	message.msg_controllen = 0;
+	message.msg_flags = MSG_DONTWAIT;
+	ret = net_recvmessage(fd, &message, MSG_DONTWAIT);
+	if (sender && (ret > 0) && (message.msg_namelen > 0)) {
+		pri->remote_addr = sender; /* adopt the sender as the peer */
+		pri->remote_addr_size = message.msg_namelen;
+	} else if (sender) {
+		kfree(sender); /* nothing usable received: keep listening */
+	}
+	return ret;
+}
+const struct net_user_info uml_gre_user_info = { /* userspace-side callbacks of the gre transport */
+ .init = uml_gre_user_init,
+ .open = uml_gre_open, /* returns the descriptor stored in pri->fd */
+ .close = NULL,
+ .remove = uml_gre_remove,
+ .add_address = NULL,
+ .delete_address = NULL,
+ .mtu = ETH_MAX_PACKET,
+ .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER + MAX_GRE_HEADER, /* MAX_GRE_HEADER is 16 */
+};
--
1.7.10.4
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread* [uml-devel] [PATCH v3 06/10] RAW Ethernet transport for UML
2014-09-04 19:00 [uml-devel] UML Performance improvement patchset anton.ivanov
` (4 preceding siblings ...)
2014-09-04 19:00 ` [uml-devel] [PATCH v3 05/10] GRE transport " anton.ivanov
@ 2014-09-04 19:00 ` anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 07/10] Performance and NUMA improvements for ubd anton.ivanov
` (4 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: anton.ivanov @ 2014-09-04 19:00 UTC (permalink / raw)
To: user-mode-linux-devel
From: Anton Ivanov <antivano@cisco.com>
This is an alternative to the well known pcap transport.
In the absence of special hardware support pcap is slow,
guaranteed to be slow and with significant penalties on
NUMA/SMP systems due to the timestamping of every packet.
This transport does not incur any of these timestamping
penalties. It reads and writes packets directly using
recvmmsg and sendmmsg calls.
Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
arch/um/Kconfig.net | 12 +++
arch/um/drivers/Makefile | 2 +
arch/um/drivers/uml_raw.h | 52 +++++++++
arch/um/drivers/uml_raw_kern.c | 232 ++++++++++++++++++++++++++++++++++++++++
arch/um/drivers/uml_raw_user.c | 167 +++++++++++++++++++++++++++++
5 files changed, 465 insertions(+)
create mode 100644 arch/um/drivers/uml_raw.h
create mode 100644 arch/um/drivers/uml_raw_kern.c
create mode 100644 arch/um/drivers/uml_raw_user.c
diff --git a/arch/um/Kconfig.net b/arch/um/Kconfig.net
index e372c06..112df79 100644
--- a/arch/um/Kconfig.net
+++ b/arch/um/Kconfig.net
@@ -114,6 +114,18 @@ config UML_NET_GRE
the applicable RFCs. The driver supports Soft GRE (wait for connect)
as used in Cable systems, etc.
+config UML_NET_RAW
+ bool "RAW transport"
+ depends on UML_NET
+ help
+ This User-Mode Linux network transport allows UML to bind a raw
+ Ethernet interface using a high-performance non-capture oriented
+ method to read and write traffic. The difference between this driver
+ and any form of PCAP is that this driver does not incur the cost
+ of getting the timestamp for every packet read. This allows it to
+ reach higher performance levels (in the Gigabit range).
+
+
config UML_NET_DAEMON
bool "Daemon transport"
depends on UML_NET
diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile
index c5427e1..b1c0ab0 100644
--- a/arch/um/drivers/Makefile
+++ b/arch/um/drivers/Makefile
@@ -11,6 +11,7 @@ slirp-objs := slirp_kern.o slirp_user.o
daemon-objs := daemon_kern.o daemon_user.o
uml_l2tpv3-objs := uml_l2tpv3_kern.o uml_l2tpv3_user.o
uml_gre-objs := uml_gre_kern.o uml_gre_user.o
+uml_raw-objs := uml_raw_kern.o uml_raw_user.o
umcast-objs := umcast_kern.o umcast_user.o
net-objs := net_kern.o net_user.o net_extra_user.o net_extra_kern.o
mconsole-objs := mconsole_kern.o mconsole_user.o
@@ -47,6 +48,7 @@ obj-$(CONFIG_UML_NET_SLIRP) += slirp.o slip_common.o
obj-$(CONFIG_UML_NET_DAEMON) += daemon.o
obj-$(CONFIG_UML_NET_L2TPV3) += uml_l2tpv3.o
obj-$(CONFIG_UML_NET_GRE) += uml_gre.o
+obj-$(CONFIG_UML_NET_RAW) += uml_raw.o
obj-$(CONFIG_UML_NET_VDE) += vde.o
obj-$(CONFIG_UML_NET_MCAST) += umcast.o
obj-$(CONFIG_UML_NET_PCAP) += pcap.o
diff --git a/arch/um/drivers/uml_raw.h b/arch/um/drivers/uml_raw.h
new file mode 100644
index 0000000..f85e599
--- /dev/null
+++ b/arch/um/drivers/uml_raw.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __UML_RAW_H__
+#define __UML_RAW_H__
+
+#include "net_user.h"
+
+struct uml_raw_data {
+ char *host_iface; /* host interface name taken from the interface spec */
+ int fd;
+ void *dev;
+ uint32_t uml_raw_flags; /* UML_RAW_FLAG_* bits; cleared by uml_raw_init() */
+
+ /* multi-rx read */
+
+ void ** skb_recv_vector;
+ void * mmsg_recv_vector;
+
+ void ** skb_send_vector;
+ void * mmsg_send_vector;
+ void * send_queue_info; /* struct mmsg_queue_info allocated by raw_complete_init() */
+
+ uint32_t vector_len;
+ uint32_t recv_index; /* next entry of the current receive batch */
+ uint32_t recv_enqueued; /* number of messages returned by the last net_recvmmsg() */
+
+
+};
+
+extern const struct net_user_info uml_raw_user_info;
+
+extern int uml_raw_user_write(int fd, void *buf, int len,
+ struct uml_raw_data *pri);
+
+extern void raw_complete_init(void * dev_id, int max_depth);
+extern void raw_kern_destroy(struct uml_raw_data *pri);
+
+#define UML_RAW_FLAG_TX_CHECKSUMS 0x00000001
+#define UML_RAW_FLAG_RX_CHECKSUMS 0x00000002
+
+
+#define UML_RAW_TP_BLOCK_SIZE 4096
+#define UML_RAW_TP_FRAME_SIZE 2048
+#define UML_RAW_TP_BLOCK_NR 32
+#define UML_RAW_TP_FRAME_NR 64
+
+
+#endif
diff --git a/arch/um/drivers/uml_raw_kern.c b/arch/um/drivers/uml_raw_kern.c
new file mode 100644
index 0000000..700ff01
--- /dev/null
+++ b/arch/um/drivers/uml_raw_kern.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include "linux/init.h"
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/if_packet.h>
+#include "net_kern.h"
+#include "uml_raw.h"
+#include "linux/mutex.h"
+
+#define DRIVER_NAME "uml-raw"
+
+
+struct uml_raw_init { /* setup data filled in by uml_raw_setup() */
+ char *host_iface; /* defaults to "eth0" */
+};
+
+/* ethtool get_drvinfo callback: report driver name and version with bounded copies. */
+static void uml_raw_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+	strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
+	strlcpy(info->version, "42", sizeof(info->version));
+}
+
+
+static const struct ethtool_ops uml_raw_ethtool_ops = { /* installed on the device by uml_raw_init() */
+ .get_drvinfo = uml_raw_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+};
+
+
+/* Kernel-side init: record the host interface name and install the ethtool ops. */
+static void uml_raw_init(struct net_device *dev, void *data)
+{
+	struct uml_net_private *pri = netdev_priv(dev);
+	struct uml_raw_data *dpri = (struct uml_raw_data *) pri->user;
+	struct uml_raw_init *init = data;
+
+	dpri->host_iface = init->host_iface;
+	dpri->fd = -1; /* no socket yet */
+	dpri->dev = dev;
+
+	/* We will free this pointer. If it contains crap we're burned. */
+
+	printk("raw backend - host iface: %s", dpri->host_iface);
+	printk("\n");
+	printk("enabling ethtool support\n");
+	dpri->uml_raw_flags = 0; /* we have everything turned off initially */
+
+	/* hook up the ethtool callbacks (driver info, link state) */
+	SET_ETHTOOL_OPS(dev, &uml_raw_ethtool_ops);
+}
+
+/* Single-packet receive: read one frame from fd into the skb; returns what net_read() returns. */
+static int uml_raw_read(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	int result;
+
+	/* read at most mtu + ETH_HEADER_OTHER bytes, starting at the MAC header */
+	result = net_read(fd, skb_mac_header(skb),
+		skb->dev->mtu + ETH_HEADER_OTHER);
+
+	return result;
+}
+
+static struct sk_buff * uml_raw_multiread (struct uml_net_private * lp) { /* vector receive: next frame of the batch, or NULL */
+ struct uml_raw_data *dpri = (struct uml_raw_data *) &lp->user;
+ void ** skb_vector = dpri->skb_recv_vector;
+ struct mmsghdr * mmsg_vector = (struct mmsghdr *) dpri->mmsg_recv_vector;
+ struct sk_buff * result = NULL;
+ struct iovec * iov;
+ int ret;
+ /* previous batch fully handed out: recycle its slots and receive a new batch */
+ if (dpri->recv_index >= dpri->recv_enqueued) {
+ dpri->recv_index = 0;
+ if (dpri->recv_enqueued) {
+ rebuild_skbuf_vector(skb_vector, dpri->recv_enqueued, lp->dev); /* presumably refills the slots used by the last batch - confirm */
+ add_skbuffs(mmsg_vector, skb_vector, dpri->recv_enqueued, lp->max_packet, 0);
+ }
+ ret = net_recvmmsg(
+ dpri->fd, dpri->mmsg_recv_vector, dpri->vector_len, 0, NULL);
+ if (ret >= 0) {
+ dpri->recv_enqueued = ret;
+ } else {
+ dpri->recv_enqueued = 0; /* receive failed: nothing to recycle on the next call */
+ return NULL;
+ }
+ }
+ skb_vector += dpri->recv_index;
+ mmsg_vector += dpri->recv_index;
+ while (dpri->recv_index < dpri->recv_enqueued) {
+ dpri->recv_index ++;
+ iov = mmsg_vector->msg_hdr.msg_iov;
+ if ((mmsg_vector->msg_len) && (iov)) {
+ result = (struct sk_buff *)(* skb_vector);
+ if (result) {
+ skb_trim(result, mmsg_vector->msg_len); /* cut the skb down to the received length */
+ result->protocol = (*lp->protocol)(result);
+ return result;
+ } else {
+ printk("encountered failed atomic allocation, skipping to next\n");
+ }
+ } else {
+ uml_net_destroy_skb(* skb_vector ) ; /* otherwise we leak it */
+ result = NULL; /* NOTE(review): the slot keeps the destroyed pointer until the vector is rebuilt - confirm this is safe */
+ }
+ mmsg_vector++;
+ skb_vector++;
+ }
+ return result;
+}
+
+/* Allocate and set up the send queue state for the device dev_id. */
+void raw_complete_init(void * dev_id, int max_depth) {
+	struct net_device *dev = dev_id;
+	struct uml_net_private *lp = netdev_priv(dev);
+	struct uml_raw_data *pri = (struct uml_raw_data *) &lp->user;
+	struct mmsg_queue_info * queue_info ;
+
+	queue_info = kmalloc(sizeof(struct mmsg_queue_info), GFP_KERNEL);
+	if (queue_info) {
+		queue_info->fd = pri->fd;
+		queue_info->mmsg_send_vector = pri->mmsg_send_vector;
+		queue_info->skb_send_vector = pri->skb_send_vector;
+		queue_info->head = 0;
+		queue_info->tail = 0;
+		queue_info->queue_depth = 0;
+		queue_info->max_depth = max_depth;
+		spin_lock_init(&queue_info->head_lock);
+		spin_lock_init(&queue_info->tail_lock);
+	} else {
+		printk(KERN_ERR "raw_complete_init: could not allocate the send queue\n");
+	}
+	pri->send_queue_info = queue_info; /* stays NULL if the allocation failed */
+}
+
+
+/* Flush and free the send queue; does nothing when send_queue_info is NULL. */
+void raw_kern_destroy(struct uml_raw_data *pri) {
+	struct mmsg_queue_info * queue_info = pri->send_queue_info;
+	if (!queue_info)
+		return; /* e.g. the kmalloc in raw_complete_init() failed */
+	/* flush queue: repeat until the helper returns 0 */
+	while (uml_net_flush_mmsg_queue(queue_info, -1) != 0)
+		;
+	pri->send_queue_info = NULL;
+	kfree(queue_info);
+}
+
+/* Vector transmit: queue the frame and kick the send queue. */
+static int uml_raw_multiwrite(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	struct uml_raw_data *pri = (struct uml_raw_data *) &lp->user;
+	struct mmsg_queue_info *queue = pri->send_queue_info;
+	/* read the length now: the skb is handed to the queue below */
+	int len = skb->len;
+	int queue_depth;
+
+	/* the queue is NULL if its allocation failed in raw_complete_init() */
+	if (!queue)
+		return len;
+
+	/* raw frames carry no extra header and no destination address */
+	queue_depth = uml_net_enqueue(queue, skb, lp, NULL, NULL, 0);
+
+	uml_net_flush_mmsg_queue(queue, queue_depth);
+
+	return len; /* not particularly correct */
+}
+
+static int uml_raw_write(int fd, struct sk_buff *skb, struct uml_net_private *lp)
+{
+	struct uml_raw_data *pri = (struct uml_raw_data *) &lp->user; /* transport private data */
+	return uml_raw_user_write(fd, skb->data, skb->len, pri);
+}
+
+/* Fill *data (a struct uml_raw_init) from the interface spec; host interface defaults to "eth0". Always returns 1. */
+static int uml_raw_setup(char *str, char **mac_out, void *data)
+{
+	struct uml_raw_init *init = data;
+	char *leftover;
+
+	/* start from the default, then hand the field to split_if_spec() */
+	*init = (struct uml_raw_init) { .host_iface = "eth0" };
+
+	leftover = split_if_spec(str, mac_out, &init->host_iface, NULL);
+	if (leftover != NULL) {
+		printk(KERN_WARNING " Strange interface spec \n");
+	}
+
+	return 1;
+}
+
+
+static const struct net_kern_info uml_raw_kern_info = { /* kernel-side callbacks of the raw transport */
+ .options = UML_NET_USE_SKB_READ,
+ .init = uml_raw_init,
+ .protocol = eth_protocol,
+ .read = uml_raw_read, /* one net_read() per frame */
+ .skb_read = uml_raw_multiread, /* batched receive via net_recvmmsg() */
+#ifdef CONFIG_UML_NET_VECTOR_TX
+ .write = uml_raw_multiwrite, /* queued transmit */
+#else
+ .write = uml_raw_write, /* one uml_raw_user_write() per frame */
+#endif
+
+};
+
+static struct transport uml_raw_transport = { /* registration record for the "raw" transport */
+	.list = LIST_HEAD_INIT(uml_raw_transport.list),
+	.name = "raw",
+	.setup = uml_raw_setup,
+	.user = &uml_raw_user_info,
+	.kern = &uml_raw_kern_info,
+	.private_size = sizeof(struct uml_raw_data),
+	.setup_size = sizeof(struct uml_raw_init),
+};
+
+static int register_uml_raw(void)
+{
+	register_transport(&uml_raw_transport); /* pass the "raw" record to register_transport() */
+	return 0;
+}
+
+late_initcall(register_uml_raw);
diff --git a/arch/um/drivers/uml_raw_user.c b/arch/um/drivers/uml_raw_user.c
new file mode 100644
index 0000000..084d875
--- /dev/null
+++ b/arch/um/drivers/uml_raw_user.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2001 Lennert Buytenhek (buytenh@gnu.org) and
+ * James Leu (jleu@mindspring.net).
+ * Copyright (C) 2001 by various other people who didn't put their name here.
+ * Licensed under the GPL.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <net/ethernet.h>
+#include <netinet/ip.h>
+#include <netinet/ether.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+#include <sys/mman.h>
+
+
+#include "uml_raw.h"
+#include "net_user.h"
+#include "os.h"
+#include "um_malloc.h"
+#include "user.h"
+
+#define VECTOR_SIZE 32
+
+static void uml_raw_remove(void *data)
+{
+ struct uml_raw_data *pri = data;
+
+ raw_kern_destroy(pri);
+
+ if (pri->fd > 0) {
+ close(pri->fd);
+ }
+ pri->fd = -1;
+
+ if (pri->skb_send_vector) {
+ /* this one should be empty - we flushed it so we just free it */
+ kfree(pri->skb_send_vector);
+ pri->skb_send_vector = NULL;
+ }
+ if (pri->mmsg_send_vector) {
+ destroy_mmsg_vector(pri->mmsg_send_vector, VECTOR_SIZE, 0);
+ pri->mmsg_send_vector = NULL;
+ }
+
+ if (pri->skb_recv_vector) {
+ destroy_skb_vector(pri->skb_recv_vector, VECTOR_SIZE);
+ pri->skb_recv_vector = NULL;
+ }
+ if (pri->mmsg_recv_vector) {
+ destroy_mmsg_vector(pri->mmsg_recv_vector, VECTOR_SIZE, 0);
+ pri->mmsg_recv_vector = NULL;
+ }
+}
+
+static int uml_raw_user_init(void *data, void *dev)
+{
+ struct uml_raw_data *pri = data;
+ struct ifreq ifr;
+ int fd;
+ struct sockaddr_ll sock;
+ int err;
+
+ if ((fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL))) == -1) {
+ err = -errno;
+ printk(UM_KERN_ERR "uml_raw_open : raw socket creation failed, "
+ "errno = %d\n", -err);
+ return err;
+ }
+
+ pri->fd = fd;
+ pri->vector_len = VECTOR_SIZE;
+ pri->recv_index = 0;
+ pri->recv_enqueued = 0;
+ pri->skb_recv_vector = build_skbuf_vector(VECTOR_SIZE, dev);
+ if (! pri->skb_recv_vector) {
+ uml_raw_remove(pri);
+ return -1;
+ }
+ pri->mmsg_recv_vector = build_mmsg_vector(VECTOR_SIZE, 1);
+ if (! pri->mmsg_recv_vector) {
+ uml_raw_remove(pri);
+ return -1;
+ }
+ add_skbuffs(
+ pri->mmsg_recv_vector,
+ pri->skb_recv_vector,
+ VECTOR_SIZE, ETH_MAX_PACKET + ETH_HEADER_OTHER,
+ 0
+ );
+
+ pri->skb_send_vector = uml_kmalloc(VECTOR_SIZE * sizeof(void *), UM_GFP_KERNEL);
+ if (pri->skb_send_vector) {
+ memset(pri->skb_send_vector, 0, sizeof(void *) * VECTOR_SIZE);
+ } else {
+ uml_raw_remove(pri);
+ return -1;
+ }
+ pri->mmsg_send_vector = build_mmsg_vector(VECTOR_SIZE, 1);
+ if (! pri->mmsg_send_vector) {
+ uml_raw_remove(pri);
+ return -1;
+ }
+
+ memset(&ifr, 0, sizeof(ifr));
+ strncpy(&ifr.ifr_name, pri->host_iface, sizeof(ifr.ifr_name) - 1);
+ if(ioctl(fd, SIOCGIFINDEX, (void *) &ifr) < 0) {
+ err = -errno;
+ printk(UM_KERN_ERR "SIOCGIFINDEX, failed to get raw interface index for %s", pri->host_iface);
+ uml_raw_remove(pri);
+ return(-1);
+ }
+
+ sock.sll_family = AF_PACKET;
+ sock.sll_protocol = htons(ETH_P_ALL);
+ sock.sll_ifindex = ifr.ifr_ifindex;
+
+ printk(UM_KERN_INFO "uml_raw: binding raw on interface index: %i\n", ifr.ifr_ifindex);
+ if (bind(fd, (struct sockaddr *) &sock, sizeof(struct sockaddr_ll)) < 0) {
+ printk(UM_KERN_ERR "uml_raw: failed to bind raw socket");
+ uml_raw_remove(pri);
+ return(-1);
+ }
+
+ pri->dev = dev;
+
+ raw_complete_init(dev, VECTOR_SIZE); /* we really need error checking here */
+
+ if (pri->fd < 0) {
+ return pri->fd;
+ }
+
+ return 0;
+}
+
+static int uml_raw_open(void *data)
+{
+ struct uml_raw_data *pri = data;
+ return pri->fd;
+}
+
+
+int uml_raw_user_write(int fd, void *buf, int len, struct uml_raw_data *pri)
+{
+ return net_write(fd, buf, len);
+}
+
+const struct net_user_info uml_raw_user_info = {
+ .init = uml_raw_user_init,
+ .open = uml_raw_open,
+ .close = NULL,
+ .remove = uml_raw_remove,
+ .add_address = NULL,
+ .delete_address = NULL,
+ .mtu = ETH_MAX_PACKET,
+ .max_packet = ETH_MAX_PACKET + ETH_HEADER_OTHER,
+};
--
1.7.10.4
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [uml-devel] [PATCH v3 07/10] Performance and NUMA improvements for ubd
2014-09-04 19:00 [uml-devel] UML Performance improvement patchset anton.ivanov
` (5 preceding siblings ...)
2014-09-04 19:00 ` [uml-devel] [PATCH v3 06/10] RAW Ethernet " anton.ivanov
@ 2014-09-04 19:00 ` anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 08/10] Minor performance optimization " anton.ivanov
` (3 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: anton.ivanov @ 2014-09-04 19:00 UTC (permalink / raw)
To: user-mode-linux-devel
From: Anton Ivanov <antivano@cisco.com>
The use of seek()/read() and seek()/write() pairs is a terminal
disease on NUMA. Intense use of them on shared files (e.g.
the master for a COW image) can cause anything up to and including
killing CPUs on unhandled NMIs.
This patch deals with this major UML issue (and one of UML's biggest
performance pitfalls) by replacing each seek-then-read/write pair with
a single pread()/pwrite() call. As a result you can now run (subject to
correct pinning) 2000+ UMLs on a NUMA box without crashing it.
Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
arch/um/drivers/ubd_kern.c | 29 ++++++++---------------------
arch/um/include/shared/os.h | 2 ++
arch/um/os-Linux/file.c | 18 ++++++++++++++++++
3 files changed, 28 insertions(+), 21 deletions(-)
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 1cc72ae5..35ba00b 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012-2014 Cisco Systems
* Copyright (C) 2000 Jeff Dike (jdike@karaya.com)
* Licensed under the GPL
*/
@@ -534,11 +535,7 @@ static int read_cow_bitmap(int fd, void *buf, int offset, int len)
{
int err;
- err = os_seek_file(fd, offset);
- if (err < 0)
- return err;
-
- err = os_read_file(fd, buf, len);
+ err = os_pread_file(fd, buf, len, offset);
if (err < 0)
return err;
@@ -1375,14 +1372,10 @@ static int update_bitmap(struct io_thread_req *req)
if(req->cow_offset == -1)
return 0;
- n = os_seek_file(req->fds[1], req->cow_offset);
- if(n < 0){
- printk("do_io - bitmap lseek failed : err = %d\n", -n);
- return 1;
- }
-
- n = os_write_file(req->fds[1], &req->bitmap_words,
- sizeof(req->bitmap_words));
+ n = os_pwrite_file(req->fds[1], &req->bitmap_words,
+ sizeof(req->bitmap_words),
+ req->cow_offset
+ );
if(n != sizeof(req->bitmap_words)){
printk("do_io - bitmap update failed, err = %d fd = %d\n", -n,
req->fds[1]);
@@ -1426,18 +1419,12 @@ static void do_io(struct io_thread_req *req)
len = (end - start) * req->sectorsize;
buf = &req->buffer[start * req->sectorsize];
- err = os_seek_file(req->fds[bit], off);
- if(err < 0){
- printk("do_io - lseek failed : err = %d\n", -err);
- req->error = 1;
- return;
- }
if(req->op == UBD_READ){
n = 0;
do {
buf = &buf[n];
len -= n;
- n = os_read_file(req->fds[bit], buf, len);
+ n = os_pread_file(req->fds[bit], buf, len, off);
if (n < 0) {
printk("do_io - read failed, err = %d "
"fd = %d\n", -n, req->fds[bit]);
@@ -1447,7 +1434,7 @@ static void do_io(struct io_thread_req *req)
} while((n < len) && (n != 0));
if (n < len) memset(&buf[n], 0, len - n);
} else {
- n = os_write_file(req->fds[bit], buf, len);
+ n = os_pwrite_file(req->fds[bit], buf, len, off);
if(n != len){
printk("do_io - write failed err = %d "
"fd = %d\n", -n, req->fds[bit]);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 17b4e9f..7f544f4 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -142,6 +142,8 @@ extern int os_seek_file(int fd, unsigned long long offset);
extern int os_open_file(const char *file, struct openflags flags, int mode);
extern int os_read_file(int fd, void *buf, int len);
extern int os_write_file(int fd, const void *buf, int count);
+extern int os_pread_file(int fd, void *buf, int len, unsigned long long offset);
+extern int os_pwrite_file(int fd, const void *buf, int count, unsigned long long offset);
extern int os_sync_file(int fd);
extern int os_file_size(const char *file, unsigned long long *size_out);
extern int os_file_modtime(const char *file, unsigned long *modtime);
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index 07a7501..64951fd 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -266,6 +266,24 @@ int os_write_file(int fd, const void *buf, int len)
return n;
}
+int os_pread_file(int fd, void *buf, int len, unsigned long long offset)
+{
+ int n = pread(fd, buf, len, offset);
+
+ if (n < 0)
+ return -errno;
+ return n;
+}
+
+int os_pwrite_file(int fd, const void *buf, int len, unsigned long long offset)
+{
+ int n = pwrite(fd, (void *) buf, len, offset);
+
+ if (n < 0)
+ return -errno;
+ return n;
+}
+
int os_sync_file(int fd)
{
int n = fsync(fd);
--
1.7.10.4
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [uml-devel] [PATCH v3 08/10] Minor performance optimization for ubd
2014-09-04 19:00 [uml-devel] UML Performance improvement patchset anton.ivanov
` (6 preceding siblings ...)
2014-09-04 19:00 ` [uml-devel] [PATCH v3 07/10] Performance and NUMA improvements for ubd anton.ivanov
@ 2014-09-04 19:00 ` anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 09/10] Better IPC for UBD anton.ivanov
` (2 subsequent siblings)
10 siblings, 0 replies; 15+ messages in thread
From: anton.ivanov @ 2014-09-04 19:00 UTC (permalink / raw)
To: user-mode-linux-devel
From: Anton Ivanov <antivano@cisco.com>
Obvious performance optimization - it is not necessary
to read the requests one at a time in the IRQ handler; all
pending requests can be read in one call and handled in bulk.
Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
arch/um/drivers/ubd_kern.c | 29 ++++++++++++++++++++++-------
1 file changed, 22 insertions(+), 7 deletions(-)
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 35ba00b..8568290 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -443,6 +443,8 @@ static void do_ubd_request(struct request_queue * q);
static int thread_fd = -1;
static LIST_HEAD(restart);
+static struct io_thread_req * ubd_request_list[MAX_SG];
+
/* XXX - move this inside ubd_intr. */
/* Called without dev->lock held, and only in interrupt context. */
static void ubd_handler(void)
@@ -451,21 +453,34 @@ static void ubd_handler(void)
struct ubd *ubd;
struct list_head *list, *next_ele;
unsigned long flags;
- int n;
+ int n, i;
+
+ /*
+ * obvious optimization - we do not need to read the reqs one at a time
+ * we can read all pending reqs in one interrupt and handle them in bulk
+ */
while(1){
- n = os_read_file(thread_fd, &req,
- sizeof(struct io_thread_req *));
- if(n != sizeof(req)){
+ do {
+ n = os_read_file(thread_fd, &ubd_request_list,
+ sizeof(struct io_thread_req *) * MAX_SG);
+ } while (n == -EINTR);
+ if(n < 0){
if(n == -EAGAIN)
break;
printk(KERN_ERR "spurious interrupt in ubd_handler, "
"err = %d\n", -n);
return;
+ } else if (n % sizeof(struct io_thread_req *) != 0) {
+ printk(KERN_ERR "spurious interrupt in ubd_handler, "
+ "err = %d\n", -n);
+ return;
+ }
+ for (i = 0; i < n / sizeof(struct io_thread_req *); i++) {
+ req = ubd_request_list[i];
+ blk_end_request(req->req, 0, req->length);
+ kfree(req);
}
-
- blk_end_request(req->req, 0, req->length);
- kfree(req);
}
list_for_each_safe(list, next_ele, &restart){
--
1.7.10.4
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [uml-devel] [PATCH v3 09/10] Better IPC for UBD
2014-09-04 19:00 [uml-devel] UML Performance improvement patchset anton.ivanov
` (7 preceding siblings ...)
2014-09-04 19:00 ` [uml-devel] [PATCH v3 08/10] Minor performance optimization " anton.ivanov
@ 2014-09-04 19:00 ` anton.ivanov
2014-09-04 19:00 ` [uml-devel] [PATCH v3 10/10] High Resolution Timer subsystem for UML anton.ivanov
2014-09-04 19:14 ` [uml-devel] UML Performance improvement patchset Richard Weinberger
10 siblings, 0 replies; 15+ messages in thread
From: anton.ivanov @ 2014-09-04 19:00 UTC (permalink / raw)
To: user-mode-linux-devel
From: Anton Ivanov <antivano@cisco.com>
socketpair() is a better IPC choice for lots of small requests
as it allows deeper (and configurable) queues than pipe().
As a result, UBD will process nearly all of the requests submitted
to it instead of bouncing a significant percentage under load.
Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
arch/um/drivers/ubd_kern.c | 2 +-
arch/um/drivers/ubd_user.c | 4 ++--
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 8568290..80f8655 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1268,7 +1268,7 @@ static bool submit_request(struct io_thread_req *io_req, struct ubd *dev)
int n = os_write_file(thread_fd, &io_req,
sizeof(io_req));
if (n != sizeof(io_req)) {
- if (n != -EAGAIN)
+ if ((n != -EAGAIN) && (n != -ENOBUFS))
printk("write to io thread failed, "
"errno = %d\n", -n);
else if (list_empty(&dev->restart))
diff --git a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c
index e376f9b..6833fc5 100644
--- a/arch/um/drivers/ubd_user.c
+++ b/arch/um/drivers/ubd_user.c
@@ -1,4 +1,4 @@
-/*
+/*
* Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
* Copyright (C) 2001 Ridgerun,Inc (glonnon@ridgerun.com)
* Licensed under the GPL
@@ -25,7 +25,7 @@ int start_io_thread(unsigned long sp, int *fd_out)
{
int pid, fds[2], err;
- err = os_pipe(fds, 1, 1);
+ err = socketpair(AF_UNIX, SOCK_STREAM, 0, (int *) &fds);
if(err < 0){
printk("start_io_thread - os_pipe failed, err = %d\n", -err);
goto out;
--
1.7.10.4
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [uml-devel] [PATCH v3 10/10] High Resolution Timer subsystem for UML
2014-09-04 19:00 [uml-devel] UML Performance improvement patchset anton.ivanov
` (8 preceding siblings ...)
2014-09-04 19:00 ` [uml-devel] [PATCH v3 09/10] Better IPC for UBD anton.ivanov
@ 2014-09-04 19:00 ` anton.ivanov
2014-09-04 19:14 ` [uml-devel] UML Performance improvement patchset Richard Weinberger
10 siblings, 0 replies; 15+ messages in thread
From: anton.ivanov @ 2014-09-04 19:00 UTC (permalink / raw)
To: user-mode-linux-devel
From: Anton Ivanov <antivano@cisco.com>
This patch adds an extra timer source which has correct timing
and uses an up-to-date OS API (POSIX timer_create()/timer_settime()).
Results - correct kernel behaviour on timer related tasks.
1. Improvement in network performance (TCP state machines
are now fed correct time).
2. Correct QoS and traffic shaping.
This improvement does not (and cannot) fix UML userspace. Its
timer/time related behaviour is heavily dependent on getting
VTALRM pacing which is instantiated on a per userspace thread
basis. This patch does not fix this!!! It sorts out only the
kernel side - forwarding, qos, tcp, etc.
Signed-off-by: Anton Ivanov <antivano@cisco.com>
---
arch/um/Makefile | 2 +-
arch/um/include/asm/irq.h | 3 +-
arch/um/include/shared/kern_util.h | 1 +
arch/um/include/shared/os.h | 5 +
arch/um/include/shared/timer-internal.h | 19 +++
arch/um/kernel/irq.c | 11 ++
arch/um/kernel/process.c | 9 +-
arch/um/kernel/time.c | 42 +++++--
arch/um/os-Linux/signal.c | 49 +++++++-
arch/um/os-Linux/skas/process.c | 22 ++--
arch/um/os-Linux/time.c | 196 +++++++++++++++++++++++--------
11 files changed, 277 insertions(+), 82 deletions(-)
create mode 100644 arch/um/include/shared/timer-internal.h
diff --git a/arch/um/Makefile b/arch/um/Makefile
index 133f7de..9864fb7 100644
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -121,7 +121,7 @@ export LDS_ELF_FORMAT := $(ELF_FORMAT)
# The wrappers will select whether using "malloc" or the kernel allocator.
LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc
-LD_FLAGS_CMDLINE = $(foreach opt,$(LDFLAGS),-Wl,$(opt))
+LD_FLAGS_CMDLINE = $(foreach opt,$(LDFLAGS),-Wl,$(opt)) -lrt
# Used by link-vmlinux.sh which has special support for um link
export CFLAGS_vmlinux := $(LINK-y) $(LINK_WRAPS) $(LD_FLAGS_CMDLINE)
diff --git a/arch/um/include/asm/irq.h b/arch/um/include/asm/irq.h
index be9128b..4dd2f07 100644
--- a/arch/um/include/asm/irq.h
+++ b/arch/um/include/asm/irq.h
@@ -22,8 +22,9 @@
#define TELNETD_IRQ UM_END_ETH_IRQ + 7
#define XTERM_IRQ UM_END_ETH_IRQ + 8
#define RANDOM_IRQ UM_END_ETH_IRQ + 9
+#define HRTIMER_IRQ UM_END_ETH_IRQ + 10
-#define LAST_IRQ RANDOM_IRQ
+#define LAST_IRQ HRTIMER_IRQ
#define NR_IRQS (LAST_IRQ + 1)
#endif
diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h
index 83a91f9..0282b36 100644
--- a/arch/um/include/shared/kern_util.h
+++ b/arch/um/include/shared/kern_util.h
@@ -37,6 +37,7 @@ extern void initial_thread_cb(void (*proc)(void *), void *arg);
extern int is_syscall(unsigned long addr);
extern void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs);
+extern void hrtimer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs);
extern int start_uml(void);
extern void paging_init(void);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 7f544f4..d4fefb9 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -222,6 +222,7 @@ extern char *get_umid(void);
/* signal.c */
extern void timer_init(void);
+extern void uml_hrtimer_init(void);
extern void set_sigstack(void *sig_stack, int size);
extern void remove_sigstack(void);
extern void set_handler(int sig);
@@ -245,8 +246,12 @@ extern void idle_sleep(unsigned long long nsecs);
extern int set_interval(void);
extern int timer_one_shot(int ticks);
extern long long disable_timer(void);
+extern long long timer_remain(void);
extern void uml_idle_timer(void);
+extern long long persistent_clock_emulation(void);
extern long long os_nsecs(void);
+extern long long os_vnsecs(void);
+extern int itimer_init(void);
/* skas/mem.c */
extern long run_syscall_stub(struct mm_id * mm_idp,
diff --git a/arch/um/include/shared/timer-internal.h b/arch/um/include/shared/timer-internal.h
new file mode 100644
index 0000000..3e78d83
--- /dev/null
+++ b/arch/um/include/shared/timer-internal.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Licensed under the GPL
+ */
+
+#ifndef __TIMER_INTERNAL_H__
+#define __TIMER_INTERNAL_H__
+
+#define TIMER_MULTIPLIER 256
+#define TIMER_MIN_DELTA 500
+
+extern void timer_lock(void);
+extern void timer_unlock(void);
+
+extern long long hrtimer_disable(void);
+extern long long tracingtimer_disable(void);
+
+#endif
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index a67a551..26f29d1 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -531,11 +531,22 @@ static struct irq_chip SIGVTALRM_irq_type = {
.irq_unmask = dummy,
};
+static struct irq_chip SIGUSR2_irq_type = {
+ .name = "SIGUSR2",
+ .irq_disable = dummy,
+ .irq_enable = dummy,
+ .irq_ack = dummy,
+ .irq_mask = dummy,
+ .irq_unmask = dummy,
+};
+
+
void __init init_IRQ(void)
{
int i;
irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq);
+ irq_set_chip_and_handler(HRTIMER_IRQ, &SIGUSR2_irq_type, handle_edge_irq);
for (i = 1; i < NR_IRQS - 1 ; i++)
irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq);
os_setup_epoll(MAX_EPOLL_EVENTS);
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index bbcef52..3daf56b 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -27,6 +27,7 @@
#include <kern_util.h>
#include <os.h>
#include <skas.h>
+#include <timer-internal.h>
/*
* This is a per-cpu array. A processor only modifies its entry and it only
@@ -215,7 +216,13 @@ void arch_cpu_idle(void)
unsigned long long nsecs;
cpu_tasks[current_thread_info()->cpu].pid = os_getpid();
- nsecs = disable_timer();
+
+ /* there is no benefit whatsoever in disabling a pending
+ * hrtimer and setting a nanowait for the same value instead
+ * so we do timer disable + wait only for the tracing one here
+ */
+
+ nsecs = tracingtimer_disable();
idle_sleep(nsecs);
local_irq_enable();
}
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index 117568d..e714ecf 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012-2014 Cisco Systems
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -12,6 +13,8 @@
#include <asm/param.h>
#include <kern_util.h>
#include <os.h>
+#include <timer-internal.h>
+
void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
{
@@ -22,6 +25,15 @@ void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
local_irq_restore(flags);
}
+void hrtimer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ do_IRQ(HRTIMER_IRQ, regs);
+ local_irq_restore(flags);
+}
+
static void itimer_set_mode(enum clock_event_mode mode,
struct clock_event_device *evt)
{
@@ -44,7 +56,7 @@ static void itimer_set_mode(enum clock_event_mode mode,
static int itimer_next_event(unsigned long delta,
struct clock_event_device *evt)
{
- return timer_one_shot(delta + 1);
+ return timer_one_shot(delta);
}
static struct clock_event_device itimer_clockevent = {
@@ -54,8 +66,11 @@ static struct clock_event_device itimer_clockevent = {
.features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
.set_mode = itimer_set_mode,
.set_next_event = itimer_next_event,
- .shift = 32,
+ .shift = 0,
+ .max_delta_ns = 0xffffffff,
+ .min_delta_ns = TIMER_MIN_DELTA, //microsecond resolution should be enough for anyone, same as 640K RAM
.irq = 0,
+ .mult = 1,
};
static irqreturn_t um_timer(int irq, void *dev)
@@ -67,7 +82,7 @@ static irqreturn_t um_timer(int irq, void *dev)
static cycle_t itimer_read(struct clocksource *cs)
{
- return os_nsecs() / 1000;
+ return os_nsecs() / TIMER_MULTIPLIER;
}
static struct clocksource itimer_clocksource = {
@@ -82,17 +97,21 @@ static void __init setup_itimer(void)
{
int err;
- err = request_irq(TIMER_IRQ, um_timer, 0, "timer", NULL);
+ err = request_irq(TIMER_IRQ, um_timer, IRQF_DISABLED, "timer", NULL);
+ if (err != 0)
+ printk(KERN_ERR "register_timer : request_irq failed - "
+ "errno = %d\n", -err);
+ err = request_irq(HRTIMER_IRQ, um_timer, IRQF_DISABLED, "hr timer", NULL);
if (err != 0)
printk(KERN_ERR "register_timer : request_irq failed - "
"errno = %d\n", -err);
+ err = itimer_init();
+
+ if (err != 0)
+ printk(KERN_ERR "init itimer failed - "
+ "errno = %d\n", -err);
- itimer_clockevent.mult = div_sc(HZ, NSEC_PER_SEC, 32);
- itimer_clockevent.max_delta_ns =
- clockevent_delta2ns(60 * HZ, &itimer_clockevent);
- itimer_clockevent.min_delta_ns =
- clockevent_delta2ns(1, &itimer_clockevent);
- err = clocksource_register_hz(&itimer_clocksource, USEC_PER_SEC);
+ err = clocksource_register_hz(&itimer_clocksource, NSEC_PER_SEC/TIMER_MULTIPLIER);
if (err) {
printk(KERN_ERR "clocksource_register_hz returned %d\n", err);
return;
@@ -102,7 +121,7 @@ static void __init setup_itimer(void)
void read_persistent_clock(struct timespec *ts)
{
- long long nsecs = os_nsecs();
+ long long nsecs = persistent_clock_emulation();
set_normalized_timespec(ts, nsecs / NSEC_PER_SEC,
nsecs % NSEC_PER_SEC);
@@ -111,5 +130,6 @@ void read_persistent_clock(struct timespec *ts)
void __init time_init(void)
{
timer_init();
+ uml_hrtimer_init();
late_time_init = setup_itimer;
}
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index 905924b..5edbc8d 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -23,7 +23,8 @@ void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = {
[SIGBUS] = bus_handler,
[SIGSEGV] = segv_handler,
[SIGIO] = sigio_handler,
- [SIGVTALRM] = timer_handler };
+ [SIGVTALRM] = timer_handler,
+ [SIGUSR2] = hrtimer_handler };
static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
{
@@ -38,7 +39,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
}
/* enable signals if sig isn't IRQ signal */
- if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM))
+ if ((sig != SIGIO) && (sig != SIGWINCH) && (sig != SIGVTALRM) && (sig != SIGUSR2))
unblock_signals();
(*sig_info[sig])(sig, si, &r);
@@ -58,6 +59,10 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
#define SIGVTALRM_BIT 1
#define SIGVTALRM_MASK (1 << SIGVTALRM_BIT)
+#define SIGUSR2_BIT 1
+#define SIGUSR2_MASK (1 << SIGUSR2_BIT)
+
+
static int signals_enabled;
static unsigned int signals_pending;
@@ -89,6 +94,17 @@ static void real_alarm_handler(mcontext_t *mc)
timer_handler(SIGVTALRM, NULL, &regs);
}
+static void real_hralarm_handler(mcontext_t *mc)
+{
+ struct uml_pt_regs regs;
+
+ if (mc != NULL)
+ get_regs_from_mc(&regs, mc);
+ regs.is_user = 0;
+ hrtimer_handler(SIGUSR2, NULL, &regs);
+}
+
+
void alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc)
{
int enabled;
@@ -105,11 +121,33 @@ void alarm_handler(int sig, struct siginfo *unused_si, mcontext_t *mc)
set_signals(enabled);
}
+void hralarm_handler(int sig, mcontext_t *mc)
+{
+ int enabled;
+
+ enabled = signals_enabled;
+ if (!signals_enabled) {
+ signals_pending |= SIGUSR2_MASK;
+ return;
+ }
+
+ block_signals();
+
+ real_hralarm_handler(mc);
+ set_signals(enabled);
+}
+
+
void timer_init(void)
{
set_handler(SIGVTALRM);
}
+void uml_hrtimer_init(void)
+{
+ set_handler(SIGUSR2);
+}
+
void set_sigstack(void *sig_stack, int size)
{
stack_t stack = ((stack_t) { .ss_flags = 0,
@@ -129,7 +167,8 @@ static void (*handlers[_NSIG])(int sig, struct siginfo *si, mcontext_t *mc) = {
[SIGIO] = sig_handler,
[SIGWINCH] = sig_handler,
- [SIGVTALRM] = alarm_handler
+ [SIGVTALRM] = alarm_handler,
+ [SIGUSR2] = hralarm_handler
};
@@ -189,6 +228,7 @@ void set_handler(int sig)
sigaddset(&action.sa_mask, SIGVTALRM);
sigaddset(&action.sa_mask, SIGIO);
sigaddset(&action.sa_mask, SIGWINCH);
+ sigaddset(&action.sa_mask, SIGUSR2);
if (sig == SIGSEGV)
flags |= SA_NODEFER;
@@ -283,6 +323,9 @@ void unblock_signals(void)
if (save_pending & SIGVTALRM_MASK)
real_alarm_handler(NULL);
+
+ if (save_pending & SIGUSR2_MASK)
+ real_hralarm_handler(NULL);
}
}
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index d531879..a2fd9407 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -346,8 +346,7 @@ int start_userspace(unsigned long stub_stack)
void userspace(struct uml_pt_regs *regs)
{
- struct itimerval timer;
- unsigned long long nsecs, now;
+ unsigned long long nsecs;
int err, status, op, pid = userspace_pid[0];
/* To prevent races if using_sysemu changes under us.*/
int local_using_sysemu;
@@ -356,13 +355,11 @@ void userspace(struct uml_pt_regs *regs)
/* Handle any immediate reschedules or signals */
interrupt_end();
- if (getitimer(ITIMER_VIRTUAL, &timer))
- printk(UM_KERN_ERR "Failed to get itimer, errno = %d\n", errno);
- nsecs = timer.it_value.tv_sec * UM_NSEC_PER_SEC +
- timer.it_value.tv_usec * UM_NSEC_PER_USEC;
- nsecs += os_nsecs();
-
while (1) {
+
+ nsecs = timer_remain();
+ nsecs += os_nsecs();
+
/*
* This can legitimately fail if the process loads a
* bogus value into a segment register. It will
@@ -434,23 +431,18 @@ void userspace(struct uml_pt_regs *regs)
relay_signal(SIGTRAP, (struct siginfo *)&si, regs);
break;
case SIGVTALRM:
- now = os_nsecs();
- if (now < nsecs)
+ if (nsecs < os_nsecs())
break;
block_signals();
(*sig_info[sig])(sig, (struct siginfo *)&si, regs);
unblock_signals();
- nsecs = timer.it_value.tv_sec *
- UM_NSEC_PER_SEC +
- timer.it_value.tv_usec *
- UM_NSEC_PER_USEC;
- nsecs += os_nsecs();
break;
case SIGIO:
case SIGILL:
case SIGBUS:
case SIGFPE:
case SIGWINCH:
+ case SIGUSR2:
block_signals();
(*sig_info[sig])(sig, (struct siginfo *)&si, regs);
unblock_signals();
diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c
index e9824d5..1dd9c53 100644
--- a/arch/um/os-Linux/time.c
+++ b/arch/um/os-Linux/time.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012-2014 Cisco Systems
* Copyright (C) 2000 - 2007 Jeff Dike (jdike{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -10,13 +11,59 @@
#include <sys/time.h>
#include <kern_util.h>
#include <os.h>
+#include <string.h>
#include "internal.h"
+#include <timer-internal.h>
+
+static timer_t event_high_res_timer = 0;
+
+static inline long long timeval_to_ns(const struct timeval *tv)
+{
+ return ((long long) tv->tv_sec * UM_NSEC_PER_SEC) +
+ tv->tv_usec * UM_NSEC_PER_USEC;
+}
+
+static inline long long timespec_to_ns(const struct timespec *ts)
+{
+ return ((long long) ts->tv_sec * UM_NSEC_PER_SEC) +
+ ts->tv_nsec;
+}
+
+long long persistent_clock_emulation (void) {
+ struct timespec realtime_tp;
+
+ clock_gettime(CLOCK_REALTIME, &realtime_tp);
+ return timespec_to_ns(&realtime_tp);
+}
+
+
+int itimer_init(void) {
+ struct sigevent sev, bbev;
+ sev.sigev_notify = SIGEV_SIGNAL;
+ sev.sigev_signo = SIGUSR2; /* note - hrtimer now has its own signal */
+ sev.sigev_value.sival_ptr = &event_high_res_timer;
+ if (timer_create(
+ CLOCK_MONOTONIC,
+ &sev,
+ &event_high_res_timer) == -1
+ ) {
+ printk("Failed to create Timer");
+ return -1;
+ } else {
+ printk("Event timer ID is 0x%lx\n", (long) event_high_res_timer);
+ }
+ return 0;
+}
+
+/*
+* This is used for tracing and cannot be removed at this point (TODO)
+*/
int set_interval(void)
{
int usec = UM_USEC_PER_SEC / UM_HZ;
struct itimerval interval = ((struct itimerval) { { 0, usec },
- { 0, usec } });
+ { 0, usec } });
if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1)
return -errno;
@@ -24,61 +71,104 @@ int set_interval(void)
return 0;
}
-int timer_one_shot(int ticks)
+long long timer_remain (void)
{
- unsigned long usec = ticks * UM_USEC_PER_SEC / UM_HZ;
- unsigned long sec = usec / UM_USEC_PER_SEC;
struct itimerval interval;
+ long long remain = 0;
+ if (getitimer(ITIMER_VIRTUAL, &interval)) {
+ printk(UM_KERN_ERR "Failed to get itimer, errno = %d\n", errno);
+ } else {
+ remain = timeval_to_ns(&interval.it_value);
+ }
+ return remain;
+}
- usec %= UM_USEC_PER_SEC;
- interval = ((struct itimerval) { { 0, 0 }, { sec, usec } });
+int timer_one_shot(int ticks)
+{
+ struct itimerspec its;
+ unsigned long long nsec;
+ unsigned long sec;
- if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1)
- return -errno;
+
+ nsec = (ticks + 1);
+
+ sec = nsec / UM_NSEC_PER_SEC;
+
+ nsec = nsec % UM_NSEC_PER_SEC;
+
+ its.it_value.tv_sec = nsec / UM_NSEC_PER_SEC;
+ its.it_value.tv_nsec = nsec ;
+
+ its.it_interval.tv_sec = 0;
+ its.it_interval.tv_nsec = 0; // we cheat here
+
+ timer_settime(event_high_res_timer, 0, &its, NULL);
return 0;
}
-/**
- * timeval_to_ns - Convert timeval to nanoseconds
- * @ts: pointer to the timeval variable to be converted
- *
- * Returns the scalar nanosecond representation of the timeval
- * parameter.
- *
- * Ripped from linux/time.h because it's a kernel header, and thus
- * unusable from here.
- */
-static inline long long timeval_to_ns(const struct timeval *tv)
+long long hrtimer_disable(void)
{
- return ((long long) tv->tv_sec * UM_NSEC_PER_SEC) +
- tv->tv_usec * UM_NSEC_PER_USEC;
+ struct itimerspec its;
+
+ memset(&its, 0, sizeof(struct itimerspec));
+ timer_settime(event_high_res_timer, 0, &its, &its);
+
+ return its.it_value.tv_sec * UM_NSEC_PER_SEC + its.it_value.tv_nsec;
+}
+
+long long tracingtimer_disable(void)
+{
+ struct itimerval itv;
+
+ memset(&itv, 0, sizeof(struct itimerval));
+ setitimer(ITIMER_VIRTUAL, &itv, &itv);
+
+ return itv.it_value.tv_sec * UM_NSEC_PER_SEC + itv.it_value.tv_usec * 1000;
}
long long disable_timer(void)
{
- struct itimerval time = ((struct itimerval) { { 0, 0 }, { 0, 0 } });
- long long remain, max = UM_NSEC_PER_SEC / UM_HZ;
+ long long nsec;
+ long long tnsec;
+
+ /*
+ * This is now fixed in the main idle loop so we really kill
+ * both timers here to ensure that UML can exit cleanly and
+ * not die on a spurious SIG_VTALRM
+ */
+
+
+ nsec = hrtimer_disable();
+ tnsec = tracingtimer_disable();
+ if (nsec > tnsec) {
+ return tnsec;
+ } else {
+ return nsec;
+ }
+}
- if (setitimer(ITIMER_VIRTUAL, &time, &time) < 0)
- printk(UM_KERN_ERR "disable_timer - setitimer failed, "
- "errno = %d\n", errno);
+long long os_vnsecs(void)
+{
+ struct timespec ts;
- remain = timeval_to_ns(&time.it_value);
- if (remain > max)
- remain = max;
+ clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&ts);
+ return timespec_to_ns(&ts);
- return remain;
}
long long os_nsecs(void)
{
- struct timeval tv;
- gettimeofday(&tv, NULL);
- return timeval_to_ns(&tv);
+ struct timespec ts;
+
+ clock_gettime(CLOCK_MONOTONIC,&ts);
+ return timespec_to_ns(&ts);
+
}
+
+
#ifdef UML_CONFIG_NO_HZ_COMMON
static int after_sleep_interval(struct timespec *ts)
{
@@ -140,12 +230,12 @@ static int after_sleep_interval(struct timespec *ts)
struct itimerval interval;
/*
- * It seems that rounding can increase the value returned from
- * setitimer to larger than the one passed in. Over time,
- * this will cause the remaining time to be greater than the
- * tick interval. If this happens, then just reduce the first
- * tick to the interval value.
- */
+ * It seems that rounding can increase the value returned from
+ * setitimer to larger than the one passed in. Over time,
+ * this will cause the remaining time to be greater than the
+ * tick interval. If this happens, then just reduce the first
+ * tick to the interval value.
+ */
if (start_usecs > usec)
start_usecs = usec;
@@ -154,7 +244,7 @@ static int after_sleep_interval(struct timespec *ts)
start_usecs = 0;
tv = ((struct timeval) { .tv_sec = start_usecs / UM_USEC_PER_SEC,
- .tv_usec = start_usecs % UM_USEC_PER_SEC });
+ .tv_usec = start_usecs % UM_USEC_PER_SEC });
interval = ((struct itimerval) { { 0, usec }, tv });
if (setitimer(ITIMER_VIRTUAL, &interval, NULL) == -1)
@@ -169,18 +259,24 @@ void idle_sleep(unsigned long long nsecs)
struct timespec ts;
/*
- * nsecs can come in as zero, in which case, this starts a
- * busy loop. To prevent this, reset nsecs to the tick
- * interval if it is zero.
- */
- if (nsecs == 0)
- nsecs = UM_NSEC_PER_SEC / UM_HZ;
-
- nsecs = sleep_time(nsecs);
+ * We sleep here for an interval that is not greater than HZ
+ * We did not disable the timer in "disable" so if there is a timer
+ * active it will wake us up right on time instead of doing
+ * stupid things trying to program nanosleep in a race condition
+ * manner.
+ */
+
+ if ((nsecs == 0) || (nsecs > UM_NSEC_PER_SEC / UM_HZ)) {
+ nsecs = UM_NSEC_PER_SEC / UM_HZ ;
+ }
+
ts = ((struct timespec) { .tv_sec = nsecs / UM_NSEC_PER_SEC,
- .tv_nsec = nsecs % UM_NSEC_PER_SEC });
+ .tv_nsec = nsecs % UM_NSEC_PER_SEC });
+
- if (nanosleep(&ts, &ts) == 0)
+ if (clock_nanosleep(CLOCK_MONOTONIC, 0, &ts, &ts) == 0) {
deliver_alarm();
+ }
+ set_interval();
after_sleep_interval(&ts);
}
--
1.7.10.4
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [uml-devel] UML Performance improvement patchset
2014-09-04 19:00 [uml-devel] UML Performance improvement patchset anton.ivanov
` (9 preceding siblings ...)
2014-09-04 19:00 ` [uml-devel] [PATCH v3 10/10] High Resolution Timer subsystem for UML anton.ivanov
@ 2014-09-04 19:14 ` Richard Weinberger
2014-09-04 20:46 ` Anton Ivanov (antivano)
` (2 more replies)
10 siblings, 3 replies; 15+ messages in thread
From: Richard Weinberger @ 2014-09-04 19:14 UTC (permalink / raw)
To: anton.ivanov; +Cc: user-mode-linux-devel@lists.sourceforge.net
On Thu, Sep 4, 2014 at 9:00 PM, <anton.ivanov@kot-begemot.co.uk> wrote:
>
> Patch dependencies:
>
> [PATCH v3 01/10] Epoll based interrupt controller
>
> Full redesign of the existing UML poll based controller. The old
> poll controller incurs huge penalties for IRQ sharing and many devices
> setup due to the device list being walked twice.
>
> Additionally, the current controller has no notion of true Edge,
> Level and Write completion IRQs.
>
> This patch fixes the list walking bottleneck and adds all of
> the above allowing for UML to be scaled to 100s of devices
> (tested with 512+ network devices).
>
> [PATCH v3 02/10] Remove unnecessary 'reactivate' statements
>
> As a result of adding true Edge/Level semantics in the epoll
> controller there is no need to do the "reactivate fd" any more.
>
> This one is an enhancement of 1 and depends on it.
>
> [PATCH v3 03/10] High performance networking subsystem
>
> This patchset adds vector IO ops for xmit and receive. Xmit
> is optional (as it depends on a 3.0+ host), receive is always on.
>
> The result is that UML can now hit 1G+ rates for transports
> which have been enabled to use these. Presently this patchset
> is kept as "legacy" as possible without leveraging the possibility
> to do a true write completion poll from the new IRQ controller.
> This further performance improvement will be submitted separately.
>
> This patch has been tested extensively only with patchsets 1 and 2.
>
> [PATCH v3 04/10] L2TPv3 Transport Driver for UML
>
> This is an implementation of the Ethernet over L2TPv3 protocol
> leveraging both the epoll controller and the high perf vector IO.
> It has been extensively tested to interop versus a set of
> other implementations including Linux kernel, our port of the
> same concept to QEMU/KVM, routers, etc.
>
> Depends on 3.
>
> [PATCH v3 05/10] GRE transport for UML
>
> Same as L2TPv3 for GRE. Depends on 3
>
> [PATCH v3 06/10] RAW Ethernet transport for UML
>
> True raw driver (note - all TSO/GSO options in the NIC must
> be turned off). Breaks through the 1G barrier with a vengeance
> and CPU to spare. Depends on 3.
>
> [PATCH v3 07/10] Performance and NUMA improvements for ubd
>
> This is a well known issue/fix, qemu has the same one. If you
> do not use pwrite you can kill a machine on cache sync with
> ease. This patch is independent of the others.
>
> [PATCH v3 08/10] Minor performance optimization for ubd
>
> Obvious minor optimization, independent of the others.
>
> [PATCH v3 09/10] Better IPC for UBD
>
> Obvious optimization, independent of the others. Pipe has a
> very short queue which has 4k granularity. It is a bad IPC
> for passing a lot of small chunks one at a time as used in UBD.
>
> [PATCH v3 10/10] High Resolution Timer subsystem for UML
>
> This version of the patch applies only to the epoll controller.
> Otherwise, the patch with minimal modifications can be applied to
> stock UML. It fixes UML as far as its use for network appliance
> on all counts - TCP performance, QoS, traffic shaping, etc.
>
> The patch is not pretty (I would have preferred to kill itimer
> completely). It however does what it says on the tin and has been
> doing it in testing for 2 years or so now.
>
> Enjoy
Thanks a lot for your work!
As I'm horribly backlogged, next week is the earliest I'll have time to
look at your patches.
Thanks,
//richard
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [uml-devel] UML Performance improvement patchset
2014-09-04 19:14 ` [uml-devel] UML Performance improvement patchset Richard Weinberger
@ 2014-09-04 20:46 ` Anton Ivanov (antivano)
2014-09-15 6:09 ` Anton Ivanov (antivano)
2014-09-20 18:17 ` Anton Ivanov
2 siblings, 0 replies; 15+ messages in thread
From: Anton Ivanov (antivano) @ 2014-09-04 20:46 UTC (permalink / raw)
To: user-mode-linux-devel@lists.sourceforge.net
On 04/09/14 20:14, Richard Weinberger wrote:
> On Thu, Sep 4, 2014 at 9:00 PM, <anton.ivanov@kot-begemot.co.uk> wrote:
>> Patch dependencies:
>>
>> [PATCH v3 01/10] Epoll based interrupt controller
>>
>> Full redesign of the existing UML poll based controller. The old
>> poll controller incurs huge penalties for IRQ sharing and many devices
>> setup due to the device list being walked twice.
>>
>> Additionally, the current controller has no notion of true Edge,
>> Level and Write completion IRQs.
>>
>> This patch fixes the list walking bottleneck and adds all of
>> the above allowing for UML to be scaled to 100s of devices
>> (tested with 512+ network devices).
>>
>> [PATCH v3 02/10] Remove unnecessary 'reactivate' statements
>>
>> As a result of adding true Edge/Level semantics in the epoll
>> controller there is no need to do the "reactivate fd" any more.
>>
>> This one is an enhancement of 1 and depends on it.
>>
>> [PATCH v3 03/10] High performance networking subsystem
>>
>> This patchset adds vector IO ops for xmit and receive. Xmit
>> is optional (as it depends on a 3.0+ host), receive is always on.
>>
>> The result is that UML can now hit 1G+ rates for transports
>> which have been enabled to use these. Presently this patchset
>> is kept as "legacy" as possible without leveraging the possibility
>> to do a true write completion poll from the new IRQ controller.
>> This further performance improvement will be submitted separately.
>>
>> This patch has been tested extensively only with patchsets 1 and 2.
>>
>> [PATCH v3 04/10] L2TPv3 Transport Driver for UML
>>
>> This is an implementation of the Ethernet over L2TPv3 protocol
>> leveraging both the epoll controller and the high perf vector IO.
>> It has been extensively tested to interop versus a set of
>> other implementations including Linux kernel, our port of the
>> same concept to QEMU/KVM, routers, etc.
>>
>> Depends on 3.
>>
>> [PATCH v3 05/10] GRE transport for UML
>>
>> Same as L2TPv3 for GRE. Depends on 3
>>
>> [PATCH v3 06/10] RAW Ethernet transport for UML
>>
>> True raw driver (note - all TSO/GSO options in the NIC must
>> be turned off). Breaks through the 1G barrier with a vengeance
>> and CPU to spare. Depends on 3.
>>
>> [PATCH v3 07/10] Performance and NUMA improvements for ubd
>>
>> This is a well known issue/fix, qemu has the same one. If you
>> do not use pwrite you can kill a machine on cache sync with
>> ease. This patch is independent of the others.
>>
>> [PATCH v3 08/10] Minor performance optimization for ubd
>>
>> Obvious minor optimization, independent of the others.
>>
>> [PATCH v3 09/10] Better IPC for UBD
>>
>> Obvious optimization, independent of the others. Pipe has a
>> very short queue which has 4k granularity. It is a bad IPC
>> for passing a lot of small chunks one at a time as used in UBD.
>>
>> [PATCH v3 10/10] High Resolution Timer subsystem for UML
>>
>> This version of the patch applies only to the epoll controller.
>> Otherwise, the patch with minimal modifications can be applied to
>> stock UML. It fixes UML as far as its use for network appliance
>> on all counts - TCP performance, QoS, traffic shaping, etc.
>>
>> The patch is not pretty (I would have preferred to kill itimer
>> completely). It however does what it says on the tin and has been
>> doing it in testing for 2 years or so now.
>>
>> Enjoy
> Thanks a lot for your work!
> As I'm horrible backlogged I'll at best have next week the time to
> look at your patches.
No worries,
Whenever you can get around to it.
I need to write a simple README for the GRE, L2TPv3 and RAW drivers
anyway to provide examples of use, this will follow shortly (hopefully
tomorrow).
A.
>
> Thanks,
> //richard
>
> ------------------------------------------------------------------------------
> Slashdot TV.
> Video for Nerds. Stuff that matters.
> http://tv.slashdot.org/
> _______________________________________________
> User-mode-linux-devel mailing list
> User-mode-linux-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
------------------------------------------------------------------------------
Slashdot TV.
Video for Nerds. Stuff that matters.
http://tv.slashdot.org/
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [uml-devel] UML Performance improvement patchset
2014-09-04 19:14 ` [uml-devel] UML Performance improvement patchset Richard Weinberger
2014-09-04 20:46 ` Anton Ivanov (antivano)
@ 2014-09-15 6:09 ` Anton Ivanov (antivano)
2014-09-20 18:17 ` Anton Ivanov
2 siblings, 0 replies; 15+ messages in thread
From: Anton Ivanov (antivano) @ 2014-09-15 6:09 UTC (permalink / raw)
To: user-mode-linux-devel@lists.sourceforge.net
Hi Richard, hi list.
I have found a couple of minor issues which have crept up during porting
from 3.3.8 to 3.12-3.15 (I originally wrote all of the improvements for
3.3.8/OpenWRT).
I will reissue the patchset with these fixed sometime this week -
descriptions of issues/coming fixes inline.
I have tried to keep the network patches not invasive, but that is
preventing the use of the write poll functionality in the new interrupt
controller (there is no way to do that without starting to change the
core uml netdev structures). It also prevents a few other potential
performance improvements such as skb recycling on rx (to avoid
allocating memory every time), etc. I probably will not have the time to
get these done for a few weeks as I am snowed under a backlog of work
too. Once these are done, TX CPU should drop significantly without
losing the performance. RX should pick up as well to match TX for a lot
of use cases.
After patching 3.12 and running it for a while on a live system, I have
again run into my favorite UML issue (occasional process in D state on
IO) :(. This is something which exists in the stock kernel, but
is much more difficult to trigger there because of the way the original
timer (and to a lesser extent SIGIO) handling works. I will add a
workaround which does not fix it fully (but at least makes it extremely
rare) to the patchset as well in the next reissue.
On 04/09/14 20:14, Richard Weinberger wrote:
> On Thu, Sep 4, 2014 at 9:00 PM, <anton.ivanov@kot-begemot.co.uk> wrote:
>> Patch dependencies:
>>
>> [PATCH v3 01/10] Epoll based interrupt controller
>>
>> Full redesign of the existing UML poll based controller. The old
>> poll controller incurs huge penalties for IRQ sharing and many devices
>> setup due to the device list being walked twice.
>>
>> Additionally, the current controller has no notion of true Edge,
>> Level and Write completion IRQs.
>>
>> This patch fixes the list walking bottleneck and adds all of
>> the above allowing for UML to be scaled to 100s of devices
>> (tested with 512+ network devices).
I have found one minor issue - it needs the "turn off sigio" in the exit
routine.
>>
>> [PATCH v3 02/10] Remove unnecessary 'reactivate' statements
>>
>> As a result of adding true Edge/Level semantics in the epoll
>> controller there is no need to do the "reactivate fd" any more.
>>
>> This one is an enhancement of 1 and depends on it.
>>
>> [PATCH v3 03/10] High performance networking subsystem
>>
>> This patchset adds vector IO ops for xmit and receive. Xmit
>> is optional (as it depends on a 3.0+ host), receive is always on.
>>
>> The result is that UML can now hit 1G+ rates for transports
>> which have been enabled to use these. Presently this patchset
>> is kept as "legacy" as possible without leveraging the possibility
>> to do a true write completion poll from the new IRQ controller.
>> This further performance improvement will be submitted separately.
>>
>> This patch has been tested extensively only with patchsets 1 and 2.
There is a minor issue in the close(), it needs to call both the uml and
linux interrupt release routines and at the moment it does not.
>>
>> [PATCH v3 04/10] L2TPv3 Transport Driver for UML
>>
>> This is an implementation of the Ethernet over L2TPv3 protocol
>> leveraging both the epoll controller and the high perf vector IO.
>> It has been extensively tested to interop versus a set of
>> other implementations including Linux kernel, our port of the
>> same concept to QEMU/KVM, routers, etc.
There are a couple of "belts and braces" checks missing - it needs to
zero the queue counter on a failed read (I have yet to see a single
failed read in all my tests though).
>>
>> Depends on 3.
>>
>> [PATCH v3 05/10] GRE transport for UML
>>
>> Same as L2TPv3 for GRE. Depends on 3
>>
>> [PATCH v3 06/10] RAW Ethernet transport for UML
>>
>> True raw driver (note - all TSO/GSO options in the NIC must
>> be turned off). Breaks through the 1G barrier with a vengeance
>> and CPU to spare. Depends on 3.
>>
>> [PATCH v3 07/10] Performance and NUMA improvements for ubd
>>
>> This is a well known issue/fix, qemu has the same one. If you
>> do not use pwrite you can kill a machine on cache sync with
>> ease. This patch is independent of the others.
>>
>> [PATCH v3 08/10] Minor performance optimization for ubd
>>
>> Obvious minor optimization, independent of the others.
>>
>> [PATCH v3 09/10] Better IPC for UBD
>>
>> Obvious optimization, independent of the others. Pipe has a
>> very short queue which has 4k granularity. It is a bad IPC
>> for passing a lot of small chunks one at a time as used in UBD.
>>
>> [PATCH v3 10/10] High Resolution Timer subsystem for UML
>>
>> This version of the patch applies only to the epoll controller.
>> Otherwise, the patch with minimal modifications can be applied to
>> stock UML. It fixes UML as far as its use for network appliance
>> on all counts - TCP performance, QoS, traffic shaping, etc.
>>
>> The patch is not pretty (I would have preferred to kill itimer
>> completely). It however does what it says on the tin and has been
>> doing it in testing for 2 years or so now.
>>
>> Enjoy
> Thanks a lot for your work!
> As I'm horrible backlogged I'll at best have next week the time to
> look at your patches.
>
> Thanks,
> //richard
>
> ------------------------------------------------------------------------------
> Slashdot TV.
> Video for Nerds. Stuff that matters.
> http://tv.slashdot.org/
> _______________________________________________
> User-mode-linux-devel mailing list
> User-mode-linux-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
------------------------------------------------------------------------------
Want excitement?
Manually upgrade your production database.
When you want reliability, choose Perforce
Perforce version control. Predictably reliable.
http://pubads.g.doubleclick.net/gampad/clk?id=157508191&iu=/4140/ostg.clktrk
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [uml-devel] UML Performance improvement patchset
2014-09-04 19:14 ` [uml-devel] UML Performance improvement patchset Richard Weinberger
2014-09-04 20:46 ` Anton Ivanov (antivano)
2014-09-15 6:09 ` Anton Ivanov (antivano)
@ 2014-09-20 18:17 ` Anton Ivanov
2 siblings, 0 replies; 15+ messages in thread
From: Anton Ivanov @ 2014-09-20 18:17 UTC (permalink / raw)
To: user-mode-linux-devel
Hi list,
Some benchmarks (current version, I believe this can do more once it
gains full epoll write support).
4 UMLs running in parallel, each pinned to a core, new raw driver - vlan
per UML, 10G NIC on 4 core 3.5GHz A8 connected back to back to an 8 core
machine running iperf server.
Server listening on TCP port 5001
TCP window size: 85.3 KByte (default)
------------------------------------------------------------
[ 4] local 192.168.63.1 port 5001 connected with 192.168.63.18 port 36946
[ 5] local 192.168.63.1 port 5001 connected with 192.168.63.3 port 33628
[ 6] local 192.168.63.1 port 5001 connected with 192.168.63.34 port 50217
[ 7] local 192.168.63.1 port 5001 connected with 192.168.63.50 port 40107
[ ID] Interval Transfer Bandwidth
[ 4] 0.0-240.0 sec 40.8 GBytes 1.46 Gbits/sec
[ 5] 0.0-240.0 sec 40.4 GBytes 1.44 Gbits/sec
[ 6] 0.0-240.0 sec 39.9 GBytes 1.43 Gbits/sec
[ 7] 0.0-240.0 sec 39.5 GBytes 1.41 Gbits/sec
For a nice rounded total of: 5.6GBit from a 4 core machine under virtualization, no offloads - true raw power as needed for network applications.
For comparative purposes one CPU with the other ones not loaded is:
root@Hive:~# iperf -t 240 -c 192.168.63.1
------------------------------------------------------------
Client connecting to 192.168.63.1, TCP port 5001
TCP window size: 22.5 KByte (default)
------------------------------------------------------------
[ 3] local 192.168.63.3 port 33629 connected with 192.168.63.1 port 5001
[ ID] Interval Transfer Bandwidth
[ 3] 0.0-240.0 sec 63.3 GBytes 2.26 Gbits/sec
If I use tap on the same machine I get significantly less than that. It
is also more than I can get out of kvm on the same machine (with offloads
off, so it is something which is applicable to network applications).
A.
P.S. I think I have found all the issues that were introduced when
porting the original 3.3.8 patch to the current linux tree, I will
submit a new version which has the fixes on Monday.
A.
On 09/04/14 19:14, Richard Weinberger wrote:
> On Thu, Sep 4, 2014 at 9:00 PM, <anton.ivanov@kot-begemot.co.uk> wrote:
>> Patch dependencies:
>>
>> [PATCH v3 01/10] Epoll based interrupt controller
>>
>> Full redesign of the existing UML poll based controller. The old
>> poll controller incurs huge penalties for IRQ sharing and many devices
>> setup due to the device list being walked twice.
>>
>> Additionally, the current controller has no notion of true Edge,
>> Level and Write completion IRQs.
>>
>> This patch fixes the list walking bottleneck and adds all of
>> the above allowing for UML to be scaled to 100s of devices
>> (tested with 512+ network devices).
>>
>> [PATCH v3 02/10] Remove unnecessary 'reactivate' statements
>>
>> As a result of adding true Edge/Level semantics in the epoll
>> controller there is no need to do the "reactivate fd" any more.
>>
>> This one is an enhancement of 1 and depends on it.
>>
>> [PATCH v3 03/10] High performance networking subsystem
>>
>> This patchset adds vector IO ops for xmit and receive. Xmit
>> is optional (as it depends on a 3.0+ host), receive is always on.
>>
>> The result is that UML can now hit 1G+ rates for transports
>> which have been enabled to use these. Presently this patchset
>> is kept as "legacy" as possible without leveraging the possibility
>> to do a true write completion poll from the new IRQ controller.
>> This further performance improvement will be submitted separately.
>>
>> This patch has been tested extensively only with patchsets 1 and 2.
>>
>> [PATCH v3 04/10] L2TPv3 Transport Driver for UML
>>
>> This is an implementation of the Ethernet over L2TPv3 protocol
>> leveraging both the epoll controller and the high perf vector IO.
>> It has been extensively tested to interop versus a set of
>> other implementations including Linux kernel, our port of the
>> same concept to QEMU/KVM, routers, etc.
>>
>> Depends on 3.
>>
>> [PATCH v3 05/10] GRE transport for UML
>>
>> Same as L2TPv3 for GRE. Depends on 3
>>
>> [PATCH v3 06/10] RAW Ethernet transport for UML
>>
>> True raw driver (note - all TSO/GSO options in the NIC must
>> be turned off). Breaks through the 1G barrier with a vengeance
>> and CPU to spare. Depends on 3.
>>
>> [PATCH v3 07/10] Performance and NUMA improvements for ubd
>>
>> This is a well known issue/fix, qemu has the same one. If you
>> do not use pwrite you can kill a machine on cache sync with
>> ease. This patch is independent of the others.
>>
>> [PATCH v3 08/10] Minor performance optimization for ubd
>>
>> Obvious minor optimization, independent of the others.
>>
>> [PATCH v3 09/10] Better IPC for UBD
>>
>> Obvious optimization, independent of the others. Pipe has a
>> very short queue which has 4k granularity. It is a bad IPC
>> for passing a lot of small chunks one at a time as used in UBD.
>>
>> [PATCH v3 10/10] High Resolution Timer subsystem for UML
>>
>> This version of the patch applies only to the epoll controller.
>> Otherwise, the patch with minimal modifications can be applied to
>> stock UML. It fixes UML as far as its use for network appliance
>> on all counts - TCP performance, QoS, traffic shaping, etc.
>>
>> The patch is not pretty (I would have preferred to kill itimer
>> completely). It however does what it says on the tin and has been
>> doing it in testing for 2 years or so now.
>>
>> Enjoy
> Thanks a lot for your work!
> As I'm horrible backlogged I'll at best have next week the time to
> look at your patches.
>
> Thanks,
> //richard
>
> ------------------------------------------------------------------------------
> Slashdot TV.
> Video for Nerds. Stuff that matters.
> http://tv.slashdot.org/
> _______________________________________________
> User-mode-linux-devel mailing list
> User-mode-linux-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
------------------------------------------------------------------------------
Slashdot TV. Video for Nerds. Stuff that Matters.
http://pubads.g.doubleclick.net/gampad/clk?id=160591471&iu=/4140/ostg.clktrk
_______________________________________________
User-mode-linux-devel mailing list
User-mode-linux-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/user-mode-linux-devel
^ permalink raw reply [flat|nested] 15+ messages in thread