* [uml-devel] [PATCH] EPOLL Interrupt Controller V2.0
@ 2015-11-08 22:50 Anton Ivanov
2015-11-08 23:00 ` Anton Ivanov
2015-11-08 23:23 ` Anton Ivanov
0 siblings, 2 replies; 3+ messages in thread
From: Anton Ivanov @ 2015-11-08 22:50 UTC (permalink / raw)
To: user-mode-linux-devel; +Cc: Anton Ivanov
Epoll-based interrupt controller.

IMPROVES: IO loop performance - no per-fd lookups, allowing for
a 15% IO speedup in a minimal config, going to 100s of % with many
devices - an O(N^2) lookup is now replaced by an O(log N) one.

ADDS: True write IRQ functionality.

OBSOLETES: The need to call reactivate_fd() in any driver which
has only read IRQ semantics. Write IRQs work, but drivers will need
to be updated to use them fully.

Potentially (with a change in API) this will allow both edge and
level IRQ semantics.

Prerequisite for using packet mmap and multi-packet read/write,
which do not get along with poll() very well.
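For illustration, the userspace pattern the controller is built on:
epoll hands back the pointer registered with each ready fd, so the
handler is located without scanning the registration list. A
standalone sketch (not part of the patch; the handler struct merely
stands in for the irq_entry/do_IRQ machinery):

#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

struct handler {
    int irq;                  /* stands in for the UML IRQ number */
    void (*fire)(int irq);    /* stands in for do_IRQ() */
};

static void fire(int irq) { printf("IRQ %d fired\n", irq); }

int main(void)
{
    struct handler h = { .irq = 42, .fire = fire };
    struct epoll_event ev = { .events = EPOLLIN, .data.ptr = &h };
    struct epoll_event ready[16];
    int efd = epoll_create1(0);
    int n, i;

    epoll_ctl(efd, EPOLL_CTL_ADD, STDIN_FILENO, &ev);
    n = epoll_wait(efd, ready, 16, -1);   /* blocks until input */
    for (i = 0; i < n; i++) {
        struct handler *hp = ready[i].data.ptr;
        hp->fire(hp->irq);    /* O(1) - no "who talked" lookup */
    }
    close(efd);
    return 0;
}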
Signed-off-by: Anton Ivanov <aivanov@brocade.com>
---
arch/um/drivers/line.c | 5 +-
arch/um/drivers/mconsole_kern.c | 2 -
arch/um/drivers/net_kern.c | 1 -
arch/um/drivers/port_kern.c | 1 -
arch/um/drivers/random.c | 1 -
arch/um/drivers/ubd_kern.c | 1 -
arch/um/include/shared/irq_user.h | 24 ++-
arch/um/include/shared/os.h | 13 +-
arch/um/kernel/irq.c | 412 ++++++++++++++++++++++----------------
arch/um/os-Linux/irq.c | 145 +++++---------
10 files changed, 321 insertions(+), 284 deletions(-)
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 6208702..84384c8 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -283,7 +284,7 @@ int line_setup_irq(int fd, int input, int output, struct line *line, void *data)
if (err)
return err;
if (output)
- err = um_request_irq(driver->write_irq, fd, IRQ_WRITE,
+ err = um_request_irq(driver->write_irq, fd, IRQ_NONE,
line_write_interrupt, IRQF_SHARED,
driver->write_irq_name, data);
return err;
@@ -666,8 +667,6 @@ static irqreturn_t winch_interrupt(int irq, void *data)
tty_kref_put(tty);
}
out:
- if (winch->fd != -1)
- reactivate_fd(winch->fd, WINCH_IRQ);
return IRQ_HANDLED;
}
diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 29880c9..5e8881c 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -95,7 +95,6 @@ static irqreturn_t mconsole_interrupt(int irq, void *dev_id)
}
if (!list_empty(&mc_requests))
schedule_work(&mconsole_work);
- reactivate_fd(fd, MCONSOLE_IRQ);
return IRQ_HANDLED;
}
@@ -243,7 +242,6 @@ void mconsole_stop(struct mc_request *req)
(*req->cmd->handler)(req);
}
os_set_fd_block(req->originating_fd, 0);
- reactivate_fd(req->originating_fd, MCONSOLE_IRQ);
mconsole_reply(req, "", 0, 0);
}
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index f70dd54..82ea3a2 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -137,7 +137,6 @@ static irqreturn_t uml_net_interrupt(int irq, void *dev_id)
schedule_work(&lp->work);
goto out;
}
- reactivate_fd(lp->fd, UM_ETH_IRQ);
out:
spin_unlock(&lp->lock);
diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c
index 40ca5cc..b0e9ff3 100644
--- a/arch/um/drivers/port_kern.c
+++ b/arch/um/drivers/port_kern.c
@@ -137,7 +137,6 @@ static void port_work_proc(struct work_struct *unused)
if (!port->has_connection)
continue;
- reactivate_fd(port->fd, ACCEPT_IRQ);
while (port_accept(port))
;
port->has_connection = 0;
diff --git a/arch/um/drivers/random.c b/arch/um/drivers/random.c
index dd16c90..a392828 100644
--- a/arch/um/drivers/random.c
+++ b/arch/um/drivers/random.c
@@ -72,7 +72,6 @@ static ssize_t rng_dev_read (struct file *filp, char __user *buf, size_t size,
return ret ? : -EAGAIN;
atomic_inc(&host_sleep_count);
- reactivate_fd(random_fd, RANDOM_IRQ);
add_sigio_fd(random_fd);
add_wait_queue(&host_read_wait, &wait);
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index e8ab93c..731982c 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -466,7 +466,6 @@ static void ubd_handler(void)
blk_end_request(req->req, 0, req->length);
kfree(req);
}
- reactivate_fd(thread_fd, UBD_IRQ);
list_for_each_safe(list, next_ele, &restart){
ubd = container_of(list, struct ubd, restart);
diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h
index df56330..0eca64c 100644
--- a/arch/um/include/shared/irq_user.h
+++ b/arch/um/include/shared/irq_user.h
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -9,16 +10,23 @@
#include <sysdep/ptrace.h>
struct irq_fd {
- struct irq_fd *next;
- void *id;
- int fd;
- int type;
- int irq;
- int events;
- int current_events;
+ void *id;
+ int irq;
+ int events;
+};
+
+
+#define IRQ_READ 0
+#define IRQ_WRITE 1
+#define IRQ_NONE 2
+#define MAX_IRQ_TYPE (IRQ_NONE + 1)
+
+struct irq_entry {
+ struct irq_entry *next;
+ int fd;
+ struct irq_fd *irq_array[MAX_IRQ_TYPE + 1];
};
-enum { IRQ_READ, IRQ_WRITE };
struct siginfo;
extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs);
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 21d704b..3fe1249 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2015 Anton Ivanov (aivanov@{brocade.com,kot-begemot.co.uk})
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
* Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
@@ -284,15 +285,17 @@ extern void halt_skas(void);
extern void reboot_skas(void);
/* irq.c */
-extern int os_waiting_for_events(struct irq_fd *active_fds);
-extern int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds);
+
+extern int os_setup_epoll(int maxevents);
+extern int os_waiting_for_events_epoll(void *kernel_events, int maxevents);
+extern int os_add_epoll_fd (int events, int fd, void * data);
+extern int os_mod_epoll_fd (int events, int fd, void * data);
+extern int os_del_epoll_fd (int fd);
+
extern void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2);
extern void os_free_irq_later(struct irq_fd *active_fds,
int irq, void *dev_id);
-extern int os_get_pollfd(int i);
-extern void os_set_pollfd(int i, int fd);
-extern void os_set_ioignore(void);
/* sigio.c */
extern int add_sigio_fd(int fd);
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 23cb935..516b13b 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -1,4 +1,7 @@
/*
+ * Copyright (C) 2015 Brocade Communications Ltd
+ * Author: Anton Ivanov aivanov@{brocade.com,kot-begemot.co.uk}
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
* Derived (i.e. mostly copied) from arch/i386/kernel/irq.c:
@@ -18,6 +21,61 @@
#include <os.h>
/*
+ * We are on the "kernel side", so we cannot include sys/epoll.h.
+ * Instead we lift the applicable definitions out of it.
+ */
+
+
+enum EPOLL_EVENTS
+ {
+ EPOLLIN = 0x001,
+#define EPOLLIN EPOLLIN
+ EPOLLPRI = 0x002,
+#define EPOLLPRI EPOLLPRI
+ EPOLLOUT = 0x004,
+#define EPOLLOUT EPOLLOUT
+ EPOLLRDNORM = 0x040,
+#define EPOLLRDNORM EPOLLRDNORM
+ EPOLLRDBAND = 0x080,
+#define EPOLLRDBAND EPOLLRDBAND
+ EPOLLWRNORM = 0x100,
+#define EPOLLWRNORM EPOLLWRNORM
+ EPOLLWRBAND = 0x200,
+#define EPOLLWRBAND EPOLLWRBAND
+ EPOLLMSG = 0x400,
+#define EPOLLMSG EPOLLMSG
+ EPOLLERR = 0x008,
+#define EPOLLERR EPOLLERR
+ EPOLLHUP = 0x010,
+#define EPOLLHUP EPOLLHUP
+ EPOLLRDHUP = 0x2000,
+#define EPOLLRDHUP EPOLLRDHUP
+ EPOLLONESHOT = (1 << 30),
+#define EPOLLONESHOT EPOLLONESHOT
+ EPOLLET = (1 << 31)
+#define EPOLLET EPOLLET
+ };
+
+
+typedef union epoll_data
+{
+ void *ptr;
+ int fd;
+ uint32_t u32;
+ uint64_t u64;
+} epoll_data_t;
+
+struct epoll_event
+{
+ uint32_t events; /* Epoll events */
+ epoll_data_t data; /* User data variable */
+} __attribute__ ((__packed__));
+
+#define MAX_EPOLL_EVENTS 16
+
+static struct epoll_event epoll_events[MAX_EPOLL_EVENTS];
+
+/*
* This list is accessed under irq_lock, except in sigio_handler,
* where it is safe from being modified. IRQ handlers won't change it -
* if an IRQ source has vanished, it will be freed by free_irqs just
@@ -25,44 +83,91 @@
* list of irqs to free, with its own locking, coming back here to
* remove list elements, taking the irq_lock to do so.
*/
-static struct irq_fd *active_fds = NULL;
-static struct irq_fd **last_irq_ptr = &active_fds;
+static struct irq_entry *active_fds = NULL;
extern void free_irqs(void);
+
+static DEFINE_SPINLOCK(irq_lock);
+
+
+/*
+ * Principles of Operation:
+ * Each Epoll structure contains a pointer pointing back to an array
+ * with irq entries for read, write and none and their matching event
+ * masks.
+ * This allows us to stop looking up "who talked".
+ * We no longer need to enable/disable any polls while we process them;
+ * epoll will take care of that. The exception to this (for now) is
+ * character devices, because of their own internal buffering, which
+ * needs to be updated to leverage the new write IRQ semantics.
+ * We can now support both read and write IRQs and have separate IRQs
+ * for read and write ops.
+ */
+
+
void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
{
struct irq_fd *irq_fd;
- int n;
+ struct irq_entry *irq_entry;
+ unsigned long flags;
+
+ int n, i, j;
while (1) {
- n = os_waiting_for_events(active_fds);
- if (n <= 0) {
- if (n == -EINTR)
- continue;
- else break;
- }
- for (irq_fd = active_fds; irq_fd != NULL;
- irq_fd = irq_fd->next) {
- if (irq_fd->current_events != 0) {
- irq_fd->current_events = 0;
- do_IRQ(irq_fd->irq, regs);
- }
+ spin_lock_irqsave(&irq_lock, flags);
+
+ n = os_waiting_for_events_epoll(
+ &epoll_events, MAX_EPOLL_EVENTS
+ );
+
+
+ if (n <= 0) {
+ /* drop the lock before looping or bailing out */
+ spin_unlock_irqrestore(&irq_lock, flags);
+ if (n == -EINTR)
+ continue;
+ break;
}
+
+
+ for (i = 0; i < n ; i++) {
+ /* start from the data ptr, walk the tree branch */
+ irq_entry = (struct irq_entry *) epoll_events[i].data.ptr;
+ for (j = 0; j < MAX_IRQ_TYPE; j++) {
+ irq_fd = irq_entry->irq_array[j];
+ if (irq_fd != NULL) {
+ if (epoll_events[i].events & irq_fd->events) {
+ do_IRQ(irq_fd->irq, regs);
+ }
+ }
+ }
+ }
+ spin_unlock_irqrestore(&irq_lock, flags);
}
free_irqs();
}
-static DEFINE_SPINLOCK(irq_lock);
+static int update_events(struct irq_entry * irq_entry) {
+ int i;
+ int events = 0;
+ struct irq_fd * irq_fd;
+ for (i = 0; i < MAX_IRQ_TYPE; i++) {
+ irq_fd = irq_entry->irq_array[i];
+ if (irq_fd != NULL) {
+ events = irq_fd->events | events;
+ }
+ }
+ /* os_add_epoll will call os_mod_epoll if this already exists */
+ return os_add_epoll_fd(events, irq_entry->fd, irq_entry);
+}
+
static int activate_fd(int irq, int fd, int type, void *dev_id)
{
- struct pollfd *tmp_pfd;
- struct irq_fd *new_fd, *irq_fd;
+ struct irq_fd *new_fd;
+ struct irq_entry * irq_entry;
unsigned long flags;
- int events, err, n;
+ int i, err, events;
err = os_set_fd_async(fd);
if (err < 0)
@@ -74,186 +179,150 @@ static int activate_fd(int irq, int fd, int type, void *dev_id)
goto out;
if (type == IRQ_READ)
- events = UM_POLLIN | UM_POLLPRI;
- else events = UM_POLLOUT;
- *new_fd = ((struct irq_fd) { .next = NULL,
- .id = dev_id,
- .fd = fd,
- .type = type,
- .irq = irq,
- .events = events,
- .current_events = 0 } );
-
- err = -EBUSY;
- spin_lock_irqsave(&irq_lock, flags);
- for (irq_fd = active_fds; irq_fd != NULL; irq_fd = irq_fd->next) {
- if ((irq_fd->fd == fd) && (irq_fd->type == type)) {
- printk(KERN_ERR "Registering fd %d twice\n", fd);
- printk(KERN_ERR "Irqs : %d, %d\n", irq_fd->irq, irq);
- printk(KERN_ERR "Ids : 0x%p, 0x%p\n", irq_fd->id,
- dev_id);
- goto out_unlock;
- }
- }
-
+ events |= EPOLLIN | EPOLLPRI;
if (type == IRQ_WRITE)
- fd = -1;
+ events |= EPOLLOUT;
- tmp_pfd = NULL;
- n = 0;
+ *new_fd = ((struct irq_fd) {
+ .id = dev_id,
+ .irq = irq,
+ .events = events
+ });
- while (1) {
- n = os_create_pollfd(fd, events, tmp_pfd, n);
- if (n == 0)
- break;
+ err = -EBUSY;
- /*
- * n > 0
- * It means we couldn't put new pollfd to current pollfds
- * and tmp_fds is NULL or too small for new pollfds array.
- * Needed size is equal to n as minimum.
- *
- * Here we have to drop the lock in order to call
- * kmalloc, which might sleep.
- * If something else came in and changed the pollfds array
- * so we will not be able to put new pollfd struct to pollfds
- * then we free the buffer tmp_fds and try again.
- */
- spin_unlock_irqrestore(&irq_lock, flags);
- kfree(tmp_pfd);
+ spin_lock_irqsave(&irq_lock, flags);
- tmp_pfd = kmalloc(n, GFP_KERNEL);
- if (tmp_pfd == NULL)
- goto out_kfree;
+ for (irq_entry = active_fds; irq_entry != NULL; irq_entry = irq_entry->next) {
+ if (irq_entry->fd == fd) break;
+ }
- spin_lock_irqsave(&irq_lock, flags);
+ if (irq_entry == NULL) {
+ irq_entry = kmalloc(sizeof(struct irq_entry), GFP_KERNEL);
+ if (irq_entry == NULL) {
+ printk(KERN_ERR
+ "Failed to allocate new IRQ entry\n");
+ kfree(new_fd);
+ goto out;
+ }
+ irq_entry->fd = fd;
+ for (i = 0; i < MAX_IRQ_TYPE; i++) {
+ irq_entry->irq_array[i] = NULL;
+ }
+ irq_entry->next = active_fds;
+ active_fds = irq_entry;
}
- *last_irq_ptr = new_fd;
- last_irq_ptr = &new_fd->next;
+ if (irq_entry->irq_array[type] != NULL) {
+ printk(KERN_ERR
+ "Trying to reregister IRQ %d FD %d TYPE %d ID %p\n",
+ irq, fd, type, dev_id
+ );
+ goto out_unlock;
+ } else {
+ irq_entry->irq_array[type] = new_fd;
+ }
+ update_events(irq_entry);
+
spin_unlock_irqrestore(&irq_lock, flags);
- /*
- * This calls activate_fd, so it has to be outside the critical
- * section.
- */
- maybe_sigio_broken(fd, (type == IRQ_READ));
+ maybe_sigio_broken(fd, (type != IRQ_NONE));
return 0;
out_unlock:
spin_unlock_irqrestore(&irq_lock, flags);
- out_kfree:
kfree(new_fd);
out:
return err;
}
-static void free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&irq_lock, flags);
- os_free_irq_by_cb(test, arg, active_fds, &last_irq_ptr);
- spin_unlock_irqrestore(&irq_lock, flags);
-}
-
-struct irq_and_dev {
- int irq;
- void *dev;
-};
-static int same_irq_and_dev(struct irq_fd *irq, void *d)
+static void do_free_by_irq_and_dev(
+ struct irq_entry* irq_entry,
+ unsigned int irq,
+ void * dev
+)
{
- struct irq_and_dev *data = d;
-
- return ((irq->irq == data->irq) && (irq->id == data->dev));
+ int i;
+ struct irq_fd * to_free;
+ for (i = 0; i < MAX_IRQ_TYPE; i++) {
+ if (irq_entry->irq_array[i] != NULL) {
+ if (
+ (irq_entry->irq_array[i]->irq == irq) &&
+ (irq_entry->irq_array[i]->id == dev)
+ ) {
+ to_free = irq_entry->irq_array[i];
+ irq_entry->irq_array[i] = NULL;
+ update_events(irq_entry);
+ kfree(to_free);
+ }
+ }
+ }
}
-static void free_irq_by_irq_and_dev(unsigned int irq, void *dev)
-{
- struct irq_and_dev data = ((struct irq_and_dev) { .irq = irq,
- .dev = dev });
+void free_irq_by_fd(int fd) {
- free_irq_by_cb(same_irq_and_dev, &data);
-}
+ struct irq_entry *irq_entry, *prev = NULL;
+ unsigned long flags;
+ int i;
-static int same_fd(struct irq_fd *irq, void *fd)
-{
- return (irq->fd == *((int *)fd));
+ spin_lock_irqsave(&irq_lock, flags);
+ for (irq_entry = active_fds; irq_entry != NULL; irq_entry = irq_entry->next) {
+ if (irq_entry->fd == fd) {
+ os_del_epoll_fd(fd); /* ignore err, just do it */
+ for (i = 0; i < MAX_IRQ_TYPE; i++) {
+ if (irq_entry->irq_array[i] != NULL) {
+ kfree(irq_entry->irq_array[i]);
+ }
+ }
+ if (prev == NULL) {
+ active_fds = irq_entry->next;
+ } else {
+ prev->next = irq_entry->next;
+ }
+ kfree(irq_entry);
+ break; /* each fd appears at most once in the list */
+ } else {
+ prev = irq_entry;
+ }
+ }
+ spin_unlock_irqrestore(&irq_lock, flags);
+
}
-void free_irq_by_fd(int fd)
-{
- free_irq_by_cb(same_fd, &fd);
-}
-/* Must be called with irq_lock held */
-static struct irq_fd *find_irq_by_fd(int fd, int irqnum, int *index_out)
-{
- struct irq_fd *irq;
- int i = 0;
- int fdi;
-
- for (irq = active_fds; irq != NULL; irq = irq->next) {
- if ((irq->fd == fd) && (irq->irq == irqnum))
- break;
- i++;
- }
- if (irq == NULL) {
- printk(KERN_ERR "find_irq_by_fd doesn't have descriptor %d\n",
- fd);
- goto out;
- }
- fdi = os_get_pollfd(i);
- if ((fdi != -1) && (fdi != fd)) {
- printk(KERN_ERR "find_irq_by_fd - mismatch between active_fds "
- "and pollfds, fd %d vs %d, need %d\n", irq->fd,
- fdi, fd);
- irq = NULL;
- goto out;
- }
- *index_out = i;
- out:
- return irq;
-}
+static void free_irq_by_irq_and_dev(unsigned int irq, void *dev) {
-void reactivate_fd(int fd, int irqnum)
-{
- struct irq_fd *irq;
+ struct irq_entry *irq_entry;
unsigned long flags;
- int i;
spin_lock_irqsave(&irq_lock, flags);
- irq = find_irq_by_fd(fd, irqnum, &i);
- if (irq == NULL) {
- spin_unlock_irqrestore(&irq_lock, flags);
- return;
+ for (irq_entry = active_fds; irq_entry != NULL; irq_entry = irq_entry->next) {
+ do_free_by_irq_and_dev(irq_entry, irq, dev);
}
- os_set_pollfd(i, irq->fd);
spin_unlock_irqrestore(&irq_lock, flags);
-
- add_sigio_fd(fd);
+
}
-void deactivate_fd(int fd, int irqnum)
+
+void reactivate_fd(int fd, int irqnum)
{
- struct irq_fd *irq;
+ struct irq_entry *irq_entry;
unsigned long flags;
- int i;
-
spin_lock_irqsave(&irq_lock, flags);
- irq = find_irq_by_fd(fd, irqnum, &i);
- if (irq == NULL) {
- spin_unlock_irqrestore(&irq_lock, flags);
- return;
+ for (irq_entry = active_fds; irq_entry != NULL; irq_entry = irq_entry->next) {
+ if (irq_entry->fd == fd) {
+ update_events(irq_entry);
+ }
}
-
- os_set_pollfd(i, -1);
spin_unlock_irqrestore(&irq_lock, flags);
+
+}
- ignore_sigio_fd(fd);
+void deactivate_fd(int fd, int irqnum)
+{
+ os_del_epoll_fd(fd); /* ignore err, just do it */
}
EXPORT_SYMBOL(deactivate_fd);
@@ -265,17 +334,16 @@ EXPORT_SYMBOL(deactivate_fd);
*/
int deactivate_all_fds(void)
{
- struct irq_fd *irq;
+ struct irq_entry * irq_entry;
int err;
- for (irq = active_fds; irq != NULL; irq = irq->next) {
- err = os_clear_fd_async(irq->fd);
- if (err)
- return err;
+ for (irq_entry = active_fds; irq_entry != NULL; irq_entry = irq_entry->next) {
+ os_del_epoll_fd(irq_entry->fd); /* ignore err, just do it */
+ err = os_clear_fd_async(irq_entry->fd);
+ if (err) {
+ printk(KERN_ERR "Clear FD async failed with %d", err);
+ }
}
- /* If there is a signal already queued, after unblocking ignore it */
- os_set_ioignore();
-
return 0;
}
@@ -308,13 +376,13 @@ int um_request_irq(unsigned int irq, int fd, int type,
{
int err;
- if (fd != -1) {
+ err = request_irq(irq, handler, irqflags, devname, dev_id);
+
+ if ((!err) && (fd != -1)) {
err = activate_fd(irq, fd, type, dev_id);
- if (err)
- return err;
}
- return request_irq(irq, handler, irqflags, devname, dev_id);
+ return err;
}
EXPORT_SYMBOL(um_request_irq);
@@ -352,9 +420,9 @@ void __init init_IRQ(void)
int i;
irq_set_chip_and_handler(TIMER_IRQ, &SIGVTALRM_irq_type, handle_edge_irq);
-
- for (i = 1; i < NR_IRQS; i++)
+ for (i = 1; i < NR_IRQS - 1; i++)
irq_set_chip_and_handler(i, &normal_irq_type, handle_edge_irq);
+ os_setup_epoll(MAX_EPOLL_EVENTS);
}
/*
@@ -382,11 +450,11 @@ void __init init_IRQ(void)
* thread_info.
*
* There are three cases -
- * The first interrupt on the stack - sets up the thread_info and
+ * The first interrupt on the stack - sets up the thread_info and
* handles the interrupt
- * A nested interrupt interrupting the copying of the thread_info -
+ * A nested interrupt interrupting the copying of the thread_info -
* can't handle the interrupt, as the stack is in an unknown state
- * A nested interrupt not interrupting the copying of the
+ * A nested interrupt not interrupting the copying of the
* thread_info - doesn't do any setup, just handles the interrupt
*
* The first job is to figure out whether we interrupted stack setup.
diff --git a/arch/um/os-Linux/irq.c b/arch/um/os-Linux/irq.c
index b9afb74..837aa68 100644
--- a/arch/um/os-Linux/irq.c
+++ b/arch/um/os-Linux/irq.c
@@ -1,4 +1,5 @@
/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*/
@@ -6,6 +7,7 @@
#include <stdlib.h>
#include <errno.h>
#include <poll.h>
+#include <sys/epoll.h>
#include <signal.h>
#include <string.h>
#include <irq_user.h>
@@ -16,117 +18,80 @@
* Locked by irq_lock in arch/um/kernel/irq.c. Changed by os_create_pollfd
* and os_free_irq_by_cb, which are called under irq_lock.
*/
-static struct pollfd *pollfds = NULL;
-static int pollfds_num = 0;
-static int pollfds_size = 0;
-int os_waiting_for_events(struct irq_fd *active_fds)
+/* epoll support */
+
+
+static int epollfd = -1;
+
+int os_setup_epoll(int maxevents) {
+ epollfd = epoll_create(maxevents);
+ return epollfd;
+}
+
+int os_waiting_for_events_epoll(void *kernel_events, int maxevents)
{
- struct irq_fd *irq_fd;
- int i, n, err;
+ int n, err;
- n = poll(pollfds, pollfds_num, 0);
+ n = epoll_wait(epollfd,
+ (struct epoll_event *) kernel_events, maxevents, 0);
if (n < 0) {
err = -errno;
if (errno != EINTR)
- printk(UM_KERN_ERR "os_waiting_for_events:"
- " poll returned %d, errno = %d\n", n, errno);
+ printk(
+ UM_KERN_ERR "os_waiting_for_events_epoll:"
+ " epoll_wait returned %d, error = %s\n", n,
+ strerror(errno)
+ );
return err;
}
- if (n == 0)
- return 0;
-
- irq_fd = active_fds;
-
- for (i = 0; i < pollfds_num; i++) {
- if (pollfds[i].revents != 0) {
- irq_fd->current_events = pollfds[i].revents;
- pollfds[i].fd = -1;
- }
- irq_fd = irq_fd->next;
- }
return n;
}
-int os_create_pollfd(int fd, int events, void *tmp_pfd, int size_tmpfds)
-{
- if (pollfds_num == pollfds_size) {
- if (size_tmpfds <= pollfds_size * sizeof(pollfds[0])) {
- /* return min size needed for new pollfds area */
- return (pollfds_size + 1) * sizeof(pollfds[0]);
- }
-
- if (pollfds != NULL) {
- memcpy(tmp_pfd, pollfds,
- sizeof(pollfds[0]) * pollfds_size);
- /* remove old pollfds */
- kfree(pollfds);
- }
- pollfds = tmp_pfd;
- pollfds_size++;
- } else
- kfree(tmp_pfd); /* remove not used tmp_pfd */
+int os_add_epoll_fd (int events, int fd, void * data) {
+ struct epoll_event event;
+ int result;
- pollfds[pollfds_num] = ((struct pollfd) { .fd = fd,
- .events = events,
- .revents = 0 });
- pollfds_num++;
-
- return 0;
+ event.data.ptr = data;
+ event.events = events | EPOLLET;
+ result = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &event);
+ if ((result) && (errno == EEXIST)) {
+ result = os_mod_epoll_fd (events, fd, data);
+ }
+ if (result) {
+ printk("epollctl add err fd %d, %s\n", fd, strerror(errno));
+ }
+ return result;
}
-void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
- struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2)
-{
- struct irq_fd **prev;
- int i = 0;
-
- prev = &active_fds;
- while (*prev != NULL) {
- if ((*test)(*prev, arg)) {
- struct irq_fd *old_fd = *prev;
- if ((pollfds[i].fd != -1) &&
- (pollfds[i].fd != (*prev)->fd)) {
- printk(UM_KERN_ERR "os_free_irq_by_cb - "
- "mismatch between active_fds and "
- "pollfds, fd %d vs %d\n",
- (*prev)->fd, pollfds[i].fd);
- goto out;
- }
-
- pollfds_num--;
-
- /*
- * This moves the *whole* array after pollfds[i]
- * (though it doesn't spot as such)!
- */
- memmove(&pollfds[i], &pollfds[i + 1],
- (pollfds_num - i) * sizeof(pollfds[0]));
- if (*last_irq_ptr2 == &old_fd->next)
- *last_irq_ptr2 = prev;
-
- *prev = (*prev)->next;
- if (old_fd->type == IRQ_WRITE)
- ignore_sigio_fd(old_fd->fd);
- kfree(old_fd);
- continue;
- }
- prev = &(*prev)->next;
- i++;
+int os_mod_epoll_fd (int events, int fd, void * data) {
+ struct epoll_event event;
+ int result;
+ event.data.ptr = data;
+ event.events = events;
+ result = epoll_ctl(epollfd, EPOLL_CTL_MOD, fd, &event);
+ if (result) {
+ printk("epollctl mod err fd %d, %s\n", fd, strerror(errno));
}
- out:
- return;
+ return result;
}
-int os_get_pollfd(int i)
-{
- return pollfds[i].fd;
+int os_del_epoll_fd (int fd) {
+ struct epoll_event event;
+ int result;
+ /* the event argument is ignored for EPOLL_CTL_DEL, but kernels
+ * before 2.6.9 require it to be non-NULL, so pass a dummy */
+ result = epoll_ctl(epollfd, EPOLL_CTL_DEL, fd, &event);
+ if (result) {
+ printk("epollctl del err %s\n", strerror(errno));
+ }
+ return result;
}
-void os_set_pollfd(int i, int fd)
+void os_free_irq_by_cb(int (*test)(struct irq_fd *, void *), void *arg,
+ struct irq_fd *active_fds, struct irq_fd ***last_irq_ptr2)
{
- pollfds[i].fd = fd;
+ printk("Someone invoking obsolete deactivate_by_CB!!!\n");
+ return;
}
void os_set_ioignore(void)
--
2.1.4
* Re: [uml-devel] [PATCH] EPOLL Interrupt Controller V2.0
2015-11-08 22:50 [uml-devel] [PATCH] EPOLL Interrupt Controller V2.0 Anton Ivanov
@ 2015-11-08 23:00 ` Anton Ivanov
2015-11-08 23:23 ` Anton Ivanov
1 sibling, 0 replies; 3+ messages in thread
From: Anton Ivanov @ 2015-11-08 23:00 UTC (permalink / raw)
To: user-mode-linux-devel@lists.sourceforge.net
This works cleanly and is understandable (something I would not say
about the original version I wrote a couple of years back).

It emits some minor nags on shutdown related to cleaning up the term
descriptors, but as far as I can see they are mostly harmless. I suspect
that once I do an incremental on top to enable write IRQ semantics in
the line.c family of drivers, these will go away naturally.

UBD tests out 15%+ faster, and net is also faster even with a single
device. With, say, 20-30 devices the speed difference becomes more
substantial, even without allocating different IRQs to different
network devices.
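For anyone wondering why reactivate_fd() can go away: os_add_epoll_fd()
registers descriptors edge-triggered (EPOLLET), so instead of rearming
the fd after every interrupt, the obligation moves to the handler,
which must drain the descriptor until EAGAIN. A minimal userspace
sketch of that contract (illustrative only, not from the patch):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

/* Empty a non-blocking fd - the duty an EPOLLET registration
 * places on whoever handles the event. */
static void drain(int fd)
{
    char buf[256];
    ssize_t n;

    while ((n = read(fd, buf, sizeof(buf))) > 0)
        fwrite(buf, 1, n, stdout);
    if (n < 0 && errno != EAGAIN)
        perror("read");
}

int main(void)
{
    struct epoll_event ev = { .events = EPOLLIN | EPOLLET };
    struct epoll_event ready;
    int efd = epoll_create1(0);

    fcntl(STDIN_FILENO, F_SETFL,
          fcntl(STDIN_FILENO, F_GETFL) | O_NONBLOCK);
    ev.data.fd = STDIN_FILENO;
    epoll_ctl(efd, EPOLL_CTL_ADD, STDIN_FILENO, &ev);

    /* with EPOLLET the fd is not reported again until new data
     * arrives, so drain() must consume everything here */
    if (epoll_wait(efd, &ready, 1, -1) == 1)
        drain(ready.data.fd);

    close(efd);
    return 0;
}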
A.
On 08/11/15 22:50, Anton Ivanov wrote:
> Epoll-based interrupt controller.
>
> IMPROVES: IO loop performance - no per-fd lookups, allowing for
> a 15% IO speedup in a minimal config, going to 100s of % with many
> devices - an O(N^2) lookup is now replaced by an O(log N) one.
>
> ADDS: True write IRQ functionality.
>
> OBSOLETES: The need to call reactivate_fd() in any driver which
> has only read IRQ semantics. Write IRQs work, but drivers will need
> to be updated to use them fully.
>
> Potentially (with a change in API) this will allow both edge and
> level IRQ semantics.
>
> Prerequisite for using packet mmap and multi-packet read/write,
> which do not get along with poll() very well.
>
> Signed-off-by: Anton Ivanov <aivanov@brocade.com>
> ---
> arch/um/drivers/line.c | 5 +-
> arch/um/drivers/mconsole_kern.c | 2 -
> arch/um/drivers/net_kern.c | 1 -
> arch/um/drivers/port_kern.c | 1 -
> arch/um/drivers/random.c | 1 -
> arch/um/drivers/ubd_kern.c | 1 -
> arch/um/include/shared/irq_user.h | 24 ++-
> arch/um/include/shared/os.h | 13 +-
> arch/um/kernel/irq.c | 412 ++++++++++++++++++++++----------------
> arch/um/os-Linux/irq.c | 145 +++++---------
> 10 files changed, 321 insertions(+), 284 deletions(-)
* Re: [uml-devel] [PATCH] EPOLL Interrupt Controller V2.0
2015-11-08 22:50 [uml-devel] [PATCH] EPOLL Interrupt Controller V2.0 Anton Ivanov
2015-11-08 23:00 ` Anton Ivanov
@ 2015-11-08 23:23 ` Anton Ivanov
1 sibling, 0 replies; 3+ messages in thread
From: Anton Ivanov @ 2015-11-08 23:23 UTC (permalink / raw)
To: user-mode-linux-devel
I just noticed one minor issue with it (which existed in the earlier
version as well) - it leaks one host FD per reboot.

I will fix it later in the week in a revised version.
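The fix will most likely amount to closing the epoll descriptor on the
teardown/reboot path. Roughly what I have in mind for
arch/um/os-Linux/irq.c (untested sketch, assuming the static epollfd
this patch introduces; close() needs unistd.h):

/* Hypothetical teardown counterpart to os_setup_epoll(): close the
 * host-side epoll fd so a reboot does not leak one descriptor. */
void os_close_epoll(void)
{
    if (epollfd != -1) {
        close(epollfd);
        epollfd = -1;
    }
}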
A.
On 08/11/15 22:50, Anton Ivanov wrote:
> Epoll-based interrupt controller.
>
> IMPROVES: IO loop performance - no per-fd lookups, allowing for
> a 15% IO speedup in a minimal config, going to 100s of % with many
> devices - an O(N^2) lookup is now replaced by an O(log N) one.
>
> ADDS: True write IRQ functionality.
>
> OBSOLETES: The need to call reactivate_fd() in any driver which
> has only read IRQ semantics. Write IRQs work, but drivers will need
> to be updated to use them fully.
>
> Potentially (with a change in API) this will allow both edge and
> level IRQ semantics.
>
> Prerequisite for using packet mmap and multi-packet read/write,
> which do not get along with poll() very well.
>
> Signed-off-by: Anton Ivanov <aivanov@brocade.com>
> ---
> arch/um/drivers/line.c | 5 +-
> arch/um/drivers/mconsole_kern.c | 2 -
> arch/um/drivers/net_kern.c | 1 -
> arch/um/drivers/port_kern.c | 1 -
> arch/um/drivers/random.c | 1 -
> arch/um/drivers/ubd_kern.c | 1 -
> arch/um/include/shared/irq_user.h | 24 ++-
> arch/um/include/shared/os.h | 13 +-
> arch/um/kernel/irq.c | 412 ++++++++++++++++++++++----------------
> arch/um/os-Linux/irq.c | 145 +++++---------
> 10 files changed, 321 insertions(+), 284 deletions(-)