* [RFC][PATCH 1/5] genirq: Add IRQ affinity notifiers
2010-11-19 18:42 [RFC][PATCH 0/5] RFS hardware acceleration (v2) Ben Hutchings
@ 2010-11-19 18:44 ` Ben Hutchings
2010-11-19 18:44 ` [RFC][PATCH 2/5] lib: cpu_rmap: CPU affinity reverse-mapping Ben Hutchings
` (4 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Ben Hutchings @ 2010-11-19 18:44 UTC
To: David Miller, Tom Herbert, Thomas Gleixner
Cc: netdev, linux-kernel, linux-net-drivers
When initiating I/O on a multiqueue and multi-IRQ device, we may want
to select a queue for which the response will be handled on the same
or a nearby CPU. This requires a reverse-map of IRQ affinity. Add a
notification mechanism to support this.
This is based closely on work by Thomas Gleixner <tglx@linutronix.de>.
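For illustration, here is a rough sketch (not part of this patch) of how
a driver might use the notifier; the my_* names and the last_cpu field
are hypothetical, and error handling is trimmed:

#include <linux/interrupt.h>
#include <linux/slab.h>

struct my_channel {
	struct irq_affinity_notify notify;
	int last_cpu;		/* hypothetical per-channel state */
};

static void my_affinity_notify(struct irq_affinity_notify *notify,
			       const cpumask_t *mask)
{
	struct my_channel *chan =
		container_of(notify, struct my_channel, notify);

	/* Runs in process context; recompute any per-channel CPU
	 * mapping from *mask here. */
	chan->last_cpu = cpumask_first(mask);
}

static void my_affinity_release(struct kref *ref)
{
	struct my_channel *chan =
		container_of(ref, struct my_channel, notify.kref);

	kfree(chan);	/* last reference gone; assumes chan was kmalloc()ed */
}

/* After the IRQ is allocated, but before request_irq(): */
static int my_setup_affinity_notify(struct my_channel *chan, unsigned int irq)
{
	chan->notify.notify = my_affinity_notify;
	chan->notify.release = my_affinity_release;
	return irq_set_affinity_notifier(irq, &chan->notify);
}

/* Before free_irq(): */
static void my_teardown_affinity_notify(unsigned int irq)
{
	irq_set_affinity_notifier(irq, NULL);
	irq_run_affinity_notifiers();
}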
---
include/linux/interrupt.h | 41 +++++++++++++++++++++++
include/linux/irqdesc.h | 3 ++
kernel/irq/manage.c | 81 +++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 125 insertions(+), 0 deletions(-)
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 79d0c4f..1649b30 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -14,6 +14,8 @@
#include <linux/smp.h>
#include <linux/percpu.h>
#include <linux/hrtimer.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
#include <asm/atomic.h>
#include <asm/ptrace.h>
@@ -231,6 +233,28 @@ static inline void resume_device_irqs(void) { };
static inline int check_wakeup_irqs(void) { return 0; }
#endif
+/**
+ * struct irq_affinity_notify - context for notification of IRQ affinity changes
+ * @irq: Interrupt to which notification applies
+ * @kref: Reference count, for internal use
+ * @work: Work item, for internal use
+ * @notify: Function to be called on change. This will be
+ * called in process context.
+ * @release: Function to be called on release. This will be
+ * called in process context. Once registered, the
+ * structure must only be freed when this function is
+ * called or later.
+ */
+struct irq_affinity_notify {
+ unsigned int irq;
+ struct kref kref;
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
+ struct work_struct work;
+#endif
+ void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
+ void (*release)(struct kref *ref);
+};
+
#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
extern cpumask_var_t irq_default_affinity;
@@ -240,6 +264,13 @@ extern int irq_can_set_affinity(unsigned int irq);
extern int irq_select_affinity(unsigned int irq);
extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
+extern int
+irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
+
+static inline void irq_run_affinity_notifiers(void)
+{
+ flush_scheduled_work();
+}
#else /* CONFIG_SMP */
static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
@@ -259,6 +290,16 @@ static inline int irq_set_affinity_hint(unsigned int irq,
{
return -EINVAL;
}
+
+static inline int
+irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
+{
+ return 0;
+}
+
+static inline void irq_run_affinity_notifiers(void)
+{
+}
#endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */
#ifdef CONFIG_GENERIC_HARDIRQS
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 979c68c..5e0d2e4 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -8,6 +8,7 @@
* For now it's included from <linux/irq.h>
*/
+struct irq_affinity_notify;
struct proc_dir_entry;
struct timer_rand_state;
/**
@@ -24,6 +25,7 @@ struct timer_rand_state;
* @last_unhandled: aging timer for unhandled count
* @irqs_unhandled: stats field for spurious unhandled interrupts
* @lock: locking for SMP
+ * @affinity_notify: context for notification of affinity changes
* @pending_mask: pending rebalanced interrupts
* @threads_active: number of irqaction threads currently running
* @wait_for_threads: wait queue for sync_irq to wait for threaded handlers
@@ -70,6 +72,7 @@ struct irq_desc {
raw_spinlock_t lock;
#ifdef CONFIG_SMP
const struct cpumask *affinity_hint;
+ struct irq_affinity_notify *affinity_notify;
#ifdef CONFIG_GENERIC_PENDING_IRQ
cpumask_var_t pending_mask;
#endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc..82b48d0 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -134,6 +134,10 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
irq_set_thread_affinity(desc);
}
#endif
+ if (desc->affinity_notify) {
+ kref_get(&desc->affinity_notify->kref);
+ schedule_work(&desc->affinity_notify->work);
+ }
desc->status |= IRQ_AFFINITY_SET;
raw_spin_unlock_irqrestore(&desc->lock, flags);
return 0;
@@ -155,6 +159,79 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+static void irq_affinity_notify(struct work_struct *work)
+{
+ struct irq_affinity_notify *notify =
+ container_of(work, struct irq_affinity_notify, work);
+ struct irq_desc *desc = irq_to_desc(notify->irq);
+ cpumask_var_t cpumask;
+ unsigned long flags;
+
+ if (!desc)
+ goto out;
+
+ if (!alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ goto out;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+ if (desc->status & IRQ_MOVE_PENDING)
+ cpumask_copy(cpumask, desc->pending_mask);
+ else
+#endif
+ cpumask_copy(cpumask, desc->affinity);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ notify->notify(notify, cpumask);
+
+ free_cpumask_var(cpumask);
+out:
+ kref_put(&notify->kref, notify->release);
+}
+
+/**
+ * irq_set_affinity_notifier - control notification of IRQ affinity changes
+ * @irq: Interrupt for which to enable/disable notification
+ * @notify: Context for notification, or %NULL to disable
+ * notification. Function pointers must be initialised;
+ * the other fields will be initialised by this function.
+ *
+ * Must be called in process context. Notification may only be enabled
+ * after the IRQ is allocated but before it is bound with request_irq()
+ * and must be disabled before the IRQ is freed using free_irq().
+ */
+int
+irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_affinity_notify *old_notify;
+ unsigned long flags;
+
+ /* The release function is promised process context */
+ might_sleep();
+
+ if (!desc)
+ return -EINVAL;
+
+ /* Complete initialisation of *notify */
+ if (notify) {
+ notify->irq = irq;
+ kref_init(&notify->kref);
+ INIT_WORK(&notify->work, irq_affinity_notify);
+ }
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ old_notify = desc->affinity_notify;
+ desc->affinity_notify = notify;
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ if (old_notify)
+ kref_put(&old_notify->kref, old_notify->release);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
+
#ifndef CONFIG_AUTO_IRQ_AFFINITY
/*
* Generic version of the affinity autoselector.
@@ -1002,6 +1079,10 @@ void free_irq(unsigned int irq, void *dev_id)
if (!desc)
return;
+#ifdef CONFIG_SMP
+ BUG_ON(desc->affinity_notify);
+#endif
+
chip_bus_lock(desc);
kfree(__free_irq(irq, dev_id));
chip_bus_sync_unlock(desc);
--
1.7.3.2
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
* [RFC][PATCH 2/5] lib: cpu_rmap: CPU affinity reverse-mapping
2010-11-19 18:42 [RFC][PATCH 0/5] RFS hardware acceleration (v2) Ben Hutchings
2010-11-19 18:44 ` [RFC][PATCH 1/5] genirq: Add IRQ affinity notifiers Ben Hutchings
@ 2010-11-19 18:44 ` Ben Hutchings
2010-11-19 18:47 ` [RFC][PATCH 3/5] net: RPS: Enable hardware acceleration Ben Hutchings
` (3 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Ben Hutchings @ 2010-11-19 18:44 UTC
To: David Miller, Tom Herbert, Thomas Gleixner
Cc: netdev, linux-kernel, linux-net-drivers
When initiating I/O on a multiqueue and multi-IRQ device, we may want
to select a queue for which the response will be handled on the same
or a nearby CPU. This requires a reverse-map of IRQ affinity. Add
library functions to support a generic reverse-mapping from CPUs to
objects with affinity and the specific case where the objects are
IRQs.
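As a rough sketch (not part of this patch), a driver with one RX
completion IRQ per RX queue might use the IRQ reverse-map like this;
struct my_dev and its fields are hypothetical:

#include <linux/cpu_rmap.h>

static int my_init_rx_cpu_rmap(struct my_dev *dev)
{
	struct cpu_rmap *rmap;
	unsigned int i;
	int rc;

	rmap = alloc_irq_cpu_rmap(dev->n_rx_queues);
	if (!rmap)
		return -ENOMEM;

	/* Add the RX completion IRQs in queue order, so that the rmap
	 * index equals the queue index. */
	for (i = 0; i < dev->n_rx_queues; i++) {
		rc = irq_cpu_rmap_add(rmap, dev->rx_irq[i]);
		if (rc) {
			free_irq_cpu_rmap(rmap);
			return rc;
		}
	}
	dev->rx_cpu_rmap = rmap;
	return 0;
}

/* On a hot path: which queue's completions are handled nearest to the
 * current CPU? */
static u16 my_nearest_rx_queue(struct my_dev *dev)
{
	return cpu_rmap_lookup_index(dev->rx_cpu_rmap,
				     raw_smp_processor_id());
}

The affinity notifiers added by the previous patch then keep the map up
to date as IRQ affinities change.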
---
include/linux/cpu_rmap.h | 73 ++++++++++++
lib/Kconfig | 4 +
lib/Makefile | 2 +
lib/cpu_rmap.c | 272 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 351 insertions(+), 0 deletions(-)
create mode 100644 include/linux/cpu_rmap.h
create mode 100644 lib/cpu_rmap.c
diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h
new file mode 100644
index 0000000..4771e95
--- /dev/null
+++ b/include/linux/cpu_rmap.h
@@ -0,0 +1,73 @@
+/*
+ * cpu_rmap.h: CPU affinity reverse-map support
+ * Copyright 2010 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+#include <linux/cpumask.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+/**
+ * struct cpu_rmap - CPU affinity reverse-map
+ * @near: For each CPU, the index and distance to the nearest object,
+ * based on affinity masks
+ * @size: Number of objects to be reverse-mapped
+ * @used: Number of objects added
+ * @obj: Array of object pointers
+ */
+struct cpu_rmap {
+ struct {
+ u16 index;
+ u16 dist;
+ } near[NR_CPUS];
+ u16 size, used;
+ void *obj[0];
+};
+#define CPU_RMAP_DIST_INF 0xffff
+
+extern struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags);
+
+/**
+ * free_cpu_rmap - free CPU affinity reverse-map
+ * @rmap: Reverse-map allocated with alloc_cpu_rmap(), or %NULL
+ */
+static inline void free_cpu_rmap(struct cpu_rmap *rmap)
+{
+ kfree(rmap);
+}
+
+extern int cpu_rmap_add(struct cpu_rmap *rmap, void *obj);
+extern void cpu_rmap_update(struct cpu_rmap *rmap, u16 index,
+ const struct cpumask *affinity);
+
+static inline u16 cpu_rmap_lookup_index(struct cpu_rmap *rmap, unsigned int cpu)
+{
+ return rmap->near[cpu].index;
+}
+
+static inline void *cpu_rmap_lookup_obj(struct cpu_rmap *rmap, unsigned int cpu)
+{
+ return rmap->obj[rmap->near[cpu].index];
+}
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+
+/**
+ * alloc_irq_cpu_rmap - allocate CPU affinity reverse-map for IRQs
+ * @size: Number of objects to be mapped
+ *
+ * Must be called in process context.
+ */
+static inline struct cpu_rmap *alloc_irq_cpu_rmap(unsigned int size)
+{
+ return alloc_cpu_rmap(size, GFP_KERNEL);
+}
+extern void free_irq_cpu_rmap(struct cpu_rmap *rmap);
+
+extern int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq);
+
+#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index fa9bf2c..a66c76d 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -195,6 +195,10 @@ config DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
bool "Disable obsolete cpumask functions" if DEBUG_PER_CPU_MAPS
depends on EXPERIMENTAL && BROKEN
+config CPU_RMAP
+ bool
+ depends on SMP
+
#
# Netlink attribute parsing support is select'ed if needed
#
diff --git a/lib/Makefile b/lib/Makefile
index e6a3763..ded1a81 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -106,6 +106,8 @@ obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
+obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
+
hostprogs-y := gen_crc32table
clean-files := crc32table.h
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
new file mode 100644
index 0000000..394f277
--- /dev/null
+++ b/lib/cpu_rmap.c
@@ -0,0 +1,272 @@
+/*
+ * cpu_rmap.c: CPU affinity reverse-map support
+ * Copyright 2010 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+#include <linux/cpu_rmap.h>
+#ifdef CONFIG_GENERIC_HARDIRQS
+#include <linux/interrupt.h>
+#endif
+#include <linux/module.h>
+
+/*
+ * These functions maintain a mapping from CPUs to some ordered set of
+ * objects with CPU affinities. This can be seen as a reverse-map of
+ * CPU affinity. However, we do not assume that the object affinities
+ * cover all CPUs in the system. For those CPUs not directly covered
+ * by object affinities, we attempt to find a nearest object based on
+ * CPU topology.
+ */
+
+/**
+ * alloc_cpu_rmap - allocate CPU affinity reverse-map
+ * @size: Number of objects to be mapped
+ * @flags: Allocation flags e.g. %GFP_KERNEL
+ */
+struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags)
+{
+ struct cpu_rmap *rmap;
+ unsigned int cpu;
+
+ /* This is a silly number of objects, and we use u16 indices. */
+ if (size > 0xffff)
+ return NULL;
+
+ rmap = kzalloc(sizeof(*rmap) + size * sizeof(rmap->obj[0]), flags);
+ if (!rmap)
+ return NULL;
+
+ /* Initially assign CPUs to objects on a rota, since we have
+ * no idea where the objects are. Use infinite distance, so
+ * any object with known distance is preferable. Include the
+ * CPUs that are not present/online, since we definitely want
+ * any newly-hotplugged CPUs to have some object assigned.
+ */
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ rmap->near[cpu].index = cpu % size;
+ rmap->near[cpu].dist = CPU_RMAP_DIST_INF;
+ }
+
+ rmap->size = size;
+ return rmap;
+}
+EXPORT_SYMBOL(alloc_cpu_rmap);
+
+/* Reevaluate nearest object for neighbours of given object at the
+ * given distance.
+ */
+static void cpu_rmap_update_neigh(struct cpu_rmap *rmap,
+ const struct cpumask *mask,
+ u16 index, u16 dist)
+{
+ int neigh;
+
+ for_each_cpu(neigh, mask) {
+ if (dist < rmap->near[neigh].dist) {
+ rmap->near[neigh].index = index;
+ rmap->near[neigh].dist = dist;
+ }
+ }
+}
+
+/* Reevaluate nearest object for given CPU, comparing with the given
+ * neighbours at the given distance.
+ */
+static bool cpu_rmap_copy_neigh(struct cpu_rmap *rmap, unsigned int cpu,
+ const struct cpumask *mask, u16 dist)
+{
+ int neigh;
+
+ for_each_cpu(neigh, mask) {
+ if (rmap->near[neigh].dist <= dist) {
+ rmap->near[cpu].index = rmap->near[neigh].index;
+ rmap->near[cpu].dist = dist;
+ return true;
+ }
+ }
+ return false;
+}
+
+#ifdef DEBUG
+static void debug_print_rmap(const struct cpu_rmap *rmap, const char *prefix)
+{
+ unsigned index;
+ unsigned int cpu;
+
+ pr_info("cpu_rmap %p, %s:\n", rmap, prefix);
+
+ for_each_possible_cpu(cpu) {
+ index = rmap->near[cpu].index;
+ pr_info("cpu %d -> obj %u (distance %u)\n",
+ cpu, index, rmap->near[cpu].dist);
+ }
+}
+#else
+static inline void
+debug_print_rmap(const struct cpu_rmap *rmap, const char *prefix)
+{
+}
+#endif
+
+/**
+ * cpu_rmap_add - add object to a rmap
+ * @rmap: CPU rmap allocated with alloc_cpu_rmap()
+ * @obj: Object to add to rmap
+ *
+ * Return index of object.
+ */
+int cpu_rmap_add(struct cpu_rmap *rmap, void *obj)
+{
+ u16 index;
+
+ BUG_ON(rmap->used >= rmap->size);
+ index = rmap->used++;
+ rmap->obj[index] = obj;
+ return index;
+}
+EXPORT_SYMBOL(cpu_rmap_add);
+
+/**
+ * cpu_rmap_update - update CPU rmap following a change of object affinity
+ * @rmap: CPU rmap to update
+ * @index: Index of object whose affinity changed
+ * @affinity: New CPU affinity of object
+ */
+void cpu_rmap_update(struct cpu_rmap *rmap, u16 index,
+ const struct cpumask *affinity)
+{
+ unsigned int cpu;
+
+ /* Invalidate old distances to this object */
+ for_each_online_cpu(cpu)
+ if (rmap->near[cpu].index == index)
+ rmap->near[cpu].dist = CPU_RMAP_DIST_INF;
+
+ debug_print_rmap(rmap, "after invalidating old distances");
+
+ /* Set this as the nearest object for all CPUs in the affinity mask,
+ * plus the following CPUs if they don't have a nearer object:
+ * - all other threads in the same core (distance 1);
+ * - all other cores in the same package (distance 2);
+ * - all other packages in the same NUMA node (distance 3).
+ */
+ for_each_cpu(cpu, affinity) {
+ rmap->near[cpu].index = index;
+ rmap->near[cpu].dist = 0;
+ cpu_rmap_update_neigh(rmap, topology_thread_cpumask(cpu),
+ index, 1);
+ cpu_rmap_update_neigh(rmap, topology_core_cpumask(cpu),
+ index, 2);
+ cpu_rmap_update_neigh(rmap, cpumask_of_node(cpu_to_node(cpu)),
+ index, 3);
+ }
+
+ debug_print_rmap(rmap, "after updating neighbours");
+
+ /* Find new nearest object for any CPUs left with invalid distances */
+ for_each_online_cpu(cpu) {
+ if (!(rmap->near[cpu].index == index &&
+ rmap->near[cpu].dist == CPU_RMAP_DIST_INF))
+ continue;
+ if (cpu_rmap_copy_neigh(rmap, cpu,
+ topology_thread_cpumask(cpu), 1))
+ continue;
+ if (cpu_rmap_copy_neigh(rmap, cpu,
+ topology_core_cpumask(cpu), 2))
+ continue;
+ if (cpu_rmap_copy_neigh(rmap, cpu,
+ cpumask_of_node(cpu_to_node(cpu)), 3))
+ continue;
+ /* We could continue into NUMA node distances, but for now
+ * we give up.
+ */
+ }
+
+ debug_print_rmap(rmap, "after copying neighbours");
+}
+EXPORT_SYMBOL(cpu_rmap_update);
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+
+/* Glue between IRQ affinity notifiers and CPU rmaps */
+
+struct irq_glue {
+ struct irq_affinity_notify notify;
+ struct cpu_rmap *rmap;
+ u16 index;
+};
+
+/**
+ * free_irq_cpu_rmap - free a CPU affinity reverse-map used for IRQs
+ * @rmap: Reverse-map allocated with alloc_irq_cpu_map(), or %NULL
+ *
+ * Must be called in process context, before freeing the IRQs, and
+ * without holding any locks required by global workqueue items.
+ */
+void free_irq_cpu_rmap(struct cpu_rmap *rmap)
+{
+ struct irq_glue *glue;
+ u16 index;
+
+ if (!rmap)
+ return;
+
+ for (index = 0; index < rmap->used; index++) {
+ glue = rmap->obj[index];
+ irq_set_affinity_notifier(glue->notify.irq, NULL);
+ }
+ irq_run_affinity_notifiers();
+
+ kfree(rmap);
+}
+EXPORT_SYMBOL(free_irq_cpu_rmap);
+
+static void
+irq_cpu_rmap_notify(struct irq_affinity_notify *notify, const cpumask_t *mask)
+{
+ struct irq_glue *glue =
+ container_of(notify, struct irq_glue, notify);
+ cpu_rmap_update(glue->rmap, glue->index, mask);
+}
+
+static void irq_cpu_rmap_release(struct kref *ref)
+{
+ struct irq_glue *glue =
+ container_of(ref, struct irq_glue, notify.kref);
+ kfree(glue);
+}
+
+/**
+ * irq_cpu_rmap_add - add an IRQ to a CPU affinity reverse-map
+ * @rmap: The reverse-map
+ * @irq: The IRQ number
+ *
+ * This adds an IRQ affinity notifier that will update the reverse-map
+ * automatically.
+ *
+ * Must be called in process context, after the IRQ is allocated but
+ * before it is bound with request_irq().
+ */
+int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq)
+{
+ struct irq_glue *glue = kzalloc(sizeof(*glue), GFP_KERNEL);
+ int rc;
+
+ if (!glue)
+ return -ENOMEM;
+ glue->notify.notify = irq_cpu_rmap_notify;
+ glue->notify.release = irq_cpu_rmap_release;
+ glue->rmap = rmap;
+ glue->index = cpu_rmap_add(rmap, glue);
+ rc = irq_set_affinity_notifier(irq, &glue->notify);
+ if (rc)
+ kfree(glue);
+ return rc;
+}
+EXPORT_SYMBOL(irq_cpu_rmap_add);
+
+#endif /* CONFIG_GENERIC_HARDIRQS */
--
1.7.3.2
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
* [RFC][PATCH 3/5] net: RPS: Enable hardware acceleration
2010-11-19 18:42 [RFC][PATCH 0/5] RFS hardware acceleration (v2) Ben Hutchings
2010-11-19 18:44 ` [RFC][PATCH 1/5] genirq: Add IRQ affinity notifiers Ben Hutchings
2010-11-19 18:44 ` [RFC][PATCH 2/5] lib: cpu_rmap: CPU affinity reverse-mapping Ben Hutchings
@ 2010-11-19 18:47 ` Ben Hutchings
2010-11-19 18:47 ` [RFC][PATCH 4/5] sfc: Limit filter search depth further for performance hints (i.e. RFS) Ben Hutchings
` (2 subsequent siblings)
5 siblings, 0 replies; 9+ messages in thread
From: Ben Hutchings @ 2010-11-19 18:47 UTC
To: David Miller, Tom Herbert; +Cc: netdev, linux-net-drivers
Allow drivers for multiqueue hardware with flow filter tables to
accelerate RFS. The driver must (a sketch follows this list):
1. Set net_device::rx_cpu_rmap to a cpu_rmap of the RX completion
IRQs (in queue order). This will provide a mapping from CPUs to the
queues for which completions are handled nearest to them.
2. Implement net_device_ops::ndo_rx_flow_steer. This operation adds
or replaces a filter steering the given flow to the given RX queue, if
possible.
3. Periodically remove filters for which rps_may_expire_flow() returns
true.
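As a rough sketch (not part of this patch) of points 2 and 3, with all
my_* names and the filter-table layout being hypothetical (patch 5 has a
real implementation for sfc):

static int my_rx_flow_steer(struct net_device *net_dev,
			    const struct sk_buff *skb,
			    u16 rxq_index, u32 flow_id)
{
	struct my_dev *dev = netdev_priv(net_dev);
	int filter_id;

	/* Parse the flow from the skb and insert (or replace) a hardware
	 * filter steering it to RX queue rxq_index. */
	filter_id = my_insert_filter(dev, skb, rxq_index);
	if (filter_id < 0)
		return filter_id;

	/* Remember the flow ID so rps_may_expire_flow() can be asked
	 * about it later. */
	dev->flow_id[filter_id] = flow_id;
	return filter_id;
}

/* Called periodically, e.g. from a workqueue or the driver's monitor: */
static void my_expire_filters(struct my_dev *dev)
{
	unsigned int i;

	for (i = 0; i < dev->n_filters; i++) {
		if (my_filter_is_rfs_hint(dev, i) &&
		    rps_may_expire_flow(dev->net_dev, my_filter_rxq(dev, i),
					dev->flow_id[i], i))
			my_remove_filter(dev, i);
	}
}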
---
At Netconf, Tom Herbert suggested making ndo_start_xmit() set up filters
using information attached to the skbs it receives, rather than adding a
callback from the RX path. This would be fine for TCP but not for all
UDP applications and in particular it seems like it would be useless for
multicast receivers.
Ben.
include/linux/netdevice.h | 31 ++++++++++++++--
net/Kconfig | 1 +
net/core/dev.c | 89 ++++++++++++++++++++++++++++++++++++++++++---
3 files changed, 112 insertions(+), 9 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b45c1b8..7875042 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -530,14 +530,16 @@ struct rps_map {
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
/*
- * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
- * tail pointer for that CPU's input queue at the time of last enqueue.
+ * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
+ * tail pointer for that CPU's input queue at the time of last enqueue, and
+ * a hardware filter index.
*/
struct rps_dev_flow {
u16 cpu;
- u16 fill;
+ u16 filter;
unsigned int last_qtail;
};
+#define RPS_NO_FILTER 0xffff
/*
* The rps_dev_flow_table structure contains a table of flow mappings.
@@ -587,6 +589,9 @@ static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
+extern bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id);
+
/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
struct rps_map __rcu *rps_map;
@@ -706,6 +711,13 @@ struct netdev_rx_queue {
* int (*ndo_set_vf_port)(struct net_device *dev, int vf,
* struct nlattr *port[]);
* int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
+ *
+ * RFS acceleration.
+ * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
+ * u16 rxq_index, u32 flow_id);
+ * Set hardware filter for RFS. rxq_index is the target queue index;
+ * flow_id is a flow ID to be passed to rps_may_expire_flow() later.
+ * Return the filter ID on success, or a negative error code.
*/
#define HAVE_NET_DEVICE_OPS
struct net_device_ops {
@@ -778,6 +790,12 @@ struct net_device_ops {
int (*ndo_fcoe_get_wwn)(struct net_device *dev,
u64 *wwn, int type);
#endif
+#ifdef CONFIG_RPS
+ int (*ndo_rx_flow_steer)(struct net_device *dev,
+ const struct sk_buff *skb,
+ u16 rxq_index,
+ u32 flow_id);
+#endif
};
/*
@@ -992,6 +1010,13 @@ struct net_device {
/* Number of RX queues currently active in device */
unsigned int real_num_rx_queues;
+
+#ifdef CONFIG_CPU_RMAP
+ /* CPU reverse-mapping for RX completion interrupts, indexed
+ * by RX queue number. Assigned by driver. This must only be
+ * set if the ndo_rx_flow_steer operation is defined. */
+ struct cpu_rmap *rx_cpu_rmap;
+#endif
#endif
rx_handler_func_t __rcu *rx_handler;
diff --git a/net/Kconfig b/net/Kconfig
index 55fd82e..f330a45 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -218,6 +218,7 @@ source "net/dns_resolver/Kconfig"
config RPS
boolean
depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
+ select CPU_RMAP
default y
menu "Network testing"
diff --git a/net/core/dev.c b/net/core/dev.c
index 381b8e2..7650ca9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -132,6 +132,7 @@
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
#include "net-sysfs.h"
@@ -2435,6 +2436,49 @@ EXPORT_SYMBOL(__skb_get_rxhash);
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+ struct rps_dev_flow *rflow, u16 next_cpu)
+{
+ struct netdev_rx_queue *rxqueue;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *old_rflow;
+ u32 flow_id;
+ u16 rxq_index;
+ u16 tcpu;
+ int rc;
+
+ tcpu = rflow->cpu = next_cpu;
+ if (tcpu == RPS_NO_CPU)
+ return rflow;
+
+ /* Should we steer this flow to a different hardware queue? */
+ if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap)
+ goto out;
+ rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+ if (rxq_index == skb_get_rx_queue(skb))
+ goto out;
+
+ rxqueue = dev->_rx + rxq_index;
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (!flow_table)
+ goto out;
+ flow_id = skb->rxhash & flow_table->mask;
+ rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id);
+ if (rc < 0)
+ goto out;
+ old_rflow = rflow;
+ rflow = &flow_table->flows[flow_id];
+ rflow->cpu = next_cpu;
+ rflow->filter = rc;
+ if (old_rflow->filter == rflow->filter)
+ old_rflow->filter = RPS_NO_FILTER;
+
+out:
+ rflow->last_qtail = per_cpu(softnet_data, tcpu).input_queue_head;
+ return rflow;
+}
+
/*
* get_rps_cpu is called from netif_receive_skb and returns the target
* CPU from the RPS map of the receiving queue for a given skb.
@@ -2505,12 +2549,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
if (unlikely(tcpu != next_cpu) &&
(tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
((int)(per_cpu(softnet_data, tcpu).input_queue_head -
- rflow->last_qtail)) >= 0)) {
- tcpu = rflow->cpu = next_cpu;
- if (tcpu != RPS_NO_CPU)
- rflow->last_qtail = per_cpu(softnet_data,
- tcpu).input_queue_head;
- }
+ rflow->last_qtail)) >= 0))
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+
if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
*rflowp = rflow;
cpu = tcpu;
@@ -2531,6 +2572,42 @@ done:
return cpu;
}
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+ u32 flow_id, u16 filter_id)
+{
+ struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+ struct rps_dev_flow_table *flow_table;
+ struct rps_dev_flow *rflow;
+ bool expire = true;
+ int cpu;
+
+ rcu_read_lock();
+ flow_table = rcu_dereference(rxqueue->rps_flow_table);
+ if (flow_table && flow_id <= flow_table->mask) {
+ rflow = &flow_table->flows[flow_id];
+ cpu = ACCESS_ONCE(rflow->cpu);
+ if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+ ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+ rflow->last_qtail) <
+ (int)(10 * flow_table->mask)))
+ expire = false;
+ }
+ rcu_read_unlock();
+ return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
--
1.7.3.2
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
* [RFC][PATCH 4/5] sfc: Limit filter search depth further for performance hints (i.e. RFS)
2010-11-19 18:42 [RFC][PATCH 0/5] RFS hardware acceleration (v2) Ben Hutchings
` (2 preceding siblings ...)
2010-11-19 18:47 ` [RFC][PATCH 3/5] net: RPS: Enable hardware acceleration Ben Hutchings
@ 2010-11-19 18:47 ` Ben Hutchings
2010-11-19 18:48 ` [RFC][PATCH 5/5] sfc: Implement RFS acceleration Ben Hutchings
2010-11-19 19:19 ` [RFC][PATCH 0/5] RFS hardware acceleration (v2) Ben Hutchings
5 siblings, 0 replies; 9+ messages in thread
From: Ben Hutchings @ 2010-11-19 18:47 UTC
To: David Miller, Tom Herbert; +Cc: netdev, linux-net-drivers
---
drivers/net/sfc/filter.c | 13 +++++++++----
1 files changed, 9 insertions(+), 4 deletions(-)
diff --git a/drivers/net/sfc/filter.c b/drivers/net/sfc/filter.c
index 52cb608..e0ad1b8 100644
--- a/drivers/net/sfc/filter.c
+++ b/drivers/net/sfc/filter.c
@@ -26,6 +26,10 @@
*/
#define FILTER_CTL_SRCH_MAX 200
+/* Don't try very hard to find space for performance hints, as this is
+ * counter-productive. */
+#define FILTER_CTL_SRCH_HINT_MAX 5
+
struct efx_filter_table {
u32 offset; /* address of table relative to BAR */
unsigned size; /* number of entries */
@@ -182,15 +186,16 @@ static int efx_filter_search(struct efx_filter_table *table,
struct efx_filter_spec *spec, u32 key,
bool for_insert, int *depth_required)
{
- unsigned hash, incr, filter_idx, depth;
+ unsigned hash, incr, filter_idx, depth, depth_max;
struct efx_filter_spec *cmp;
hash = efx_filter_hash(key);
incr = efx_filter_increment(key);
+ depth_max = (spec->priority <= EFX_FILTER_PRI_HINT ?
+ FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX);
for (depth = 1, filter_idx = hash & (table->size - 1);
- depth <= FILTER_CTL_SRCH_MAX &&
- test_bit(filter_idx, table->used_bitmap);
+ depth <= depth_max && test_bit(filter_idx, table->used_bitmap);
++depth) {
cmp = &table->spec[filter_idx];
if (efx_filter_equal(spec, cmp))
@@ -199,7 +204,7 @@ static int efx_filter_search(struct efx_filter_table *table,
}
if (!for_insert)
return -ENOENT;
- if (depth > FILTER_CTL_SRCH_MAX)
+ if (depth > depth_max)
return -EBUSY;
found:
*depth_required = depth;
--
1.7.3.2
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
* [RFC][PATCH 5/5] sfc: Implement RFS acceleration
2010-11-19 18:42 [RFC][PATCH 0/5] RFS hardware acceleration (v2) Ben Hutchings
` (3 preceding siblings ...)
2010-11-19 18:47 ` [RFC][PATCH 4/5] sfc: Limit filter search depth further for performance hints (i.e. RFS) Ben Hutchings
@ 2010-11-19 18:48 ` Ben Hutchings
2010-11-19 19:19 ` [RFC][PATCH 0/5] RFS hardware acceleration (v2) Ben Hutchings
5 siblings, 0 replies; 9+ messages in thread
From: Ben Hutchings @ 2010-11-19 18:48 UTC
To: David Miller, Tom Herbert; +Cc: netdev, linux-net-drivers
---
drivers/net/sfc/Kconfig | 4 ++
drivers/net/sfc/efx.c | 66 ++++++++++++++++++++++++++----
drivers/net/sfc/efx.h | 9 ++++
drivers/net/sfc/filter.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 170 insertions(+), 9 deletions(-)
diff --git a/drivers/net/sfc/Kconfig b/drivers/net/sfc/Kconfig
index a65c986..8d286c3 100644
--- a/drivers/net/sfc/Kconfig
+++ b/drivers/net/sfc/Kconfig
@@ -20,3 +20,7 @@ config SFC_MTD
This exposes the on-board flash memory as MTD devices (e.g.
/dev/mtd1). This makes it possible to upload new firmware
to the NIC.
+config SFC_RFS_ACCEL
+ bool
+ depends on SFC && RPS && GENERIC_HARDIRQS
+ default y
diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index 05df20e..ee2118a 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -21,6 +21,7 @@
#include <linux/ethtool.h>
#include <linux/topology.h>
#include <linux/gfp.h>
+#include <linux/cpu_rmap.h>
#include "net_driver.h"
#include "efx.h"
#include "mdio_10g.h"
@@ -119,6 +120,8 @@ static int napi_weight = 64;
* monitor. On Falcon-based NICs, this will:
* - Check the on-board hardware monitor;
* - Poll the link state and reconfigure the hardware as necessary.
+ * If RFS is enabled, this will scan part of the RX IP filter table and
+ * remove filters for inactive flows.
*/
static unsigned int efx_monitor_interval = 1 * HZ;
@@ -1163,10 +1166,32 @@ static int efx_wanted_channels(void)
return count;
}
+static int
+efx_init_rx_cpu_rmap(struct efx_nic *efx, struct msix_entry *xentries)
+{
+#ifdef CONFIG_SFC_RFS_ACCEL
+ int i, rc;
+
+ efx->net_dev->rx_cpu_rmap = alloc_irq_cpu_rmap(efx->n_rx_channels);
+ if (!efx->net_dev->rx_cpu_rmap)
+ return -ENOMEM;
+ for (i = 0; i < efx->n_rx_channels; i++) {
+ rc = irq_cpu_rmap_add(efx->net_dev->rx_cpu_rmap,
+ xentries[i].vector);
+ if (rc) {
+ free_irq_cpu_rmap(efx->net_dev->rx_cpu_rmap);
+ efx->net_dev->rx_cpu_rmap = NULL;
+ return rc;
+ }
+ }
+#endif
+ return 0;
+}
+
/* Probe the number and type of interrupts we are able to obtain, and
* the resulting numbers of channels and RX queues.
*/
-static void efx_probe_interrupts(struct efx_nic *efx)
+static int efx_probe_interrupts(struct efx_nic *efx)
{
int max_channels =
min_t(int, efx->type->phys_addr_channels, EFX_MAX_CHANNELS);
@@ -1208,6 +1233,11 @@ static void efx_probe_interrupts(struct efx_nic *efx)
efx->n_tx_channels = efx->n_channels;
efx->n_rx_channels = efx->n_channels;
}
+ rc = efx_init_rx_cpu_rmap(efx, xentries);
+ if (rc) {
+ pci_disable_msix(efx->pci_dev);
+ return rc;
+ }
for (i = 0; i < n_channels; i++)
efx_get_channel(efx, i)->irq =
xentries[i].vector;
@@ -1241,6 +1271,8 @@ static void efx_probe_interrupts(struct efx_nic *efx)
efx->n_tx_channels = 1;
efx->legacy_irq = efx->pci_dev->irq;
}
+
+ return 0;
}
static void efx_remove_interrupts(struct efx_nic *efx)
@@ -1299,7 +1331,9 @@ static int efx_probe_nic(struct efx_nic *efx)
/* Determine the number of channels and queues by trying to hook
* in MSI-X interrupts. */
- efx_probe_interrupts(efx);
+ rc = efx_probe_interrupts(efx);
+ if (rc)
+ goto fail;
if (efx->n_channels > 1)
get_random_bytes(&efx->rx_hash_key, sizeof(efx->rx_hash_key));
@@ -1314,6 +1348,10 @@ static int efx_probe_nic(struct efx_nic *efx)
efx_init_irq_moderation(efx, tx_irq_mod_usec, rx_irq_mod_usec, true);
return 0;
+
+fail:
+ efx->type->remove(efx);
+ return rc;
}
static void efx_remove_nic(struct efx_nic *efx)
@@ -1411,13 +1449,15 @@ static void efx_start_all(struct efx_nic *efx)
if (efx->reset_pending != RESET_TYPE_NONE)
efx_mcdi_mode_poll(efx);
- /* Start the hardware monitor if there is one. Otherwise (we're link
- * event driven), we have to poll the PHY because after an event queue
- * flush, we could have a missed a link state change */
- if (efx->type->monitor != NULL) {
+ /* Start the periodic monitor if necessary */
+ if (efx->type->monitor || efx_filter_rfs_enabled())
queue_delayed_work(efx->workqueue, &efx->monitor_work,
efx_monitor_interval);
- } else {
+
+ /* If we normally rely on link state events, we have to poll
+ * the PHY because after an event queue flush, we could have a
+ * missed a link state change */
+ if (!efx->type->monitor) {
mutex_lock(&efx->mac_lock);
if (efx->phy_op->poll(efx))
efx_link_status_changed(efx);
@@ -1548,17 +1588,18 @@ static void efx_monitor(struct work_struct *data)
netif_vdbg(efx, timer, efx->net_dev,
"hardware monitor executing on CPU %d\n",
raw_smp_processor_id());
- BUG_ON(efx->type->monitor == NULL);
/* If the mac_lock is already held then it is likely a port
* reconfiguration is already in place, which will likely do
* most of the work of monitor() anyway. */
- if (mutex_trylock(&efx->mac_lock)) {
+ if (efx->type->monitor && mutex_trylock(&efx->mac_lock)) {
if (efx->port_enabled)
efx->type->monitor(efx);
mutex_unlock(&efx->mac_lock);
}
+ efx_filter_rfs_expire(efx);
+
queue_delayed_work(efx->workqueue, &efx->monitor_work,
efx_monitor_interval);
}
@@ -1841,6 +1882,9 @@ static const struct net_device_ops efx_netdev_ops = {
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = efx_netpoll,
#endif
+#ifdef CONFIG_SFC_RFS_ACCEL
+ .ndo_rx_flow_steer = efx_filter_rfs,
+#endif
};
static void efx_update_name(struct efx_nic *efx)
@@ -2276,6 +2320,10 @@ static void efx_fini_struct(struct efx_nic *efx)
*/
static void efx_pci_remove_main(struct efx_nic *efx)
{
+#ifdef CONFIG_SFC_RFS_ACCEL
+ free_irq_cpu_rmap(efx->net_dev->rx_cpu_rmap);
+ efx->net_dev->rx_cpu_rmap = NULL;
+#endif
efx_nic_fini_interrupt(efx);
efx_fini_channels(efx);
efx_fini_port(efx);
diff --git a/drivers/net/sfc/efx.h b/drivers/net/sfc/efx.h
index 10a1bf4..8b8cf63 100644
--- a/drivers/net/sfc/efx.h
+++ b/drivers/net/sfc/efx.h
@@ -77,6 +77,15 @@ extern int efx_filter_remove_filter(struct efx_nic *efx,
extern void efx_filter_table_clear(struct efx_nic *efx,
enum efx_filter_table_id table_id,
enum efx_filter_priority priority);
+#ifdef CONFIG_SFC_RFS_ACCEL
+extern int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+ u16 rxq_index, u32 flow_id);
+extern void efx_filter_rfs_expire(struct efx_nic *efx);
+#define efx_filter_rfs_enabled() 1
+#else
+static inline void efx_filter_rfs_expire(struct efx_nic *efx) {}
+#define efx_filter_rfs_enabled() 0
+#endif
/* Channels */
extern void efx_process_channel_now(struct efx_channel *channel);
diff --git a/drivers/net/sfc/filter.c b/drivers/net/sfc/filter.c
index e0ad1b8..2f64703 100644
--- a/drivers/net/sfc/filter.c
+++ b/drivers/net/sfc/filter.c
@@ -7,6 +7,8 @@
* by the Free Software Foundation, incorporated herein by reference.
*/
+#include <net/ip.h>
+
#include "efx.h"
#include "filter.h"
#include "io.h"
@@ -43,6 +45,10 @@ struct efx_filter_state {
spinlock_t lock;
struct efx_filter_table table[EFX_FILTER_TABLE_COUNT];
unsigned search_depth[EFX_FILTER_TYPE_COUNT];
+#ifdef CONFIG_SFC_RFS_ACCEL
+ u32 *rps_flow_id;
+ unsigned rps_expire_index;
+#endif
};
/* The filter hash function is LFSR polynomial x^16 + x^3 + 1 of a 32-bit
@@ -411,6 +417,13 @@ int efx_probe_filters(struct efx_nic *efx)
spin_lock_init(&state->lock);
if (efx_nic_rev(efx) >= EFX_REV_FALCON_B0) {
+#ifdef CONFIG_SFC_RFS_ACCEL
+ state->rps_flow_id = kcalloc(FR_BZ_RX_FILTER_TBL0_ROWS,
+ sizeof(*state->rps_flow_id),
+ GFP_KERNEL);
+ if (!state->rps_flow_id)
+ goto fail;
+#endif
table = &state->table[EFX_FILTER_TABLE_RX_IP];
table->offset = FR_BZ_RX_FILTER_TBL0;
table->size = FR_BZ_RX_FILTER_TBL0_ROWS;
@@ -455,5 +468,92 @@ void efx_remove_filters(struct efx_nic *efx)
kfree(state->table[table_id].used_bitmap);
vfree(state->table[table_id].spec);
}
+#ifdef CONFIG_SFC_RFS_ACCEL
+ kfree(state->rps_flow_id);
+#endif
kfree(state);
}
+
+#ifdef CONFIG_SFC_RFS_ACCEL
+
+int efx_filter_rfs(struct net_device *net_dev, const struct sk_buff *skb,
+ u16 rxq_index, u32 flow_id)
+{
+ struct efx_nic *efx = netdev_priv(net_dev);
+ struct efx_filter_state *state = efx->filter_state;
+ struct efx_filter_spec spec;
+ const struct iphdr *ip;
+ const __be16 *ports;
+ int nhoff;
+ int rc;
+
+ nhoff = skb_network_offset(skb);
+
+ if (skb->protocol != htons(ETH_P_IP))
+ return -EPROTONOSUPPORT;
+
+ /* RFS must validate the IP header length before calling us */
+ EFX_BUG_ON_PARANOID(!pskb_may_pull(skb, nhoff + sizeof(*ip)));
+ ip = (const struct iphdr *)(skb->data + nhoff);
+ if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+ return -EPROTONOSUPPORT;
+ EFX_BUG_ON_PARANOID(!pskb_may_pull(skb, nhoff + 4 * ip->ihl + 4));
+ ports = (const __be16 *)(skb->data + nhoff + 4 * ip->ihl);
+
+ switch (ip->protocol) {
+ case IPPROTO_TCP:
+ efx_filter_set_rx_tcp_full(&spec,
+ ntohl(ip->saddr), ntohs(ports[0]),
+ ntohl(ip->daddr), ntohs(ports[1]));
+ break;
+ case IPPROTO_UDP:
+ efx_filter_set_rx_udp_full(&spec,
+ ntohl(ip->saddr), ntohs(ports[0]),
+ ntohl(ip->daddr), ntohs(ports[1]));
+ break;
+ default:
+ return -EPROTONOSUPPORT;
+ }
+ spec.priority = EFX_FILTER_PRI_HINT;
+ spec.dmaq_id = rxq_index;
+
+ rc = efx_filter_insert_filter(efx, &spec, true);
+ if (rc >= 0)
+ state->rps_flow_id[rc] = flow_id;
+
+ return rc;
+}
+
+void efx_filter_rfs_expire(struct efx_nic *efx)
+{
+ struct efx_filter_state *state = efx->filter_state;
+ struct efx_filter_table *table = &state->table[EFX_FILTER_TABLE_RX_IP];
+ unsigned mask = table->size - 1;
+ unsigned index;
+ unsigned stop;
+
+ spin_lock_bh(&state->lock);
+
+ /* Check filters in batches of 1024 */
+ index = state->rps_expire_index;
+ stop = (index + 1024) & mask;
+
+ while (index != stop) {
+ if (test_bit(index, table->used_bitmap) &&
+ table->spec[index].priority == EFX_FILTER_PRI_HINT &&
+ rps_may_expire_flow(efx->net_dev,
+ table->spec[index].dmaq_id,
+ state->rps_flow_id[index], index))
+ efx_filter_table_clear_entry(efx, table, index);
+ index = (index + 1) & mask;
+ }
+
+ state->rps_expire_index = stop;
+ if (table->used == 0)
+ efx_filter_table_reset_search_depth(state,
+ EFX_FILTER_TABLE_RX_IP);
+
+ spin_unlock_bh(&state->lock);
+}
+
+#endif /* CONFIG_SFC_RFS_ACCEL */
--
1.7.3.2
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
* Re: [RFC][PATCH 0/5] RFS hardware acceleration (v2)
2010-11-19 18:42 [RFC][PATCH 0/5] RFS hardware acceleration (v2) Ben Hutchings
` (4 preceding siblings ...)
2010-11-19 18:48 ` [RFC][PATCH 5/5] sfc: Implement RFS acceleration Ben Hutchings
@ 2010-11-19 19:19 ` Ben Hutchings
2010-11-19 19:42 ` Tom Herbert
2010-11-19 21:16 ` Rick Jones
5 siblings, 2 replies; 9+ messages in thread
From: Ben Hutchings @ 2010-11-19 19:19 UTC
To: David Miller, Tom Herbert, Rick Jones; +Cc: netdev, linux-net-drivers
So, some preliminary benchmark results.
Tom said he was using 200 concurrent netperf TCP_RR tests, so I've done
the same, using netperf 2.4.1 (a bit out of date, I know).
The test machines were a Dell R810 with 4 * Xeon E7520 1.87 GHz and a
Dell R900 with 4 * Xeon X7350 2.92 GHz (both quad-core processors, with
HT disabled, for a total of 16 cores).
The kernel was an x86-64 build of net-next-2.6 with NUMA and
PREEMPT_VOLUNTARY enabled.
The NICs were Solarstorm SFN5122F (dual-port SFP+) adapters connected
with a Direct Attach cable.
The sfc driver allocates 4 IRQs per port (and it doesn't seem to be
possible to allocate more on this hardware), which I pinned to the first
core of each package.
I tested with and without pinning of processes. When pinning, I
assigned netperf and netserver processes to all 16 cores in rotation.
                        Unpinned                        Pinned
                No RFS  Soft RFS  Accel RFS    No RFS  Soft RFS  Accel RFS

Request size = 1, response size = 1, moderation = 60 us adaptive (default)
avg(Hz)           1759      3213       3633      2222      3523       3848
std.dev            189        76        265       703      2136       2120
lat(us)            568       311        275       450       284        260
scaled                      0.55       0.88                0.63       0.92

Request size = 1, response size = 1, moderation = 20 us adaptive
avg(Hz)           1797      3616       3917      2458      3706       4125
std.dev            260       101        295      1098      1987       2186
lat(us)            556       277        255       407       270        242
scaled                      0.50       0.92                0.66       0.90

Request size = 100, response size = 10000, moderation = 60 us adaptive
avg(Hz)           1658      2909       3230      2338      3003       3437
std.dev            149       144        221       993       856       1615
lat(us)            603       344        310       428       333        291
scaled                      0.57       0.90                0.78       0.87

Request size = 100, response size = 10000, moderation = 20 us adaptive
avg(Hz)           3348      3110       3331      2470      3271       3487
std.dev            406       176        364      1110      1693       1817
lat(us)            299       322        300       405       306        287
scaled                      1.08       0.93                0.76       0.94
So accelerated RFS gave a 6-13% improvement over software RFS in
transaction rate for these various cases.
Ben.
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
* Re: [RFC][PATCH 0/5] RFS hardware acceleration (v2)
2010-11-19 19:19 ` [RFC][PATCH 0/5] RFS hardware acceleration (v2) Ben Hutchings
@ 2010-11-19 19:42 ` Tom Herbert
2010-11-19 21:16 ` Rick Jones
1 sibling, 0 replies; 9+ messages in thread
From: Tom Herbert @ 2010-11-19 19:42 UTC
To: Ben Hutchings; +Cc: David Miller, Rick Jones, netdev, linux-net-drivers
> So accelerated RFS gave a 6-13% improvement over software RFS in
> transaction rate for these various cases.
>
Very nice preliminary results! Are you seeing any OOO packets in
these tests? I am still holding on to hope that these can be avoided
with such mechanisms.
Tom
* Re: [RFC][PATCH 0/5] RFS hardware acceleration (v2)
2010-11-19 19:19 ` [RFC][PATCH 0/5] RFS hardware acceleration (v2) Ben Hutchings
2010-11-19 19:42 ` Tom Herbert
@ 2010-11-19 21:16 ` Rick Jones
1 sibling, 0 replies; 9+ messages in thread
From: Rick Jones @ 2010-11-19 21:16 UTC
To: Ben Hutchings; +Cc: David Miller, Tom Herbert, netdev, linux-net-drivers
Ben Hutchings wrote:
> So, some preliminary benchmark results.
>
> Tom said he was using 200 concurrent netperf TCP_RR tests, so I've done
> the same, using netperf 2.4.1 (a bit out of date, I know).
Not a huge deal for a basic TCP_RR test though.
<data snipped>
> So accelerated RFS gave a 6-13% improvement over software RFS in
> transaction rate for these various cases.
Do you have any data on the frequency with which unpinned netperf processes migrated
from one core to another? Or PCIe utilization?
rick jones