Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next-2.6 4/5] sfc: Limit filter search depth further for performance hints (i.e. RFS)
From: Ben Hutchings @ 2011-01-19 21:06 UTC (permalink / raw)
  To: Tom Herbert; +Cc: netdev, linux-net-drivers
In-Reply-To: <1295470787.11126.82.camel@bwh-desktop>

---
I consider this experimental still; so it's not signed-off.

Ben.

 drivers/net/sfc/filter.c |   13 +++++++++----
 1 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/sfc/filter.c b/drivers/net/sfc/filter.c
index d4722c4..47a1b79 100644
--- a/drivers/net/sfc/filter.c
+++ b/drivers/net/sfc/filter.c
@@ -27,6 +27,10 @@
  */
 #define FILTER_CTL_SRCH_MAX 200
 
+/* Don't try very hard to find space for performance hints, as this is
+ * counter-productive.  efx_filter_search() uses this as its depth limit
+ * for specs whose priority is <= EFX_FILTER_PRI_HINT; all other specs
+ * are still limited by FILTER_CTL_SRCH_MAX. */
+#define FILTER_CTL_SRCH_HINT_MAX 5
+
 enum efx_filter_table_id {
 	EFX_FILTER_TABLE_RX_IP = 0,
 	EFX_FILTER_TABLE_RX_MAC,
@@ -325,15 +329,16 @@ static int efx_filter_search(struct efx_filter_table *table,
 			     struct efx_filter_spec *spec, u32 key,
 			     bool for_insert, int *depth_required)
 {
-	unsigned hash, incr, filter_idx, depth;
+	unsigned hash, incr, filter_idx, depth, depth_max;
 	struct efx_filter_spec *cmp;
 
 	hash = efx_filter_hash(key);
 	incr = efx_filter_increment(key);
+	depth_max = (spec->priority <= EFX_FILTER_PRI_HINT ?
+		     FILTER_CTL_SRCH_HINT_MAX : FILTER_CTL_SRCH_MAX);
 
 	for (depth = 1, filter_idx = hash & (table->size - 1);
-	     depth <= FILTER_CTL_SRCH_MAX &&
-		     test_bit(filter_idx, table->used_bitmap);
+	     depth <= depth_max && test_bit(filter_idx, table->used_bitmap);
 	     ++depth) {
 		cmp = &table->spec[filter_idx];
 		if (efx_filter_equal(spec, cmp))
@@ -342,7 +347,7 @@ static int efx_filter_search(struct efx_filter_table *table,
 	}
 	if (!for_insert)
 		return -ENOENT;
-	if (depth > FILTER_CTL_SRCH_MAX)
+	if (depth > depth_max)
 		return -EBUSY;
 found:
 	*depth_required = depth;
-- 
1.7.3.4



-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* [PATCH net-next-2.6 3/5] net: RPS: Enable hardware acceleration of RFS
From: Ben Hutchings @ 2011-01-19 21:03 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-net-drivers, Tom Herbert
In-Reply-To: <1295470787.11126.82.camel@bwh-desktop>

Allow drivers for multiqueue hardware with flow filter tables to
accelerate RFS.  The driver must:

1. Set net_device::rx_cpu_rmap to a cpu_rmap of the RX completion
IRQs (in queue order).  This will provide a mapping from CPUs to the
queues for which completions are handled nearest to them.

2. Implement net_device_ops::ndo_rx_flow_steer.  This operation adds
or replaces a filter steering the given flow to the given RX queue, if
possible.

3. Periodically remove filters for which rps_may_expire_flow() returns
true.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 include/linux/netdevice.h |   33 ++++++++++++++-
 net/Kconfig               |    6 +++
 net/core/dev.c            |   97 ++++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 127 insertions(+), 9 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d971346..abe8b2d 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -551,14 +551,16 @@ struct rps_map {
 #define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
 
 /*
- * The rps_dev_flow structure contains the mapping of a flow to a CPU and the
- * tail pointer for that CPU's input queue at the time of last enqueue.
+ * The rps_dev_flow structure contains the mapping of a flow to a CPU, the
+ * tail pointer for that CPU's input queue at the time of last enqueue, and
+ * a hardware filter index.
  */
 struct rps_dev_flow {
 	u16 cpu;
-	u16 fill;
+	u16 filter;
 	unsigned int last_qtail;
 };
+#define RPS_NO_FILTER 0xffff
 
 /*
  * The rps_dev_flow_table structure contains a table of flow mappings.
@@ -608,6 +610,11 @@ static inline void rps_reset_sock_flow(struct rps_sock_flow_table *table,
 
 extern struct rps_sock_flow_table __rcu *rps_sock_flow_table;
 
+#ifdef CONFIG_RFS_ACCEL
+extern bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+				u32 flow_id, u16 filter_id);
+#endif
+
 /* This structure contains an instance of an RX queue. */
 struct netdev_rx_queue {
 	struct rps_map __rcu		*rps_map;
@@ -753,6 +760,13 @@ struct xps_dev_maps {
  * int (*ndo_set_vf_port)(struct net_device *dev, int vf,
  *			  struct nlattr *port[]);
  * int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
+ *
+ *	RFS acceleration.
+ * int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
+ *			    u16 rxq_index, u32 flow_id);
+ *	Set hardware filter for RFS.  rxq_index is the target queue index;
+ *	flow_id is a flow ID to be passed to rps_may_expire_flow() later.
+ *	Return the filter ID on success, or a negative error code.
  */
 #define HAVE_NET_DEVICE_OPS
 struct net_device_ops {
@@ -825,6 +839,12 @@ struct net_device_ops {
 	int			(*ndo_fcoe_get_wwn)(struct net_device *dev,
 						    u64 *wwn, int type);
 #endif
+#ifdef CONFIG_RFS_ACCEL
+	int			(*ndo_rx_flow_steer)(struct net_device *dev,
+						     const struct sk_buff *skb,
+						     u16 rxq_index,
+						     u32 flow_id);
+#endif
 };
 
 /*
@@ -1039,6 +1059,13 @@ struct net_device {
 
 	/* Number of RX queues currently active in device */
 	unsigned int		real_num_rx_queues;
+
+#ifdef CONFIG_RFS_ACCEL
+	/* CPU reverse-mapping for RX completion interrupts, indexed
+	 * by RX queue number.  Assigned by driver.  This must only be
+	 * set if the ndo_rx_flow_steer operation is defined. */
+	struct cpu_rmap		*rx_cpu_rmap;
+#endif
 #endif
 
 	rx_handler_func_t __rcu	*rx_handler;
diff --git a/net/Kconfig b/net/Kconfig
index 7284062..79cabf1 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -221,6 +221,12 @@ config RPS
 	depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
 	default y
 
+config RFS_ACCEL
+	boolean
+	depends on RPS && GENERIC_HARDIRQS
+	select CPU_RMAP
+	default y
+
 config XPS
 	boolean
 	depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
diff --git a/net/core/dev.c b/net/core/dev.c
index 7741507..3c18d9c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -132,6 +132,7 @@
 #include <trace/events/skb.h>
 #include <linux/pci.h>
 #include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
 
 #include "net-sysfs.h"
 
@@ -2532,6 +2533,53 @@ EXPORT_SYMBOL(__skb_get_rxhash);
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
 
+/*
+ * Record next_cpu as the target CPU of a flow.  When RFS acceleration is
+ * configured, additionally ask the driver to steer the flow to the RX
+ * queue that cpu_rmap_lookup_index() gives for next_cpu.  Returns the
+ * flow entry that now describes the flow: either rflow itself, or the
+ * entry in the target queue's flow table if a hardware filter was set.
+ */
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+	    struct rps_dev_flow *rflow, u16 next_cpu)
+{
+	rflow->cpu = next_cpu;
+	if (next_cpu == RPS_NO_CPU)
+		return rflow;
+
+#ifdef CONFIG_RFS_ACCEL
+	/* Steering needs both the queue the packet arrived on and the
+	 * driver's CPU reverse-map. */
+	if (skb_rx_queue_recorded(skb) && dev->rx_cpu_rmap) {
+		struct rps_dev_flow_table *hw_table = NULL;
+		u16 hw_rxq;
+
+		/* Nothing to do if the flow already arrives on that queue */
+		hw_rxq = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+		if (hw_rxq != skb_get_rx_queue(skb))
+			hw_table = rcu_dereference(
+					dev->_rx[hw_rxq].rps_flow_table);
+
+		if (hw_table) {
+			u32 flow_id = skb->rxhash & hw_table->mask;
+			int filter_id;
+
+			filter_id = dev->netdev_ops->ndo_rx_flow_steer(
+					dev, skb, hw_rxq, flow_id);
+			if (filter_id >= 0) {
+				struct rps_dev_flow *prev_rflow = rflow;
+
+				/* Continue tracking the flow in the target
+				 * queue's table; the previous entry gives up
+				 * this filter ID if it held the same one. */
+				rflow = &hw_table->flows[flow_id];
+				rflow->cpu = next_cpu;
+				rflow->filter = filter_id;
+				if (prev_rflow->filter == rflow->filter)
+					prev_rflow->filter = RPS_NO_FILTER;
+			}
+		}
+	}
+#endif
+	rflow->last_qtail = per_cpu(softnet_data, next_cpu).input_queue_head;
+
+	return rflow;
+}
+
 /*
  * get_rps_cpu is called from netif_receive_skb and returns the target
  * CPU from the RPS map of the receiving queue for a given skb.
@@ -2602,12 +2650,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		if (unlikely(tcpu != next_cpu) &&
 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
-		      rflow->last_qtail)) >= 0)) {
-			tcpu = rflow->cpu = next_cpu;
-			if (tcpu != RPS_NO_CPU)
-				rflow->last_qtail = per_cpu(softnet_data,
-				    tcpu).input_queue_head;
-		}
+		      rflow->last_qtail)) >= 0))
+			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+
 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
 			*rflowp = rflow;
 			cpu = tcpu;
@@ -2628,6 +2673,46 @@ done:
 	return cpu;
 }
 
+#ifdef CONFIG_RFS_ACCEL
+
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index; selects the flow table that is consulted
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ *
+ * A filter is kept (%false is returned) only while the flow table entry
+ * for @flow_id still carries @filter_id, has a CPU assigned, and that
+ * CPU's input queue head has advanced by less than 10 times the table
+ * mask since the flow's last recorded enqueue.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+			 u32 flow_id, u16 filter_id)
+{
+	struct rps_dev_flow_table *table;
+	bool in_use = false;
+
+	rcu_read_lock();
+	table = rcu_dereference(dev->_rx[rxq_index].rps_flow_table);
+	if (table && flow_id <= table->mask) {
+		struct rps_dev_flow *entry = &table->flows[flow_id];
+		int cpu = ACCESS_ONCE(entry->cpu);
+
+		if (entry->filter == filter_id && cpu != RPS_NO_CPU) {
+			/* Distance from the queue tail recorded at the
+			 * flow's last enqueue to the CPU's current head */
+			int advance =
+				(int)(per_cpu(softnet_data,
+					      cpu).input_queue_head -
+				      entry->last_qtail);
+
+			in_use = advance < (int)(10 * table->mask);
+		}
+	}
+	rcu_read_unlock();
+
+	return !in_use;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
+
 /* Called from hardirq (IPI) context */
 static void rps_trigger_softirq(void *data)
 {
-- 
1.7.3.4



-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* [PATCH net-next-2.6 2/5] lib: cpu_rmap: CPU affinity reverse-mapping
From: Ben Hutchings @ 2011-01-19 21:03 UTC (permalink / raw)
  To: David Miller, Thomas Gleixner
  Cc: netdev, linux-net-drivers, Tom Herbert, linux-kernel
In-Reply-To: <1295470787.11126.82.camel@bwh-desktop>

When initiating I/O on a multiqueue and multi-IRQ device, we may want
to select a queue for which the response will be handled on the same
or a nearby CPU.  This requires a reverse-map of IRQ affinity.  Add
library functions to support a generic reverse-mapping from CPUs to
objects with affinity and the specific case where the objects are
IRQs.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
 include/linux/cpu_rmap.h |   73 +++++++++++++
 lib/Kconfig              |    4 +
 lib/Makefile             |    2 +
 lib/cpu_rmap.c           |  269 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 348 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/cpu_rmap.h
 create mode 100644 lib/cpu_rmap.c

diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h
new file mode 100644
index 0000000..473771a
--- /dev/null
+++ b/include/linux/cpu_rmap.h
@@ -0,0 +1,73 @@
+#ifndef __LINUX_CPU_RMAP_H
+#define __LINUX_CPU_RMAP_H
+
+/*
+ * cpu_rmap.h: CPU affinity reverse-map support
+ * Copyright 2011 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+#include <linux/cpumask.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+
+/**
+ * struct cpu_rmap - CPU affinity reverse-map
+ * @size: Number of objects to be reverse-mapped
+ * @used: Number of objects added
+ * @obj: Pointer to array of object pointers
+ * @near: For each CPU, the index and distance to the nearest object,
+ *      based on affinity masks
+ */
+struct cpu_rmap {
+	u16		size, used;
+	void		**obj;
+	struct {
+		u16	index;
+		u16	dist;
+	}		near[0];
+};
+/* Distance recorded for a CPU that has no object at a known distance */
+#define CPU_RMAP_DIST_INF 0xffff
+
+extern struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags);
+
+/**
+ * free_cpu_rmap - free CPU affinity reverse-map
+ * @rmap: Reverse-map allocated with alloc_cpu_rmap(), or %NULL
+ */
+static inline void free_cpu_rmap(struct cpu_rmap *rmap)
+{
+	kfree(rmap);
+}
+
+extern int cpu_rmap_add(struct cpu_rmap *rmap, void *obj);
+extern int cpu_rmap_update(struct cpu_rmap *rmap, u16 index,
+			   const struct cpumask *affinity);
+
+/**
+ * cpu_rmap_lookup_index - look up the index of the object nearest a CPU
+ * @rmap: CPU rmap to search
+ * @cpu: CPU number; used directly as an array index, without range check
+ *
+ * Returns the object index currently recorded for @cpu.
+ */
+static inline u16 cpu_rmap_lookup_index(struct cpu_rmap *rmap, unsigned int cpu)
+{
+	return rmap->near[cpu].index;
+}
+
+/**
+ * cpu_rmap_lookup_obj - look up the object nearest a CPU
+ * @rmap: CPU rmap to search
+ * @cpu: CPU number; used directly as an array index, without range check
+ *
+ * Returns the object pointer stored at the index recorded for @cpu.
+ */
+static inline void *cpu_rmap_lookup_obj(struct cpu_rmap *rmap, unsigned int cpu)
+{
+	return rmap->obj[rmap->near[cpu].index];
+}
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+
+/**
+ * alloc_irq_cpu_rmap - allocate CPU affinity reverse-map for IRQs
+ * @size: Number of objects to be mapped
+ *
+ * Must be called in process context.
+ */
+static inline struct cpu_rmap *alloc_irq_cpu_rmap(unsigned int size)
+{
+	return alloc_cpu_rmap(size, GFP_KERNEL);
+}
+extern void free_irq_cpu_rmap(struct cpu_rmap *rmap);
+
+extern int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq);
+
+#endif /* CONFIG_GENERIC_HARDIRQS */
+
+#endif /* __LINUX_CPU_RMAP_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 0ee67e0..8334342 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -201,6 +201,10 @@ config DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
        bool "Disable obsolete cpumask functions" if DEBUG_PER_CPU_MAPS
        depends on EXPERIMENTAL && BROKEN
 
+config CPU_RMAP
+	bool
+	depends on SMP
+
 #
 # Netlink attribute parsing support is select'ed if needed
 #
diff --git a/lib/Makefile b/lib/Makefile
index cbb774f..b73ba01 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -110,6 +110,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o
 
 obj-$(CONFIG_AVERAGE) += average.o
 
+obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o
+
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
new file mode 100644
index 0000000..987acfa
--- /dev/null
+++ b/lib/cpu_rmap.c
@@ -0,0 +1,269 @@
+/*
+ * cpu_rmap.c: CPU affinity reverse-map support
+ * Copyright 2011 Solarflare Communications Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation, incorporated herein by reference.
+ */
+
+#include <linux/cpu_rmap.h>
+#ifdef CONFIG_GENERIC_HARDIRQS
+#include <linux/interrupt.h>
+#endif
+#include <linux/module.h>
+
+/*
+ * These functions maintain a mapping from CPUs to some ordered set of
+ * objects with CPU affinities.  This can be seen as a reverse-map of
+ * CPU affinity.  However, we do not assume that the object affinities
+ * cover all CPUs in the system.  For those CPUs not directly covered
+ * by object affinities, we attempt to find a nearest object based on
+ * CPU topology.
+ */
+
+/**
+ * alloc_cpu_rmap - allocate CPU affinity reverse-map
+ * @size: Number of objects to be mapped; must be in the range 1..0xffff
+ * @flags: Allocation flags e.g. %GFP_KERNEL
+ *
+ * The map and its object pointer array are carved out of one allocation.
+ *
+ * Returns the new reverse-map, or %NULL if @size is out of range or the
+ * allocation fails.
+ */
+struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags)
+{
+	struct cpu_rmap *rmap;
+	unsigned int cpu;
+	size_t obj_offset;
+
+	/* Zero objects would make the rota below divide by zero.  More
+	 * than 0xffff is a silly number of objects, and we use u16
+	 * indices.
+	 */
+	if (size == 0 || size > 0xffff)
+		return NULL;
+
+	/* Offset of object pointer array from base structure */
+	obj_offset = ALIGN(offsetof(struct cpu_rmap, near[nr_cpu_ids]),
+			   sizeof(void *));
+
+	rmap = kzalloc(obj_offset + size * sizeof(rmap->obj[0]), flags);
+	if (!rmap)
+		return NULL;
+
+	rmap->obj = (void **)((char *)rmap + obj_offset);
+
+	/* Initially assign CPUs to objects on a rota, since we have
+	 * no idea where the objects are.  Use infinite distance, so
+	 * any object with known distance is preferable.  Include the
+	 * CPUs that are not present/online, since we definitely want
+	 * any newly-hotplugged CPUs to have some object assigned.
+	 */
+	for_each_possible_cpu(cpu) {
+		rmap->near[cpu].index = cpu % size;
+		rmap->near[cpu].dist = CPU_RMAP_DIST_INF;
+	}
+
+	rmap->size = size;
+	return rmap;
+}
+EXPORT_SYMBOL(alloc_cpu_rmap);
+
+/* Try to improve the object recorded for the given CPU by adopting the
+ * object of one of the given neighbours.  The first neighbour whose own
+ * distance is no greater than dist is used, and dist is recorded as the
+ * CPU's new distance.  Returns true if the CPU's entry was changed.
+ */
+static bool cpu_rmap_copy_neigh(struct cpu_rmap *rmap, unsigned int cpu,
+				const struct cpumask *mask, u16 dist)
+{
+	int other;
+
+	/* This CPU already has an object at least this near */
+	if (rmap->near[cpu].dist <= dist)
+		return false;
+
+	for_each_cpu(other, mask) {
+		if (rmap->near[other].dist > dist)
+			continue;
+
+		rmap->near[cpu].index = rmap->near[other].index;
+		rmap->near[cpu].dist = dist;
+		return true;
+	}
+
+	return false;
+}
+
+#ifdef DEBUG
+/* Log the object index and distance recorded for every possible CPU,
+ * headed by the rmap address and the given prefix.
+ */
+static void debug_print_rmap(const struct cpu_rmap *rmap, const char *prefix)
+{
+	unsigned int cpu;
+
+	pr_info("cpu_rmap %p, %s:\n", rmap, prefix);
+
+	for_each_possible_cpu(cpu) {
+		unsigned int obj_index = rmap->near[cpu].index;
+		unsigned int obj_dist = rmap->near[cpu].dist;
+
+		pr_info("cpu %d -> obj %u (distance %u)\n",
+			cpu, obj_index, obj_dist);
+	}
+}
+#else
+/* Does nothing unless DEBUG is defined */
+static inline void
+debug_print_rmap(const struct cpu_rmap *rmap, const char *prefix)
+{
+}
+#endif
+
+/**
+ * cpu_rmap_add - add object to a rmap
+ * @rmap: CPU rmap allocated with alloc_cpu_rmap()
+ * @obj: Object to add to rmap
+ *
+ * Objects take consecutive indices in the order they are added.  Adding
+ * more objects than the rmap was allocated for triggers BUG_ON().
+ *
+ * Return index of object.
+ */
+int cpu_rmap_add(struct cpu_rmap *rmap, void *obj)
+{
+	u16 slot = rmap->used;
+
+	BUG_ON(slot >= rmap->size);
+	rmap->used = slot + 1;
+	rmap->obj[slot] = obj;
+	return slot;
+}
+EXPORT_SYMBOL(cpu_rmap_add);
+
+/**
+ * cpu_rmap_update - update CPU rmap following a change of object affinity
+ * @rmap: CPU rmap to update
+ * @index: Index of object whose affinity changed
+ * @affinity: New CPU affinity of object
+ *
+ * Returns 0 on success, or -ENOMEM if the temporary mask of CPUs to be
+ * re-evaluated cannot be allocated.  That mask is allocated with
+ * %GFP_KERNEL.
+ */
+int cpu_rmap_update(struct cpu_rmap *rmap, u16 index,
+		    const struct cpumask *affinity)
+{
+	cpumask_var_t update_mask;
+	unsigned int cpu;
+
+	if (unlikely(!zalloc_cpumask_var(&update_mask, GFP_KERNEL)))
+		return -ENOMEM;
+
+	/* Invalidate distance for all CPUs for which this used to be
+	 * the nearest object.  Mark those CPUs for update.  Only online
+	 * CPUs are visited by this pass.
+	 */
+	for_each_online_cpu(cpu) {
+		if (rmap->near[cpu].index == index) {
+			rmap->near[cpu].dist = CPU_RMAP_DIST_INF;
+			cpumask_set_cpu(cpu, update_mask);
+		}
+	}
+
+	debug_print_rmap(rmap, "after invalidating old distances");
+
+	/* Set distance to 0 for all CPUs in the new affinity mask.
+	 * Mark all CPUs within their NUMA nodes for update.
+	 */
+	for_each_cpu(cpu, affinity) {
+		rmap->near[cpu].index = index;
+		rmap->near[cpu].dist = 0;
+		cpumask_or(update_mask, update_mask,
+			   cpumask_of_node(cpu_to_node(cpu)));
+	}
+
+	debug_print_rmap(rmap, "after updating neighbours");
+
+	/* Update distances based on topology.  Each marked CPU is offered
+	 * the object of a neighbour from topology_thread_cpumask()
+	 * (distance 1), then topology_core_cpumask() (distance 2), then
+	 * its NUMA node (distance 3); the first step that changes the
+	 * CPU's entry ends the search for that CPU.
+	 */
+	for_each_cpu(cpu, update_mask) {
+		if (cpu_rmap_copy_neigh(rmap, cpu,
+					topology_thread_cpumask(cpu), 1))
+			continue;
+		if (cpu_rmap_copy_neigh(rmap, cpu,
+					topology_core_cpumask(cpu), 2))
+			continue;
+		if (cpu_rmap_copy_neigh(rmap, cpu,
+					cpumask_of_node(cpu_to_node(cpu)), 3))
+			continue;
+		/* We could continue into NUMA node distances, but for now
+		 * we give up.  A CPU that reaches this point keeps the
+		 * index and distance it already had.
+		 */
+	}
+
+	debug_print_rmap(rmap, "after copying neighbours");
+
+	free_cpumask_var(update_mask);
+	return 0;
+}
+EXPORT_SYMBOL(cpu_rmap_update);
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+
+/* Glue between IRQ affinity notifiers and CPU rmaps.  One of these is
+ * allocated per IRQ by irq_cpu_rmap_add() and stored as the rmap object. */
+
+struct irq_glue {
+	struct irq_affinity_notify notify;	/* notifier set on the IRQ */
+	struct cpu_rmap *rmap;			/* rmap updated on notify */
+	u16 index;				/* our index in rmap->obj[] */
+};
+
+/**
+ * free_irq_cpu_rmap - free a CPU affinity reverse-map used for IRQs
+ * @rmap: Reverse-map allocated with alloc_irq_cpu_rmap(), or %NULL
+ *
+ * Must be called in process context, before freeing the IRQs, and
+ * without holding any locks required by global workqueue items.
+ */
+void free_irq_cpu_rmap(struct cpu_rmap *rmap)
+{
+	u16 i;
+
+	if (rmap == NULL)
+		return;
+
+	/* Detach the affinity notifier from every IRQ that was added */
+	for (i = 0; i < rmap->used; i++) {
+		struct irq_glue *glue = rmap->obj[i];
+
+		irq_set_affinity_notifier(glue->notify.irq, NULL);
+	}
+
+	/* Let notifications that were already scheduled finish before
+	 * the map they update goes away */
+	irq_run_affinity_notifiers();
+
+	kfree(rmap);
+}
+EXPORT_SYMBOL(free_irq_cpu_rmap);
+
+/* Affinity notifier callback: feed the IRQ's new affinity mask into the
+ * rmap entry that belongs to this notifier's glue.  A failed update is
+ * only logged.
+ */
+static void
+irq_cpu_rmap_notify(struct irq_affinity_notify *notify, const cpumask_t *mask)
+{
+	struct irq_glue *glue;
+	int err;
+
+	glue = container_of(notify, struct irq_glue, notify);
+	err = cpu_rmap_update(glue->rmap, glue->index, mask);
+	if (err)
+		pr_warning("irq_cpu_rmap_notify: update failed: %d\n", err);
+}
+
+/* kref release callback for the notifier: free the enclosing glue */
+static void irq_cpu_rmap_release(struct kref *ref)
+{
+	kfree(container_of(ref, struct irq_glue, notify.kref));
+}
+
+/**
+ * irq_cpu_rmap_add - add an IRQ to a CPU affinity reverse-map
+ * @rmap: The reverse-map
+ * @irq: The IRQ number
+ *
+ * This adds an IRQ affinity notifier that will update the reverse-map
+ * automatically.
+ *
+ * Must be called in process context, after the IRQ is allocated but
+ * before it is bound with request_irq().
+ *
+ * Returns 0 on success, -ENOMEM if the glue structure cannot be
+ * allocated, or the error returned by irq_set_affinity_notifier().  On
+ * failure the reverse-map is left as it was before the call.
+ */
+int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq)
+{
+	struct irq_glue *glue = kzalloc(sizeof(*glue), GFP_KERNEL);
+	int rc;
+
+	if (!glue)
+		return -ENOMEM;
+	glue->notify.notify = irq_cpu_rmap_notify;
+	glue->notify.release = irq_cpu_rmap_release;
+	glue->rmap = rmap;
+	glue->index = cpu_rmap_add(rmap, glue);
+	rc = irq_set_affinity_notifier(irq, &glue->notify);
+	if (rc) {
+		/* The notifier was not registered, so nothing else refers
+		 * to the glue.  Take it back out of the rmap before
+		 * freeing it; otherwise free_irq_cpu_rmap() would later
+		 * dereference the stale rmap->obj[] entry.  The glue is
+		 * the most recently added object, so dropping the last
+		 * slot undoes cpu_rmap_add().
+		 */
+		rmap->obj[glue->index] = NULL;
+		rmap->used--;
+		kfree(glue);
+	}
+	return rc;
+}
+EXPORT_SYMBOL(irq_cpu_rmap_add);
+
+#endif /* CONFIG_GENERIC_HARDIRQS */
-- 
1.7.3.4



-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* [PATCH net-next-2.6 1/5] genirq: Add IRQ affinity notifiers
From: Ben Hutchings @ 2011-01-19 21:01 UTC (permalink / raw)
  To: Thomas Gleixner; +Cc: netdev, linux-net-drivers, Tom Herbert, David Miller
In-Reply-To: <1295470787.11126.82.camel@bwh-desktop>

When initiating I/O on a multiqueue and multi-IRQ device, we may want
to select a queue for which the response will be handled on the same
or a nearby CPU.  This requires a reverse-map of IRQ affinity.  Add a
notification mechanism to support this.

This is based closely on work by Thomas Gleixner <tglx@linutronix.de>.

Signed-off-by: Ben Hutchings <bhutchings@solarflare.com>
---
Thomas, I hope this answers all your comments.  If you are happy with
this, can it go into net-next-2.6 so the rest of this series can get
into 2.6.39?

Ben.

 include/linux/interrupt.h |   31 +++++++++++++++++
 include/linux/irqdesc.h   |    3 ++
 kernel/irq/manage.c       |   82 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 116 insertions(+), 0 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 55e0d42..9823e37 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -14,6 +14,8 @@
 #include <linux/smp.h>
 #include <linux/percpu.h>
 #include <linux/hrtimer.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
 
 #include <asm/atomic.h>
 #include <asm/ptrace.h>
@@ -240,6 +242,35 @@ extern int irq_can_set_affinity(unsigned int irq);
 extern int irq_select_affinity(unsigned int irq);
 
 extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
+
+/**
+ * struct irq_affinity_notify - context for notification of IRQ affinity changes
+ * @irq:		Interrupt to which notification applies
+ * @kref:		Reference count, for internal use
+ * @work:		Work item, for internal use
+ * @notify:		Function to be called on change.  This will be
+ *			called in process context.
+ * @release:		Function to be called on release.  This will be
+ *			called in process context.  Once registered, the
+ *			structure must only be freed when this function is
+ *			called or later.
+ *
+ * The caller sets @notify and @release; irq_set_affinity_notifier()
+ * initialises @irq, @kref and @work.
+ */
+struct irq_affinity_notify {
+	unsigned int irq;
+	struct kref kref;
+	struct work_struct work;
+	void (*notify)(struct irq_affinity_notify *, const cpumask_t *mask);
+	void (*release)(struct kref *ref);
+};
+
+extern int
+irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
+
+/* Wait for affinity notifications that have already been scheduled, by
+ * flushing the global workqueue with flush_scheduled_work(). */
+static inline void irq_run_affinity_notifiers(void)
+{
+	flush_scheduled_work();
+}
+
 #else /* CONFIG_SMP */
 
 static inline int irq_set_affinity(unsigned int irq, const struct cpumask *m)
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
index 6a64c6f..b98ffa4 100644
--- a/include/linux/irqdesc.h
+++ b/include/linux/irqdesc.h
@@ -8,6 +8,7 @@
  * For now it's included from <linux/irq.h>
  */
 
+struct irq_affinity_notify;
 struct proc_dir_entry;
 struct timer_rand_state;
 /**
@@ -24,6 +25,7 @@ struct timer_rand_state;
  * @last_unhandled:	aging timer for unhandled count
  * @irqs_unhandled:	stats field for spurious unhandled interrupts
  * @lock:		locking for SMP
+ * @affinity_notify:	context for notification of affinity changes
  * @pending_mask:	pending rebalanced interrupts
  * @threads_active:	number of irqaction threads currently running
  * @wait_for_threads:	wait queue for sync_irq to wait for threaded handlers
@@ -70,6 +72,7 @@ struct irq_desc {
 	raw_spinlock_t		lock;
 #ifdef CONFIG_SMP
 	const struct cpumask	*affinity_hint;
+	struct irq_affinity_notify *affinity_notify;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_var_t		pending_mask;
 #endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0caa59f..e48019e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -134,6 +134,10 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 		irq_set_thread_affinity(desc);
 	}
 #endif
+	if (desc->affinity_notify) {
+		kref_get(&desc->affinity_notify->kref);
+		schedule_work(&desc->affinity_notify->work);
+	}
 	desc->status |= IRQ_AFFINITY_SET;
 	raw_spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
@@ -155,6 +159,79 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
 }
 EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
 
+/*
+ * Work function for an affinity notifier.  Copies the IRQ's pending or
+ * current affinity mask under the descriptor lock and passes the copy to
+ * the notifier's callback.  The callback is skipped if the descriptor
+ * is gone or the copy cannot be allocated.
+ */
+static void irq_affinity_notify(struct work_struct *work)
+{
+	struct irq_affinity_notify *notify =
+		container_of(work, struct irq_affinity_notify, work);
+	struct irq_desc *desc = irq_to_desc(notify->irq);
+	cpumask_var_t snapshot;
+	unsigned long flags;
+
+	if (desc && alloc_cpumask_var(&snapshot, GFP_KERNEL)) {
+		/* Copy under the lock; the callback itself runs unlocked */
+		raw_spin_lock_irqsave(&desc->lock, flags);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+		if (desc->status & IRQ_MOVE_PENDING)
+			cpumask_copy(snapshot, desc->pending_mask);
+		else
+#endif
+			cpumask_copy(snapshot, desc->affinity);
+		raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+		notify->notify(notify, snapshot);
+
+		free_cpumask_var(snapshot);
+	}
+
+	/* Drop the reference taken when this work item was scheduled */
+	kref_put(&notify->kref, notify->release);
+}
+
+/**
+ *	irq_set_affinity_notifier - control notification of IRQ affinity changes
+ *	@irq:		Interrupt for which to enable/disable notification
+ *	@notify:	Context for notification, or %NULL to disable
+ *			notification.  Function pointers must be initialised;
+ *			the other fields will be initialised by this function.
+ *
+ *	Must be called in process context.  Notification may only be enabled
+ *	after the IRQ is allocated and must be disabled before the IRQ is
+ *	freed using free_irq().
+ *
+ *	Returns 0 on success, or -EINVAL if @irq has no descriptor.
+ */
+int
+irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_affinity_notify *prev;
+	unsigned long flags;
+
+	/* The release function is promised process context */
+	might_sleep();
+
+	if (!desc)
+		return -EINVAL;
+
+	/* Fill in the fields the caller is not expected to set */
+	if (notify) {
+		INIT_WORK(&notify->work, irq_affinity_notify);
+		kref_init(&notify->kref);
+		notify->irq = irq;
+	}
+
+	/* Swap the notifier in under the descriptor lock */
+	raw_spin_lock_irqsave(&desc->lock, flags);
+	prev = desc->affinity_notify;
+	desc->affinity_notify = notify;
+	raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+	/* Drop the descriptor's reference on the notifier it replaced */
+	if (prev)
+		kref_put(&prev->kref, prev->release);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
+
 #ifndef CONFIG_AUTO_IRQ_AFFINITY
 /*
  * Generic version of the affinity autoselector.
@@ -1004,6 +1081,11 @@ void free_irq(unsigned int irq, void *dev_id)
 	if (!desc)
 		return;
 
+#ifdef CONFIG_SMP
+	if (WARN_ON(desc->affinity_notify))
+		desc->affinity_notify = NULL;
+#endif
+
 	chip_bus_lock(desc);
 	kfree(__free_irq(irq, dev_id));
 	chip_bus_sync_unlock(desc);
-- 
1.7.3.4



-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply related

* [PATCH net-next-2.6 0/5] RFS hardware acceleration (v3)
From: Ben Hutchings @ 2011-01-19 20:59 UTC (permalink / raw)
  To: David Miller, Thomas Gleixner, Tom Herbert
  Cc: netdev, linux-net-drivers, linux-kernel

This patch series extends RFS to use hardware RX filters where
available.  Depending on the number of hardware RX queues and their
IRQs' affinity, this should reduce the need for IPIs or at least get
packets delivered to the right NUMA node.

The first patch implements IRQ affinity notifiers, based on the outline
that Thomas Gleixner wrote in response to the previous version of this
patch series.  This has been updated based on Thomas's previous comments.

The second patch is a generalisation of the CPU affinity reverse-
mapping, plus functions to maintain such a mapping based on the new IRQ
affinity notifiers.  This has been updated based on Eric Dumazet's
comments.

The remaining patches add the RFS acceleration hooks and an
implementation in the sfc driver.  I have changed the sfc driver's
strategy for reclaiming entries in the filter table from the
previous version.  The table can now be scanned at the end of each NAPI
polling interval, based on the rate at which filters are being added.
However, I haven't yet constructed a good test case that involves
turnover of flows so I have yet to settle on a good strategy for this.

Ben.

Ben Hutchings (5):
  genirq: Add IRQ affinity notifiers
  lib: cpu_rmap: CPU affinity reverse-mapping
  net: RPS: Enable hardware acceleration of RFS
  sfc: Limit filter search depth further for performance hints (i.e.
    RFS)
  sfc: Implement hardware acceleration of RFS

 drivers/net/sfc/efx.c        |   49 ++++++++-
 drivers/net/sfc/efx.h        |   16 +++
 drivers/net/sfc/filter.c     |  107 ++++++++++++++++-
 drivers/net/sfc/net_driver.h |    3 +
 include/linux/cpu_rmap.h     |   73 ++++++++++++
 include/linux/interrupt.h    |   31 +++++
 include/linux/irqdesc.h      |    3 +
 include/linux/netdevice.h    |   33 +++++-
 kernel/irq/manage.c          |   82 +++++++++++++
 lib/Kconfig                  |    4 +
 lib/Makefile                 |    2 +
 lib/cpu_rmap.c               |  269 ++++++++++++++++++++++++++++++++++++++++++
 net/Kconfig                  |    6 +
 net/core/dev.c               |   97 ++++++++++++++-
 14 files changed, 760 insertions(+), 15 deletions(-)
 create mode 100644 include/linux/cpu_rmap.h
 create mode 100644 lib/cpu_rmap.c

-- 
1.7.3.4


-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [PATCH] scm: provide full privilege set via SCM_PRIVILEGE
From: Casey Schaufler @ 2011-01-19 20:40 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: LKLM, xemul, David Miller, Sakkinen Jarkko.2 (EXT-Tieto/Tampere),
	Janne Karhunen, Reshetova Elena (Nokia-D/Helsinki), netdev,
	Serge E. Hallyn, Casey Schaufler
In-Reply-To: <m1lj2g7tph.fsf@fess.ebiederm.org>

On 1/19/2011 8:18 AM, Eric W. Biederman wrote:
> Casey Schaufler <casey@schaufler-ca.com> writes:
>
>> On 1/18/2011 9:45 PM, Eric W. Biederman wrote:


Quite a bit of work to do here, it would appear.

>>> Casey Schaufler <casey@schaufler-ca.com> writes:
>>>
>>>> Subject: [PATCH] scm: provide full privilege set via SCM_PRIVILEGE
>>>>
>>>> The SCM mechanism currently provides interfaces for delivering
>>>> the uid/gid and the "security context" (LSM information) of the
>>>> peer on a UDS socket. All of the security credential information
>>>> is available, but there is no interface available to obtain it.
>>>> Further, the existing interfaces require that the user choose
>>>> between the uid/gid and the context as the existing interfaces
>>>> are exclusive.
>>>>
>>>> This patch introduces an additional interface that provides
>>>> a complete set of security information from the peer credential.
>>>> No additional work is required to provide the information
>>>> internally, it is all being passed, just not exposed.
>>> In ascii text?
>> As is commonly done in /proc interfaces.
>>
>>> A bitmap in hex?
>> As is done in /proc/<pid>/status. I seriously doubt
>> that anyone would want the kernel doing the capability
>> set to text conversion.
> But when you have a perfectly good binary interface, why reduce the
> encoding efficiency for effectively no gain?

Problem is that with a group list, capability sets, and LSM "context"
I don't have a perfectly good binary interface.

>>> Maybe it is just me, but this seems harder to deal with than
>>> if the data had been transferred in binary.
>> There are a couple of issues with passing a binary structure
>> in the modern cred case. First is the capability set, which
>> has been proven to grow over time. Sure, it took a while to
>> get past 32 bits, and hopefully will never go beyond 64, but
>> given the long term problems caused by 16 bit uids (some of
>> us still remember) I would hate to get bitten by this in my
>> old age. Second is the LSM specific security context, which
>> may not be there at all and if it is the size will depend on
>> the LSM in use.
> Sure but you have to use an interface that properly handles variable
> length binary data, to get to the string.  It feels like you are
> violating the even more classic one value per file rule.

This way leaves all parsing to user space, and uses well understood
facilities to create something that isn't going to require internal
counters and the like.

> Maybe I am missing something but is there any reason you can't have
> multiple cmsg types?

Sure, there could be a cmsg for each value, but if you know that
you're going to always use them all, why make them separate?

>> There are classic C language techniques for dealing with
>> both of these issues, and I've used them enough times to
>> want to avoid them where possible. This is the same logic
>> that the aforementioned /proc interface implementers have
>> been using for some time. And while there are problems
>> with formatting, passing and parsing a string they pale
>> in comparison to maintaining multiple versions of kernel
>> interface structures that are themselves variable depending
>> on the kernel configuration.
> If you are worrying about variable size structures that vary
> depending on kernel configuration I'm pretty certain you are doing
> it wrong.

I'm seriously more concerned with the maintainability of cap sets.
I saw the pain level when they went to 2x32bit and any structure
that includes a cap set, especially a user visible interface, is
going to be an issue. Not to mention that there's a group list in
there as well.

> The use of sprintf (not snprintf) and the crazy size computation needed
> for your string also worries me.  That part of the implementation
> appears to be just asking for trouble.

snprintf would be a simple change. The computation is just arithmetic.

> Having a giant function like your scm_passpriv inline in
> include/net/scm.h also seems very questionable.
>
> I think there is a real impedance mismatch here between the interface
> you are using and the way you are returning the data.
>
> There is a show stopper bug.  You don't translate uid/gid in the
> receivers user namespace so passing a message between two processes in
> different user namespaces can pass deceptive credentials. Given that the
> reason we have the struct cred on unix domain sockets in the first place
> is that we handled the user namespace conversion issues so we could
> cross namespaces without security issues not handling that case in a new
> scm cred is just inexcusable.

The issues with uid and gid translation appear to be easy to fix,
assuming that the cred_to_ucred() function provides a reasonable
example.

> On that note Serge and I are slowly working to get credentials to be
> namespace local as well, so shortly the credentials will also need to be
> translated as well as just uid's and gid's.

Oh dear. Does that include capability sets and LSM blobs?

> The scm->secid is if I understand correctly a per packet label.  Is that
> what you want here?  I thought you were interested in the information
> off of struct cred.

The intent is to pull in the access control information so
that the (trusted) application can make decisions based on
its own set of criteria. The kernel is making access checks
(in SELinux and Smack at any rate) based on the packet label,
but it might make sense to provide both.

> In principle returning all of the credential should be fine.  In
> practice I think this patch has a lot of poorly thought through details
> that will be a nightmare to maintain in practice.
>
> Eric
>
>
>>>> Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
>>>> ---
>>>>
>>>>  include/asm-generic/socket.h |    1 +
>>>>  include/linux/net.h          |    1 +
>>>>  include/linux/socket.h       |    1 +
>>>>  include/net/scm.h            |   80 +++++++++++++++++++++++++++++++++++++++++-
>>>>  net/core/sock.c              |   11 ++++++
>>>>  5 files changed, 93 insertions(+), 1 deletions(-)
>>>> diff --git a/include/asm-generic/socket.h b/include/asm-generic/socket.h
>>>> index 9a6115e..7aa8e84 100644
>>>> --- a/include/asm-generic/socket.h
>>>> +++ b/include/asm-generic/socket.h
>>>> @@ -64,4 +64,5 @@
>>>>  #define SO_DOMAIN		39
>>>>  
>>>>  #define SO_RXQ_OVFL             40
>>>> +#define SO_PASSPRIV		41
>>>>  #endif /* __ASM_GENERIC_SOCKET_H */
>>>> diff --git a/include/linux/net.h b/include/linux/net.h
>>>> index 16faa13..159a929 100644
>>>> --- a/include/linux/net.h
>>>> +++ b/include/linux/net.h
>>>> @@ -71,6 +71,7 @@ struct net;
>>>>  #define SOCK_NOSPACE		2
>>>>  #define SOCK_PASSCRED		3
>>>>  #define SOCK_PASSSEC		4
>>>> +#define SOCK_PASSPRIV		5
>>>>  
>>>>  #ifndef ARCH_HAS_SOCKET_TYPES
>>>>  /**
>>>> diff --git a/include/linux/socket.h b/include/linux/socket.h
>>>> index 86b652f..e9cfd68 100644
>>>> --- a/include/linux/socket.h
>>>> +++ b/include/linux/socket.h
>>>> @@ -147,6 +147,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr
>>>>  #define	SCM_RIGHTS	0x01		/* rw: access rights (array of int) */
>>>>  #define SCM_CREDENTIALS 0x02		/* rw: struct ucred		*/
>>>>  #define SCM_SECURITY	0x03		/* rw: security label		*/
>>>> +#define SCM_PRIVILEGES  0x04		/* rw: privilege set		*/
>>>>  
>>>>  struct ucred {
>>>>  	__u32	pid;
>>>> diff --git a/include/net/scm.h b/include/net/scm.h
>>>> index 3165650..4b8db21 100644
>>>> --- a/include/net/scm.h
>>>> +++ b/include/net/scm.h
>>>> @@ -101,6 +101,83 @@ static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct sc
>>>>  { }
>>>>  #endif /* CONFIG_SECURITY_NETWORK */
>>>>  
>>>> +static __inline__ void scm_passpriv(struct socket *sock, struct msghdr *msg,
>>>> +				struct scm_cookie *scm)
>>>> +{
>>>> +	const struct cred *credp = scm->cred;
>>>> +	const struct group_info *gip;
>>>> +	char *result;
>>>> +	char *cp;
>>>> +	int i;
>>>> +#ifdef CONFIG_SECURITY_NETWORK
>>>> +	char *secdata;
>>>> +	u32 seclen;
>>>> +	int err;
>>>> +#endif /* CONFIG_SECURITY_NETWORK */
>>>> +
>>>> +	if (!test_bit(SOCK_PASSPRIV, &sock->flags))
>>>> +		return;
>>>> +
>>>> +	gip = credp->group_info;
>>>> +
>>>> +	/*
>>>> +	 * uid + euid + gid + egid + group-list + capabilities
>>>> +	 *     + "uid=" + "euid=" + "gid=" + "egid=" + "grps="
>>>> +	 *     + "cap-e=" + "cap-p=" + "cap-i="
>>>> +	 * 10  + 10   + 10  + 10   + (ngrps * 10) + ecap + pcap + icap
>>>> +	 *     + 4 + 5 + 4 + 5 + 5 + 6 + 6 + 6
>>>> +	 */
>>>> +	i = ((4 + gip->ngroups) * 11) + (3 * (_KERNEL_CAPABILITY_U32S * 8 + 1))
>>>> +		+ 41;
>>>> +
>>>> +#ifdef CONFIG_SECURITY_NETWORK
>>>> +	err = security_secid_to_secctx(scm->secid, &secdata, &seclen);
>>>> +	if (!err)
>>>> +		/*
>>>> +		 * " context="
>>>> +		 */
>>>> +		i += seclen + 10;
>>>> +#endif /* CONFIG_SECURITY_NETWORK */
>>>> +
>>>> +	result = kzalloc(i, GFP_KERNEL);
>>>> +	if (result == NULL)
>>>> +		return;
>>>> +
>>>> +	cp = result + sprintf(result, "euid=%d uid=%d egid=%d gid=%d",
>>>> +				credp->euid, credp->uid,
>>>> +				credp->egid, credp->gid);
>>>> +
>>>> +	if (gip != NULL && gip->ngroups > 0) {
>>>> +		cp += sprintf(cp, " grps=%d", GROUP_AT(gip, 0));
>>>> +		for (i = 1 ; i < gip->ngroups; i++)
>>>> +			cp += sprintf(cp, ",%d", GROUP_AT(gip, i));
>>>> +	}
>>>> +
>>>> +	cp += sprintf(cp, " cap-e=");
>>>> +	CAP_FOR_EACH_U32(i)
>>>> +		cp += sprintf(cp, "%08x", credp->cap_effective.cap[i]);
>>>> +	cp += sprintf(cp, " cap-p=");
>>>> +	CAP_FOR_EACH_U32(i)
>>>> +		cp += sprintf(cp, "%08x", credp->cap_permitted.cap[i]);
>>>> +	cp += sprintf(cp, " cap-i=");
>>>> +	CAP_FOR_EACH_U32(i)
>>>> +		cp += sprintf(cp, "%08x", credp->cap_inheritable.cap[i]);
>>>> +
>>>> +#ifdef CONFIG_SECURITY_NETWORK
>>>> +	cp += sprintf(cp, " context=");
>>>> +	strncpy(cp, secdata, seclen);
>>>> +	cp += seclen;
>>>> +	*cp = '\0';
>>>> +
>>>> +	security_release_secctx(secdata, seclen);
>>>> +#endif /* CONFIG_SECURITY_NETWORK */
>>>> +
>>>> +	put_cmsg(msg, SOL_SOCKET, SCM_PRIVILEGES, strlen(result)+1, result);
>>>> +
>>>> +	kfree(result);
>>>> +}
>>>> +
>>>> +
>>>>  static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
>>>>  				struct scm_cookie *scm, int flags)
>>>>  {
>>>> @@ -114,6 +191,8 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
>>>>  	if (test_bit(SOCK_PASSCRED, &sock->flags))
>>>>  		put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(scm->creds), &scm->creds);
>>>>  
>>>> +	scm_passpriv(sock, msg, scm);
>>>> +
>>>>  	scm_destroy_cred(scm);
>>>>  
>>>>  	scm_passec(sock, msg, scm);
>>>> @@ -124,6 +203,5 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
>>>>  	scm_detach_fds(msg, scm);
>>>>  }
>>>>  
>>>> -
>>>>  #endif /* __LINUX_NET_SCM_H */
>>>>  
>>>> diff --git a/net/core/sock.c b/net/core/sock.c
>>>> index fb60801..f134126 100644
>>>> --- a/net/core/sock.c
>>>> +++ b/net/core/sock.c
>>>> @@ -725,6 +725,13 @@ set_rcvbuf:
>>>>  		else
>>>>  			clear_bit(SOCK_PASSSEC, &sock->flags);
>>>>  		break;
>>>> +
>>>> +	case SO_PASSPRIV:
>>>> +		if (valbool)
>>>> +			set_bit(SOCK_PASSPRIV, &sock->flags);
>>>> +		else
>>>> +			clear_bit(SOCK_PASSPRIV, &sock->flags);
>>>> +		break;
>>>>  	case SO_MARK:
>>>>  		if (!capable(CAP_NET_ADMIN))
>>>>  			ret = -EPERM;
>>>> @@ -950,6 +957,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>>>>  		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
>>>>  		break;
>>>>  
>>>> +	case SO_PASSPRIV:
>>>> +		v.val = test_bit(SOCK_PASSPRIV, &sock->flags) ? 1 : 0;
>>>> +		break;
>>>> +
>>>>  	case SO_PEERSEC:
>>>>  		return security_socket_getpeersec_stream(sock, optval, optlen, len);
>>>>  
>

^ permalink raw reply

* Re: Bonding on bond
From: Nicolas de Pesloüan @ 2011-01-19 20:33 UTC (permalink / raw)
  To: Jiri Bohac
  Cc: Jay Vosburgh, bonding-devel@lists.sourceforge.net,
	netdev@vger.kernel.org

Le 19/01/2011 16:49, Jiri Bohac a écrit :
 > On Tue, Jan 18, 2011 at 09:07:20AM +0100, Nicolas de Pesloüan wrote:
 >> Stacking bonds is not supported. Currently, no setup is known to
 >> require stacking bonds.

 > I agree. This question and weird bugreports from people trying
 > this come up over and over. How about this patch?

Why not. Adding this to the documentation should also help.

 > bonding: prohibit enslaving of bonding masters
 >
 > Nested bonding is not supported and will result in strange problems, e.g.:
 > - netif_receive_skb() will not properly change skb->dev to point to the
 >   upper-most bonding master
 > - arp monitor will not work (dev->last_rx is only updated by hardware drivers)
 > - accidentally enslaving a bonding master to itself will cause an infinite
 >   recursion in the TX path
 >
 > This patch prevents this by prohibiting a bonding master from being further enslaved.
 >
 > Signed-off-by: Jiri Bohac <jbohac@suse.cz>
 >
 > diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
 > index b1025b8..d4d5f42 100644
 > --- a/drivers/net/bonding/bond_main.c
 > +++ b/drivers/net/bonding/bond_main.c
 > @@ -1448,8 +1448,8 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
 >  	}
 >
 >  	/* already enslaved */
 > -	if (slave_dev->flags & IFF_SLAVE) {
 > -		pr_debug("Error, Device was already enslaved\n");
 > +	if (slave_dev->priv_flags & IFF_BONDING) {
 > +		pr_debug("Error, Device already enslaved or a bonding master\n");

Even if it is possible to test for slave and for master with a single condition (IFF_BONDING), I 
suggest to split the tests and the error messages, to give end user the best possible diagnostic.

If the device is already a master, let's say it.
If the device is already enslaved, let's continue to say it. It might even be better to give the 
name of the other master that already owns this slave.

 >  		return -EBUSY;
 >  	}
 >
 >
 >
 > --
 > Jiri Bohac <jbohac@suse.cz>
 > SUSE Labs, SUSE CZ

^ permalink raw reply

* Re: [Xen-devel] Re: [PATCH] xen network backend driver
From: Ben Hutchings @ 2011-01-19 20:16 UTC (permalink / raw)
  To: Pasi Kärkkäinen
  Cc: Jeremy Fitzhardinge, Ian Campbell, netdev@vger.kernel.org,
	xen-devel, Konrad Rzeszutek Wilk
In-Reply-To: <20110119195839.GO2754@reaktio.net>

On Wed, 2011-01-19 at 21:58 +0200, Pasi Kärkkäinen wrote:
> On Wed, Jan 19, 2011 at 07:48:19PM +0000, Ben Hutchings wrote:
[...]
> > It was possible to use multiple receive queues per device long before
> > this since the networking core is not involved in locking them.  (Though
> > it did require some hacks to create multiple NAPI contexts, before
> > 2.6.24.)  This is mostly useful in conjunction with separate IRQs
> > per RX queue, spread across multiple CPUs (sometimes referred to as
> > Receive Side Scaling or RSS).
> > 
> 
> Ok. I should read changelogs more closely.. I thought both the receive/transmit 
> multiqueue features appeared 'recently', but it seems I was wrong.

There was a relatively recent change that allows drivers to record which
receive queue each packet came in on.  This is used by RPS/RFS, and for
transmit queue selection when bridging/forwarding.  But multiple receive
queues were still usable before that.

> I think Linux 2.6.32 added multiqueue VLAN support..

VLAN devices and several other types of software device don't have
queues of their own, but they can now use all transmit queues of the
underlying device rather than just one.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH] bonding: added 802.3ad round-robin hashing policy for single TCP session balancing
From: Nicolas de Pesloüan @ 2011-01-19 20:12 UTC (permalink / raw)
  To: Oleg V. Ukhno
  Cc: Jay Vosburgh, John Fastabend, David S. Miller,
	netdev@vger.kernel.org, Sébastien Barré,
	Christophe Paasch
In-Reply-To: <4D370DC7.6000500@yandex-team.ru>

Le 19/01/2011 17:13, Oleg V. Ukhno a écrit :
> On 01/18/2011 11:24 PM, Jay Vosburgh wrote:
[snip]
>> I haven't done much testing with this lately, but I suspect this
>> behavior hasn't really changed. Raising the tcp_reordering sysctl value
>> can mitigate this somewhat (by making TCP more tolerant of this), but
>> that doesn't help non-TCP protocols.
>>
>> Barring evidence to the contrary, I presume that Oleg's system
>> delivers out of order at the receiver. That's not automatically a
>> reason to reject it, but this entire proposal is sufficiently complex to
>> configure that very explicit documentation will be necessary.
>>
>> -J
>>
>> ---
>> -Jay Vosburgh, IBM Linux Technology Center, fubar@us.ibm.com
>>
>
> Jay,
[snip]
>
> What is your opinion on my idea with patch?
>
> I will come back with results for VLAN tunneling case, if this is
> necessary (Nicolas, shall I do that test - I think it will show similar
> results for performance?)

If you have time for that, then yes, please, do the same test using balance-rr+vlan to segregate 
path. With those results, we would have the opportunity to enhance the documentation with some well 
tested cases of TCP load balancing on a LAN, not limited to 802.3ad automatic setup. Both setups 
make sense, and assuming the results would be similar is probably true, but not reliable enough to 
assert it into the documentation.

Thanks,

	Nicolas.

^ permalink raw reply

* Re: inbound connection problems when "netlink: test for all flags of the NLM_F_DUMP composite" commit applied
From: Jarek Poplawski @ 2011-01-19 20:12 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: jamal, Pablo Neira Ayuso, David Miller, arthur.marsh,
	eric.dumazet, netdev
In-Reply-To: <alpine.LNX.2.01.1101192044270.19837@obet.zrqbmnf.qr>

On Wed, Jan 19, 2011 at 08:47:32PM +0100, Jan Engelhardt wrote:
> 
> On Wednesday 2011-01-19 20:24, Jarek Poplawski wrote:
> >> On Wednesday 2011-01-19 17:54, Jarek Poplawski wrote:
> >> 
> >> It looks like the authors' intentions were to make NLM_F_MATCH not
> >> stop after a single entry has been found. So that sounds like dump,
> >> ok.
> >> 
> >> But NLM_F_ROOT does not quite strike me as a dump request. What if I
> >> wanted just a single item returned but still start at the root?
> >
> >Hmm... Does it say about starting at the root?:
> >
> >"          NLM_F_ROOT     Return the complete table instead of a
> >                          single entry."
> 
> I was referring to netlink.h which paraphrased that, perhaps
> too short:
> 
> #define NLM_F_ROOT      0x100   /* specify tree root    */
> 
> But the RFC description makes for a better wording: if NLM_F_ROOT is
> supposed to return "the complete table", how is it different from
> NLM_F_MATCH with a wildcard criteria?
> 
> |          NLM_F_MATCH    Return all entries matching criteria passed in
> |                         message content.

As I said, I'd prefer not to pretend I understand it, but, knowing
names of people around this, I'm also quite sure there was a purpose.
On the other hand, I'm not sure the names of flags and descriptions
weren't mixed while making it general for different subsystems.

BTW, don't we have in ip/tc many examples of duplicate options?

Jarek P.

^ permalink raw reply

* Re: [RFC] ipv6: don't flush routes when setting loopback down
From: Stephen Hemminger @ 2011-01-19 20:01 UTC (permalink / raw)
  To: Jiri Bohac
  Cc: ebiederm, yoshfuji, netdev, stable, maheshkelkar, brian.haley,
	David Miller, lorenzo
In-Reply-To: <20110119195632.GA27574@midget.suse.cz>

On Wed, 19 Jan 2011 20:56:32 +0100
Jiri Bohac <jbohac@suse.cz> wrote:

> On Wed, Jan 19, 2011 at 11:38:17AM -0800, Stephen Hemminger wrote:
> > Jiri Bohac <jbohac@suse.cz> wrote:
> > > I have the feeling that Eric's patch is the safest solution we
> > > have so far:
> > Eric's patch has other regressions, see the discussion.
> 
> What regression do you mean? I have read the whole discussion
> thoroughly. You only say in one message that deleting ::1 would
> propagate to routing daemons. And Eric correctly stated that
> people couldn't hit this, because  deleting ::1 would break
> things on its own.
> 
> Is there a real problem with Eric's fix?
> 
> Thanks,
> 

If address is assigned to loopback interface (other than ::1) then
Eric's fix doesn't work.  It is common to use an additional address
on the lo device when doing routing protocols.

^ permalink raw reply

* Re: [Xen-devel] Re: [PATCH] xen network backend driver
From: Pasi Kärkkäinen @ 2011-01-19 19:58 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Jeremy Fitzhardinge, Ian Campbell, netdev@vger.kernel.org,
	xen-devel, Konrad Rzeszutek Wilk
In-Reply-To: <1295466499.11126.67.camel@bwh-desktop>

On Wed, Jan 19, 2011 at 07:48:19PM +0000, Ben Hutchings wrote:
> On Wed, 2011-01-19 at 21:28 +0200, Pasi Kärkkäinen wrote:
> > On Wed, Jan 19, 2011 at 11:16:59AM -0800, Jeremy Fitzhardinge wrote:
> > > On 01/19/2011 10:05 AM, Ben Hutchings wrote:
> > > > Not in itself.  NAPI polling will run on the same CPU which scheduled it
> > > > (so wherever the IRQ was initially handled).  If the protocol used
> > > > between netfront and netback doesn't support RSS then RPS
> > > > <http://lwn.net/Articles/362339/> can be used to spread the RX work
> > > > across CPUs.
> > > 
> > > There's only one irq per netback which is bound to one (V)CPU at a
> > > time.  I guess we could extend it to have multiple irqs per netback and
> > > some way of distributing packet flows over them, but that would only
> > > really make sense if there's a single interface with much more traffic
> > > than the others; otherwise the interrupts should be fairly well
> > > distributed (assuming that the different netback irqs are routed to
> > > different cpus).
> > > 
> > 
> > Does "multiqueue" only work for NIC drivers (and frontend drivers),
> > or could it be used also for netback?
> 
> Netfront and netback would have to agree on how many queues to use in
> each direction.
> 

Yep.

> > (afaik Linux multiqueue enables setting up multiple receive queues
> > each having a separate irq.)
> 
> In the context of Linux networking, 'multiqueue' generally refers to use
> of multiple *transmit* queues.  The networking core handles scheduling
> and locking of each transmit queue, so it had to be extended to support
> multiple queues - initially done in 2.6.23, then made scalable in
> 2.6.27.
> 

Thanks for clearing that up.

> It was possible to use multiple receive queues per device long before
> this since the networking core is not involved in locking them.  (Though
> it did require some hacks to create multiple NAPI contexts, before
> 2.6.24.)  This is mostly useful in conjunction with separate IRQs
> per RX queue, spread across multiple CPUs (sometimes referred to as
> Receive Side Scaling or RSS).
> 

Ok. I should read changelogs more closely.. I thought both the receive/transmit 
multiqueue features appeared 'recently', but it seems I was wrong.

I think Linux 2.6.32 added multiqueue VLAN support..

-- Pasi


^ permalink raw reply

* Re: [RFC] ipv6: don't flush routes when setting loopback down
From: Jiri Bohac @ 2011-01-19 19:56 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Jiri Bohac, yoshfuji, netdev, stable, maheshkelkar, brian.haley,
	lorenzo, David Miller, ebiederm
In-Reply-To: <20110119113817.0819ddf1@s6510>

On Wed, Jan 19, 2011 at 11:38:17AM -0800, Stephen Hemminger wrote:
> Jiri Bohac <jbohac@suse.cz> wrote:
> > I have the feeling that Eric's patch is the safest solution we
> > have so far:
> Eric's patch has other regressions, see the discussion.

What regression do you mean? I have read the whole discussion
thoroughly. You only say in one message that deleting ::1 would
propagate to routing daemons. And Eric correctly stated that
people couldn't hit this, because  deleting ::1 would break
things on its own.

Is there a real problem with Eric's fix?

Thanks,

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ

^ permalink raw reply

* Re: [Xen-devel] Re: [PATCH] xen network backend driver
From: Ben Hutchings @ 2011-01-19 19:48 UTC (permalink / raw)
  To: Pasi Kärkkäinen
  Cc: Jeremy Fitzhardinge, Ian Campbell, netdev@vger.kernel.org,
	xen-devel, Konrad Rzeszutek Wilk
In-Reply-To: <20110119192823.GN2754@reaktio.net>

On Wed, 2011-01-19 at 21:28 +0200, Pasi Kärkkäinen wrote:
> On Wed, Jan 19, 2011 at 11:16:59AM -0800, Jeremy Fitzhardinge wrote:
> > On 01/19/2011 10:05 AM, Ben Hutchings wrote:
> > > Not in itself.  NAPI polling will run on the same CPU which scheduled it
> > > (so wherever the IRQ was initially handled).  If the protocol used
> > > between netfront and netback doesn't support RSS then RPS
> > > <http://lwn.net/Articles/362339/> can be used to spread the RX work
> > > across CPUs.
> > 
> > There's only one irq per netback which is bound to one (V)CPU at a
> > time.  I guess we could extend it to have multiple irqs per netback and
> > some way of distributing packet flows over them, but that would only
> > really make sense if there's a single interface with much more traffic
> > than the others; otherwise the interrupts should be fairly well
> > distributed (assuming that the different netback irqs are routed to
> > different cpus).
> > 
> 
> Does "multiqueue" only work for NIC drivers (and frontend drivers),
> or could it be used also for netback?

Netfront and netback would have to agree on how many queues to use in
each direction.

> (afaik Linux multiqueue enables setting up multiple receive queues
> each having a separate irq.)

In the context of Linux networking, 'multiqueue' generally refers to use
of multiple *transmit* queues.  The networking core handles scheduling
and locking of each transmit queue, so it had to be extended to support
multiple queues - initially done in 2.6.23, then made scalable in
2.6.27.

It was possible to use multiple receive queues per device long before
this since the networking core is not involved in locking them.  (Though
it did require some hacks to create multiple NAPI contexts, before
2.6.24.)  This is mostly useful in conjunction with separate IRQs
per RX queue, spread across multiple CPUs (sometimes referred to as
Receive Side Scaling or RSS).

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: inbound connection problems when "netlink: test for all flags of the NLM_F_DUMP composite" commit applied
From: Jan Engelhardt @ 2011-01-19 19:47 UTC (permalink / raw)
  To: Jarek Poplawski
  Cc: jamal, Pablo Neira Ayuso, David Miller, arthur.marsh,
	eric.dumazet, netdev
In-Reply-To: <20110119192409.GE1845@del.dom.local>


On Wednesday 2011-01-19 20:24, Jarek Poplawski wrote:
>> On Wednesday 2011-01-19 17:54, Jarek Poplawski wrote:
>> 
>> It looks like the authors' intentions were to make NLM_F_MATCH not
>> stop after a single entry has been found. So that sounds like dump,
>> ok.
>> 
>> But NLM_F_ROOT does not quite strike me as a dump request. What if I
>> wanted just a single item returned but still start at the root?
>
>Hmm... Does it say about starting at the root?:
>
>"          NLM_F_ROOT     Return the complete table instead of a
>                          single entry."

I was referring to netlink.h which paraphrased that, perhaps
too short:

#define NLM_F_ROOT      0x100   /* specify tree root    */

But the RFC description makes for a better wording: if NLM_F_ROOT is
supposed to return "the complete table", how is it different from
NLM_F_MATCH with a wildcard criteria?

|          NLM_F_MATCH    Return all entries matching criteria passed in
|                         message content.

^ permalink raw reply

* Re: [PATCH 18/79] IPVS: Remove useless { } block from ip_vs_process_message()
From: Joe Perches @ 2011-01-19 19:40 UTC (permalink / raw)
  To: kaber; +Cc: davem, netfilter-devel, netdev
In-Reply-To: <1295464519-21763-19-git-send-email-kaber@trash.net>

On Wed, 2011-01-19 at 20:14 +0100, kaber@trash.net wrote:
> From: Simon Horman <horms@verge.net.au>
>  net/netfilter/ipvs/ip_vs_sync.c |   24 +++++++++++-------------
> diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
> @@ -381,20 +381,18 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
>  			}
>  		}
>  
> -		{
> -			if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
> -					      (union nf_inet_addr *)&s->caddr,
> -					      s->cport,
> -					      (union nf_inet_addr *)&s->vaddr,
> -					      s->vport, &param)) {
> -				pr_err("ip_vs_conn_fill_param_sync failed");
> -				return;
> -			}
> -			if (!(flags & IP_VS_CONN_F_TEMPLATE))
> -				cp = ip_vs_conn_in_get(&param);
> -			else
> -				cp = ip_vs_ct_in_get(&param);
> +		if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol,
> +					       (union nf_inet_addr *)&s->caddr,
> +					       s->cport,
> +					       (union nf_inet_addr *)&s->vaddr,
> +					       s->vport, &param)) {
> +			pr_err("ip_vs_conn_fill_param_sync failed");

Both the original and this pr_err are missing a \n before the closing quote.



^ permalink raw reply

* Re: [RFC] ipv6: don't flush routes when setting loopback down
From: Stephen Hemminger @ 2011-01-19 19:38 UTC (permalink / raw)
  To: Jiri Bohac
  Cc: David Miller, ebiederm, brian.haley, netdev, maheshkelkar,
	lorenzo, yoshfuji, stable
In-Reply-To: <20110119191823.GC8442@midget.suse.cz>

On Wed, 19 Jan 2011 20:18:23 +0100
Jiri Bohac <jbohac@suse.cz> wrote:

> Hi,
> 
> 
> The commit (29ba5fed1bbd09c2cba890798c8f9eaab251401d) causes
> another regression:
> 
> Prior to the commit, on a freshly booted system, when I do:
> 	sysctl net.ipv6.conf.all.disable_ipv6=1
> Then any attempt to connect to ::1 will fail immediately with
> "Network is unreachable" (e.g. "ping6 ::1" or "telnet ::1 22").
> 
> After the commit, doing
> 	sysctl net.ipv6.conf.all.disable_ipv6=1
> makes connection attempts to ::1 wait for a long time before they fail.
> 
> This is caused by the local route which is now left configured. 
> "ip -6 r l table all" has an additional line in its output:
> 	local ::1 via :: dev lo  table local  proto none  metric 0  mtu 16436 rtt 40ms rttvar 40ms cwnd 3 advmss 16376 hoplimit 0
> 
> With both ::1 and 127.0.0.1 specified for localhost in
> /etc/hosts, disabling ipv6 now breaks many applications
> connecting to localhost. Deleting the local route manually solves
> the problems.
> 
> Could this be reverted, please?
> 
> I have the feeling that Eric's patch is the safest solution we
> have so far:
> 
> > Finding the real bug is beyond me right now, but fixing the regression
> > in disable_ipv6 is simple.  We can just delete ::1 when we bring down
> > the loopback interface, and it will be restored automatically when we
> > bring the loopback interface back up.
> > 
> > Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> > ---
> > Index: linux-2.6.37-rc5.x86_64/net/ipv6/addrconf.c
> > ===================================================================
> > --- linux-2.6.37-rc5.x86_64.orig/net/ipv6/addrconf.c
> > +++ linux-2.6.37-rc5.x86_64/net/ipv6/addrconf.c
> > @@ -2727,6 +2727,7 @@ static int addrconf_ifdown(struct net_de
> >  		/* If just doing link down, and address is permanent
> >  		   and not link-local, then retain it. */
> >  		if (!how &&
> > +		    !ipv6_addr_loopback(&ifa->addr) &&
> >  		    (ifa->flags&IFA_F_PERMANENT) &&
> >  		    !(ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)) {
> >  			list_move_tail(&ifa->if_list, &keep_list);
> 

Eric's patch has other regressions, see the discussion.

^ permalink raw reply

* Re: [PATCH] xen network backend driver
From: Ian Campbell @ 2011-01-19 19:31 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Tom Herbert, netdev@vger.kernel.org, Jeremy Fitzhardinge,
	xen-devel, Konrad Rzeszutek Wilk
In-Reply-To: <1295465147.11126.57.camel@bwh-desktop>

On Wed, 2011-01-19 at 19:25 +0000, Ben Hutchings wrote: 
> If NAPI doesn't work for netback, with or without RPS, maybe you have to
> use that kthread.  But please don't reinvent the wheel by creating your
> own tasklets or require users to set a special parameter.

Agreed, the tasklets really should have been deprecated in favour of the
kthread long ago anyway and I should definitely have thought to nuke
them as part of the preparation for upstreaming.

Hopefully NAPI can be made to work for this case anyway.

Ian.

^ permalink raw reply

* Re: [Xen-devel] Re: [PATCH] xen network backend driver
From: Pasi Kärkkäinen @ 2011-01-19 19:28 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ben Hutchings, Ian Campbell, netdev@vger.kernel.org, xen-devel,
	Konrad Rzeszutek Wilk
In-Reply-To: <4D3738AB.60701@goop.org>

On Wed, Jan 19, 2011 at 11:16:59AM -0800, Jeremy Fitzhardinge wrote:
> On 01/19/2011 10:05 AM, Ben Hutchings wrote:
> > Not in itself.  NAPI polling will run on the same CPU which scheduled it
> > (so wherever the IRQ was initially handled).  If the protocol used
> > between netfront and netback doesn't support RSS then RPS
> > <http://lwn.net/Articles/362339/> can be used to spread the RX work
> > across CPUs.
> 
> There's only one irq per netback which is bound to one (V)CPU at a
> time.  I guess we could extend it to have multiple irqs per netback and
> some way of distributing packet flows over them, but that would only
> really make sense if there's a single interface with much more traffic
> than the others; otherwise the interrupts should be fairly well
> distributed (assuming that the different netback irqs are routed to
> different cpus).
> 

Does "multiqueue" only work for NIC drivers (and frontend drivers),
or could it be used also for netback?

(afaik Linux multiqueue enables setting up multiple receive queues each having a separate irq.)

-- Pasi


^ permalink raw reply

* Re: [PATCH] xen network backend driver
From: Ben Hutchings @ 2011-01-19 19:25 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ian Campbell, netdev@vger.kernel.org, xen-devel,
	Konrad Rzeszutek Wilk, Tom Herbert
In-Reply-To: <4D3738AB.60701@goop.org>

On Wed, 2011-01-19 at 11:16 -0800, Jeremy Fitzhardinge wrote:
> On 01/19/2011 10:05 AM, Ben Hutchings wrote:
> > Not in itself.  NAPI polling will run on the same CPU which scheduled it
> > (so wherever the IRQ was initially handled).  If the protocol used
> > between netfront and netback doesn't support RSS then RPS
> > <http://lwn.net/Articles/362339/> can be used to spread the RX work
> > across CPUs.
> 
> There's only one irq per netback which is bound to one (V)CPU at a
> time.  I guess we could extend it to have multiple irqs per netback and
> some way of distributing packet flows over them, but that would only
> really make sense if there's a single interface with much more traffic
> than the others; otherwise the interrupts should be fairly well
> distributed (assuming that the different netback irqs are routed to
> different cpus).
> 
> Also, I assume that if most of the packets are not terminating in dom0
> itself but are sent out some other device (either real hardware or to
> another domain), then there won't be any protocol processing and the
> amount of CPU required to handle the packet is minimal.  Is that true? 
> And if so, would RPS help in that case? I would expect the cost of an
> IPI to swamp anything else that needs to happen to the packet.

IPIs are apparently pretty cheap now.

If NAPI doesn't work for netback, with or without RPS, maybe you have to
use that kthread.  But please don't reinvent the wheel by creating your
own tasklets or require users to set a special parameter.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH] xen network backend driver
From: Ian Campbell @ 2011-01-19 19:24 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ben Hutchings, netdev@vger.kernel.org, xen-devel,
	Konrad Rzeszutek Wilk
In-Reply-To: <4D3738AB.60701@goop.org>

On Wed, 2011-01-19 at 19:16 +0000, Jeremy Fitzhardinge wrote: 
> On 01/19/2011 10:05 AM, Ben Hutchings wrote:
> > Not in itself.  NAPI polling will run on the same CPU which scheduled it
> > (so wherever the IRQ was initially handled).  If the protocol used
> > between netfront and netback doesn't support RSS then RPS
> > <http://lwn.net/Articles/362339/> can be used to spread the RX work
> > across CPUs.
> 
> There's only one irq per netback which is bound to one (V)CPU at a
> time.  I guess we could extend it to have multiple irqs per netback and
> some way of distributing packet flows over them, but that would only
> really make sense if there's a single interface with much more traffic
> than the others; otherwise the interrupts should be fairly well
> distributed (assuming that the different netback irqs are routed to
> different cpus).

I'd gotten myself confused thinking in terms of a single driver -- this
all just falls out naturally from each vif backend instance having its
own interrupt, just like it does today, NAPI makes no difference here.

There is talk of implementing multiqueue (and hence multi-IRQ) support
for the guest RX path (netback TX) and using RSS in that case but not
yet any plans for anything similar on the guest TX path.

Ian.


^ permalink raw reply

* Re: inbound connection problems when "netlink: test for all flags of the NLM_F_DUMP composite" commit applied
From: Jarek Poplawski @ 2011-01-19 19:24 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: jamal, Pablo Neira Ayuso, David Miller, arthur.marsh,
	eric.dumazet, netdev
In-Reply-To: <alpine.LNX.2.01.1101191858390.13633@obet.zrqbmnf.qr>

On Wed, Jan 19, 2011 at 07:04:06PM +0100, Jan Engelhardt wrote:
> 
> On Wednesday 2011-01-19 17:54, Jarek Poplawski wrote:
> >
> >I still don't understand why you call this the nonsense. There are
> >two dump flags NLM_F_ROOT and NLM_F_MATCH plus for convenience
> >NLM_F_DUMP as 2 in 1. Avahi uses these specific flags. Why would
> >anybody have added these specific flags if they can never be used
> >separately?
> 
> It looks like the authors' intentions were to make NLM_F_MATCH not
> stop after a single entry has been found. So that sounds like dump,
> ok.
> 
> But NLM_F_ROOT does not quite strike me as a dump request. What if I
> wanted just a single item returned but still start at the root?

Hmm... Does it say about starting at the root?:

"          NLM_F_ROOT     Return the complete table instead of a
                          single entry."

> 
> Or asking from a different direction, what's NLM_F_ROOT good for
> when, say, struct rtmsg->rtm_table specifies (in rtnetlink) where to
> start? (Particularly, 0 for an "invisible root" that contains all
> tables.)

I can't say I understand these flags, but IMHO the main point is we
should respect them as separate, even if they are mostly unused and look
unnecessary. (Unless there is really no other way of fixing this
genetlink bug.) If it were undocumented... but after all this is the RFC.

Jarek P.

^ permalink raw reply

* Re: [RFC] ipv6: don't flush routes when setting loopback down
From: Jiri Bohac @ 2011-01-19 19:18 UTC (permalink / raw)
  To: David Miller
  Cc: ebiederm, shemminger, brian.haley, netdev, maheshkelkar, lorenzo,
	yoshfuji, stable
In-Reply-To: <20101216.182656.226781473.davem@davemloft.net>

Hi,


The commit (29ba5fed1bbd09c2cba890798c8f9eaab251401d) causes
another regression:

Prior to the commit, on a freshly booted system, when I do:
	sysctl net.ipv6.conf.all.disable_ipv6=1
Then any attempt to connect to ::1 will fail immediately with
"Network is unreachable" (e.g. "ping6 ::1" or "telnet ::1 22").

After the commit, doing
	sysctl net.ipv6.conf.all.disable_ipv6=1
makes connection attempts to ::1 wait for a long time before they fail.

This is caused by the local route which is now left configured. 
"ip -6 r l table all" has an additional line in its output:
	local ::1 via :: dev lo  table local  proto none  metric 0  mtu 16436 rtt 40ms rttvar 40ms cwnd 3 advmss 16376 hoplimit 0

With both ::1 and 127.0.0.1 specified for localhost in
/etc/hosts, disabling ipv6 now breaks many applications
connecting to localhost. Deleting the local route manually solves
the problems.

Could this be reverted, please?

I have the feeling that Eric's patch is the safest solution we
have so far:

> Finding the real bug is beyond me right now, but fixing the regression
> in disable_ipv6 is simple.  We can just delete ::1 when we bring down
> the loopback interface, and it will be restored automatically when we
> bring the loopback interface back up.
> 
> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
> ---
> Index: linux-2.6.37-rc5.x86_64/net/ipv6/addrconf.c
> ===================================================================
> --- linux-2.6.37-rc5.x86_64.orig/net/ipv6/addrconf.c
> +++ linux-2.6.37-rc5.x86_64/net/ipv6/addrconf.c
> @@ -2727,6 +2727,7 @@ static int addrconf_ifdown(struct net_de
>  		/* If just doing link down, and address is permanent
>  		   and not link-local, then retain it. */
>  		if (!how &&
> +		    !ipv6_addr_loopback(&ifa->addr) &&
>  		    (ifa->flags&IFA_F_PERMANENT) &&
>  		    !(ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)) {
>  			list_move_tail(&ifa->if_list, &keep_list);

-- 
Jiri Bohac <jbohac@suse.cz>
SUSE Labs, SUSE CZ


^ permalink raw reply

* Re: [PATCH] xen network backend driver
From: Jeremy Fitzhardinge @ 2011-01-19 19:16 UTC (permalink / raw)
  To: Ben Hutchings
  Cc: Ian Campbell, netdev@vger.kernel.org, xen-devel,
	Konrad Rzeszutek Wilk
In-Reply-To: <1295460304.11126.53.camel@bwh-desktop>

On 01/19/2011 10:05 AM, Ben Hutchings wrote:
> Not in itself.  NAPI polling will run on the same CPU which scheduled it
> (so wherever the IRQ was initially handled).  If the protocol used
> between netfront and netback doesn't support RSS then RPS
> <http://lwn.net/Articles/362339/> can be used to spread the RX work
> across CPUs.

There's only one irq per netback which is bound to one (V)CPU at a
time.  I guess we could extend it to have multiple irqs per netback and
some way of distributing packet flows over them, but that would only
really make sense if there's a single interface with much more traffic
than the others; otherwise the interrupts should be fairly well
distributed (assuming that the different netback irqs are routed to
different cpus).

Also, I assume that if most of the packets are not terminating in dom0
itself but are sent out some other device (either real hardware or to
another domain), then there won't be any protocol processing and the
amount of CPU required to handle the packet is minimal.  Is that true? 
And if so, would RPS help in that case? I would expect the cost of an
IPI to swamp anything else that needs to happen to the packet.

    J

^ permalink raw reply

* [PATCH 77/79] netfilter: nf_conntrack: nf_conntrack snmp helper
From: kaber @ 2011-01-19 19:15 UTC (permalink / raw)
  To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>

From: Jiri Olsa <jolsa@redhat.com>

Adding support for SNMP broadcast connection tracking. The SNMP
broadcast requests are now paired with the SNMP responses.
Thus allowing using SNMP broadcasts with firewall enabled.

Please refer to the following conversation:
http://marc.info/?l=netfilter-devel&m=125992205006600&w=2

Patrick McHardy wrote:
> > The best solution would be to add generic broadcast tracking, the
> > use of expectations for this is a bit of abuse.
> > The second best choice I guess would be to move the help() function
> > to a shared module and generalize it so it can be used for both.
This patch implements the "second best choice".

Since the netbios-ns conntrack module uses the same helper
functionality as the snmp, only one helper function is added
for both snmp and netbios-ns modules into the new object -
nf_conntrack_broadcast.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/netfilter/nf_conntrack_snmp.h |    9 +++
 include/net/netfilter/nf_conntrack_helper.h |    6 ++
 net/ipv4/netfilter/Kconfig                  |    3 +-
 net/ipv4/netfilter/nf_nat_snmp_basic.c      |    9 ++-
 net/netfilter/Kconfig                       |   19 ++++++
 net/netfilter/Makefile                      |    2 +
 net/netfilter/nf_conntrack_broadcast.c      |   82 +++++++++++++++++++++++++++
 net/netfilter/nf_conntrack_netbios_ns.c     |   74 +++---------------------
 net/netfilter/nf_conntrack_snmp.c           |   77 +++++++++++++++++++++++++
 9 files changed, 211 insertions(+), 70 deletions(-)
 create mode 100644 include/linux/netfilter/nf_conntrack_snmp.h
 create mode 100644 net/netfilter/nf_conntrack_broadcast.c
 create mode 100644 net/netfilter/nf_conntrack_snmp.c

diff --git a/include/linux/netfilter/nf_conntrack_snmp.h b/include/linux/netfilter/nf_conntrack_snmp.h
new file mode 100644
index 0000000..064bc63
--- /dev/null
+++ b/include/linux/netfilter/nf_conntrack_snmp.h
@@ -0,0 +1,9 @@
+#ifndef _NF_CONNTRACK_SNMP_H
+#define _NF_CONNTRACK_SNMP_H
+
+extern int (*nf_nat_snmp_hook)(struct sk_buff *skb,
+				unsigned int protoff,
+				struct nf_conn *ct,
+				enum ip_conntrack_info ctinfo);
+
+#endif /* _NF_CONNTRACK_SNMP_H */
diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index 32c305d..f1c1311 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -63,4 +63,10 @@ static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct)
 extern int nf_conntrack_helper_init(void);
 extern void nf_conntrack_helper_fini(void);
 
+extern int nf_conntrack_broadcast_help(struct sk_buff *skb,
+				       unsigned int protoff,
+				       struct nf_conn *ct,
+				       enum ip_conntrack_info ctinfo,
+				       unsigned int timeout);
+
 #endif /*_NF_CONNTRACK_HELPER_H*/
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index babd1a2..f926a31 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -206,8 +206,9 @@ config IP_NF_TARGET_REDIRECT
 
 config NF_NAT_SNMP_BASIC
 	tristate "Basic SNMP-ALG support"
-	depends on NF_NAT
+	depends on NF_CONNTRACK_SNMP && NF_NAT
 	depends on NETFILTER_ADVANCED
+	default NF_NAT && NF_CONNTRACK_SNMP
 	---help---
 
 	  This module implements an Application Layer Gateway (ALG) for
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419..8812a02 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -54,6 +54,7 @@
 #include <net/netfilter/nf_conntrack_expect.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_nat_helper.h>
+#include <linux/netfilter/nf_conntrack_snmp.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void)
 {
 	int ret = 0;
 
-	ret = nf_conntrack_helper_register(&snmp_helper);
-	if (ret < 0)
-		return ret;
+	BUG_ON(nf_nat_snmp_hook != NULL);
+	rcu_assign_pointer(nf_nat_snmp_hook, help);
+
 	ret = nf_conntrack_helper_register(&snmp_trap_helper);
 	if (ret < 0) {
 		nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void)
 
 static void __exit nf_nat_snmp_basic_fini(void)
 {
-	nf_conntrack_helper_unregister(&snmp_helper);
+	rcu_assign_pointer(nf_nat_snmp_hook, NULL);
 	nf_conntrack_helper_unregister(&snmp_trap_helper);
 }
 
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e2480bd..939b504 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -185,9 +185,13 @@ config NF_CONNTRACK_IRC
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NF_CONNTRACK_BROADCAST
+	tristate
+
 config NF_CONNTRACK_NETBIOS_NS
 	tristate "NetBIOS name service protocol support"
 	depends on NETFILTER_ADVANCED
+	select NF_CONNTRACK_BROADCAST
 	help
 	  NetBIOS name service requests are sent as broadcast messages from an
 	  unprivileged port and responded to with unicast messages to the
@@ -204,6 +208,21 @@ config NF_CONNTRACK_NETBIOS_NS
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config NF_CONNTRACK_SNMP
+	tristate "SNMP service protocol support"
+	depends on NETFILTER_ADVANCED
+	select NF_CONNTRACK_BROADCAST
+	help
+	  SNMP service requests are sent as broadcast messages from an
+	  unprivileged port and responded to with unicast messages to the
+	  same port. This make them hard to firewall properly because connection
+	  tracking doesn't deal with broadcasts. This helper tracks locally
+	  originating SNMP service requests and the corresponding
+	  responses. It relies on correct IP address configuration, specifically
+	  netmask and broadcast address.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 config NF_CONNTRACK_PPTP
 	tristate "PPtP protocol support"
 	depends on NETFILTER_ADVANCED
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 401d574..2c2628d 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -28,7 +28,9 @@ obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o
 obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
 obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o
 obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o
+obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o
 obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o
+obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o
 obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o
 obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o
 obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
new file mode 100644
index 0000000..4e99cca
--- /dev/null
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -0,0 +1,82 @@
+/*
+ *      broadcast connection tracking helper
+ *
+ *      (c) 2005 Patrick McHardy <kaber@trash.net>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <net/route.h>
+#include <linux/inetdevice.h>
+#include <linux/skbuff.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+int nf_conntrack_broadcast_help(struct sk_buff *skb,
+				unsigned int protoff,
+				struct nf_conn *ct,
+				enum ip_conntrack_info ctinfo,
+				unsigned int timeout)
+{
+	struct nf_conntrack_expect *exp;
+	struct iphdr *iph = ip_hdr(skb);
+	struct rtable *rt = skb_rtable(skb);
+	struct in_device *in_dev;
+	struct nf_conn_help *help = nfct_help(ct);
+	__be32 mask = 0;
+
+	/* we're only interested in locally generated packets */
+	if (skb->sk == NULL)
+		goto out;
+	if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
+		goto out;
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		goto out;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(rt->dst.dev);
+	if (in_dev != NULL) {
+		for_primary_ifa(in_dev) {
+			if (ifa->ifa_broadcast == iph->daddr) {
+				mask = ifa->ifa_mask;
+				break;
+			}
+		} endfor_ifa(in_dev);
+	}
+	rcu_read_unlock();
+
+	if (mask == 0)
+		goto out;
+
+	exp = nf_ct_expect_alloc(ct);
+	if (exp == NULL)
+		goto out;
+
+	exp->tuple                = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+	exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
+
+	exp->mask.src.u3.ip       = mask;
+	exp->mask.src.u.udp.port  = htons(0xFFFF);
+
+	exp->expectfn             = NULL;
+	exp->flags                = NF_CT_EXPECT_PERMANENT;
+	exp->class		  = NF_CT_EXPECT_CLASS_DEFAULT;
+	exp->helper               = NULL;
+
+	nf_ct_expect_related(exp);
+	nf_ct_expect_put(exp);
+
+	nf_ct_refresh(ct, skb, timeout * HZ);
+out:
+	return NF_ACCEPT;
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index aadde01..4c8f30a 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -18,14 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/skbuff.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/if_addr.h>
 #include <linux/in.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <net/route.h>
 
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_helper.h>
@@ -40,75 +33,26 @@ MODULE_ALIAS("ip_conntrack_netbios_ns");
 MODULE_ALIAS_NFCT_HELPER("netbios_ns");
 
 static unsigned int timeout __read_mostly = 3;
-module_param(timeout, uint, 0400);
+module_param(timeout, uint, S_IRUSR);
 MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
 
-static int help(struct sk_buff *skb, unsigned int protoff,
-		struct nf_conn *ct, enum ip_conntrack_info ctinfo)
-{
-	struct nf_conntrack_expect *exp;
-	struct iphdr *iph = ip_hdr(skb);
-	struct rtable *rt = skb_rtable(skb);
-	struct in_device *in_dev;
-	__be32 mask = 0;
-
-	/* we're only interested in locally generated packets */
-	if (skb->sk == NULL)
-		goto out;
-	if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
-		goto out;
-	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
-		goto out;
-
-	rcu_read_lock();
-	in_dev = __in_dev_get_rcu(rt->dst.dev);
-	if (in_dev != NULL) {
-		for_primary_ifa(in_dev) {
-			if (ifa->ifa_broadcast == iph->daddr) {
-				mask = ifa->ifa_mask;
-				break;
-			}
-		} endfor_ifa(in_dev);
-	}
-	rcu_read_unlock();
-
-	if (mask == 0)
-		goto out;
-
-	exp = nf_ct_expect_alloc(ct);
-	if (exp == NULL)
-		goto out;
-
-	exp->tuple                = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
-	exp->tuple.src.u.udp.port = htons(NMBD_PORT);
-
-	exp->mask.src.u3.ip       = mask;
-	exp->mask.src.u.udp.port  = htons(0xFFFF);
-
-	exp->expectfn             = NULL;
-	exp->flags                = NF_CT_EXPECT_PERMANENT;
-	exp->class		  = NF_CT_EXPECT_CLASS_DEFAULT;
-	exp->helper               = NULL;
-
-	nf_ct_expect_related(exp);
-	nf_ct_expect_put(exp);
-
-	nf_ct_refresh(ct, skb, timeout * HZ);
-out:
-	return NF_ACCEPT;
-}
-
 static struct nf_conntrack_expect_policy exp_policy = {
 	.max_expected	= 1,
 };
 
+static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
+		   struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+}
+
 static struct nf_conntrack_helper helper __read_mostly = {
 	.name			= "netbios-ns",
-	.tuple.src.l3num	= AF_INET,
+	.tuple.src.l3num	= NFPROTO_IPV4,
 	.tuple.src.u.udp.port	= cpu_to_be16(NMBD_PORT),
 	.tuple.dst.protonum	= IPPROTO_UDP,
 	.me			= THIS_MODULE,
-	.help			= help,
+	.help			= netbios_ns_help,
 	.expect_policy		= &exp_policy,
 };
 
diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c
new file mode 100644
index 0000000..6e545e2
--- /dev/null
+++ b/net/netfilter/nf_conntrack_snmp.c
@@ -0,0 +1,77 @@
+/*
+ *      SNMP service broadcast connection tracking helper
+ *
+ *      (c) 2011 Jiri Olsa <jolsa@redhat.com>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/in.h>
+
+#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+
+#define SNMP_PORT	161
+
+MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>");
+MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFCT_HELPER("snmp");
+
+static unsigned int timeout __read_mostly = 30;
+module_param(timeout, uint, S_IRUSR);
+MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
+
+int (*nf_nat_snmp_hook)(struct sk_buff *skb,
+			unsigned int protoff,
+			struct nf_conn *ct,
+			enum ip_conntrack_info ctinfo);
+EXPORT_SYMBOL_GPL(nf_nat_snmp_hook);
+
+static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff,
+		struct nf_conn *ct, enum ip_conntrack_info ctinfo)
+{
+	typeof(nf_nat_snmp_hook) nf_nat_snmp;
+
+	nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
+
+	nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook);
+	if (nf_nat_snmp && ct->status & IPS_NAT_MASK)
+		return nf_nat_snmp(skb, protoff, ct, ctinfo);
+
+	return NF_ACCEPT;
+}
+
+static struct nf_conntrack_expect_policy exp_policy = {
+	.max_expected	= 1,
+};
+
+static struct nf_conntrack_helper helper __read_mostly = {
+	.name			= "snmp",
+	.tuple.src.l3num	= NFPROTO_IPV4,
+	.tuple.src.u.udp.port	= cpu_to_be16(SNMP_PORT),
+	.tuple.dst.protonum	= IPPROTO_UDP,
+	.me			= THIS_MODULE,
+	.help			= snmp_conntrack_help,
+	.expect_policy		= &exp_policy,
+};
+
+static int __init nf_conntrack_snmp_init(void)
+{
+	exp_policy.timeout = timeout;
+	return nf_conntrack_helper_register(&helper);
+}
+
+static void __exit nf_conntrack_snmp_fini(void)
+{
+	nf_conntrack_helper_unregister(&helper);
+}
+
+module_init(nf_conntrack_snmp_init);
+module_exit(nf_conntrack_snmp_fini);
-- 
1.7.2.3


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox