Netdev List
 help / color / mirror / Atom feed
* [RFC PATCH 2/3] mm: qmempool - quick queue based memory pool
From: Jesper Dangaard Brouer @ 2014-12-10 14:15 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, netdev, linux-kernel, linux-mm,
	Christoph Lameter
  Cc: linux-api, Eric Dumazet, David S. Miller, Hannes Frederic Sowa,
	Alexander Duyck, Alexei Starovoitov, Paul E. McKenney,
	Mathieu Desnoyers, Steven Rostedt
In-Reply-To: <20141210141332.31779.56391.stgit@dragon>

A quick queue-based memory pool, that functions as a cache in-front
of kmem_cache SLAB/SLUB allocators.  Which allows faster than
SLAB/SLUB reuse/caching of fixed size memory elements

The speed gain comes from, the shared storage, using a Lock-Free
queue that supports bulk refilling elements (to a percpu cache)
with a single cmpxchg.  Thus, the (lock-prefixed) cmpxchg cost is
amortize over the bulk size.

Qmempool cannot easily replace all kmem_cache usage, because it is
restricted in which contexts is can be used in, as the Lock-Free
queue is not preemption safe. E.g. only supports GFP_ATOMIC allocations
from SLAB.

This version is optimized for usage from softirq context, and cannot
be used from hardirq context.  Usage from none-softirq requires usage
of local_bh_{disable,enable}, which have a fairly high cost.

Performance micro benchmarks against SLUB. First test is fast-path
reuse of same element. Second test is allocating 256 element before
freeing elements again, this pattern comes from how NIC ring queue
cleanups often run.

On CPU E5-2695, CONFIG_PREEMPT=y, showing cost of alloc+free:

                 SLUB      - softirq   - none-softirq
 fastpath-reuse: 19.563 ns -  7.837 ns - 18.536 ns
 N(256)-pattern: 45.039 ns - 11.782 ns - 24.186 ns

A significant win for usage from softirq, and a smaller win for
none-softirq which requires taking local_bh_{disable,enable}.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---

 include/linux/qmempool.h |  205 +++++++++++++++++++++++++++++
 mm/Kconfig               |   12 ++
 mm/Makefile              |    1 
 mm/qmempool.c            |  322 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 540 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/qmempool.h
 create mode 100644 mm/qmempool.c

diff --git a/include/linux/qmempool.h b/include/linux/qmempool.h
new file mode 100644
index 0000000..922ed27
--- /dev/null
+++ b/include/linux/qmempool.h
@@ -0,0 +1,205 @@
+/*
+ * qmempool - a quick queue based mempool
+ *
+ * A quick queue-based memory pool, that functions as a cache in-front
+ * of kmem_cache SLAB/SLUB allocators.  Which allows faster than
+ * SLAB/SLUB reuse/caching of fixed size memory elements
+ *
+ * The speed gain comes from, the shared storage, using a Lock-Free
+ * queue that supports bulk refilling elements (to a percpu cache)
+ * with a single cmpxchg.  Thus, the lock-prefixed cmpxchg cost is
+ * amortize over the bulk size.
+ *
+ * The Lock-Free queue is based on an array (of pointer to elements).
+ * This make access more cache optimal, as e.g. on 64bit 8 pointers
+ * can be stored per cache-line (which is superior to a linked list
+ * approach).  Only storing the pointers to elements, is also
+ * beneficial as we don't touch the elements data.
+ *
+ * Qmempool cannot easily replace all kmem_cache usage, because it is
+ * restricted in which contexts is can be used in, as the Lock-Free
+ * queue is not preemption safe.  This version is optimized for usage
+ * from softirq context, and cannot be used from hardirq context.
+ *
+ * Only support GFP_ATOMIC allocations from SLAB.
+ *
+ * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer
+ *  for licensing details see kernel-base/COPYING
+ */
+
+#ifndef _LINUX_QMEMPOOL_H
+#define _LINUX_QMEMPOOL_H
+
+#include <linux/alf_queue.h>
+#include <linux/prefetch.h>
+#include <linux/hardirq.h>
+
+/* Bulking is an essential part of the performance gains as this
+ * amortize the cost of cmpxchg ops used when accessing sharedq
+ */
+#define QMEMPOOL_BULK 16
+#define QMEMPOOL_REFILL_MULTIPLIER 2
+
+struct qmempool_percpu {
+	struct alf_queue *localq;
+};
+
+struct qmempool {
+	/* The shared queue (sharedq) is a Multi-Producer-Multi-Consumer
+	 *  queue where access is protected by an atomic cmpxchg operation.
+	 *  The queue support bulk transfers, which amortize the cost
+	 *  of the atomic cmpxchg operation.
+	 */
+	struct alf_queue	*sharedq;
+
+	/* Per CPU local "cache" queues for faster atomic free access.
+	 * The local queues (localq) are Single-Producer-Single-Consumer
+	 * queues as they are per CPU.
+	 */
+	struct qmempool_percpu __percpu *percpu;
+
+	/* Backed by some SLAB kmem_cache */
+	struct kmem_cache	*kmem;
+
+	/* Setup */
+	uint32_t prealloc;
+	gfp_t gfp_mask;
+};
+
+extern void qmempool_destroy(struct qmempool *pool);
+extern struct qmempool *qmempool_create(
+	uint32_t localq_sz, uint32_t sharedq_sz, uint32_t prealloc,
+	struct kmem_cache *kmem, gfp_t gfp_mask);
+
+extern void *__qmempool_alloc_from_sharedq(
+	struct qmempool *pool, gfp_t gfp_mask, struct alf_queue *localq);
+extern void __qmempool_free_to_sharedq(void *elem, struct qmempool *pool,
+				       struct alf_queue *localq);
+
+/* The percpu variables (SPSC queues) needs preempt protection, and
+ * the shared MPMC queue also needs protection against the same CPU
+ * access the same queue.
+ *
+ * Specialize and optimize the qmempool to run from softirq.
+ * Don't allow qmempool to be used from interrupt context.
+ *
+ * IDEA: When used from softirq, take advantage of the protection
+ * softirq gives.  A softirq will never preempt another softirq,
+ * running on the same CPU.  The only event that can preempt a softirq
+ * is an interrupt handler (and perhaps we don't need to support
+ * calling qmempool from an interrupt).  Another softirq, even the
+ * same one, can run on another CPU however, but these helpers are
+ * only protecting our percpu variables.
+ *
+ * Thus, our percpu variables are safe if current the CPU is the one
+ * serving the softirq (tested via in_serving_softirq()), like:
+ *
+ *  if (!in_serving_softirq())
+ *		local_bh_disable();
+ *
+ * This makes qmempool very fast, when accesses from softirq, but
+ * slower when accessed outside softirq.  The other contexts need to
+ * disable bottom-halves "bh" via local_bh_{disable,enable} (which on
+ * have been measured add cost if 7.5ns on CPU E5-2695).
+ *
+ * MUST not be used from interrupt context, when relying on softirq usage.
+ */
+static inline int __qmempool_preempt_disable(void)
+{
+	int in_serving_softirq = in_serving_softirq();
+
+	if (!in_serving_softirq)
+		local_bh_disable();
+
+	return in_serving_softirq;
+}
+
+static inline void __qmempool_preempt_enable(int in_serving_softirq)
+{
+	if (!in_serving_softirq)
+		local_bh_enable();
+}
+
+/* Elements - alloc and free functions are inlined here for
+ * performance reasons, as the per CPU lockless access should be as
+ * fast as possible.
+ */
+
+/* Main allocation function
+ *
+ * Caller must make sure this is called from a preemptive safe context
+ */
+static inline void * main_qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask)
+{
+	/* NUMA considerations, for now the numa node is not handles,
+	 * this could be handled via e.g. numa_mem_id()
+	 */
+	void *elem;
+	struct qmempool_percpu *cpu;
+	int num;
+
+	/* 1. attempt get element from local per CPU queue */
+	cpu = this_cpu_ptr(pool->percpu);
+	num = alf_sc_dequeue(cpu->localq, (void **)&elem, 1);
+	if (num == 1) /* Succes: alloc elem by deq from localq cpu cache */
+		return elem;
+
+	/* 2. attempt get element from shared queue.  This involves
+	 * refilling the localq for next round. Side-effect can be
+	 * alloc from SLAB.
+	 */
+	elem = __qmempool_alloc_from_sharedq(pool, gfp_mask, cpu->localq);
+	return elem;
+}
+
+static inline void *__qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask)
+{
+	void *elem;
+	int state;
+
+	state = __qmempool_preempt_disable();
+	elem  = main_qmempool_alloc(pool, gfp_mask);
+	__qmempool_preempt_enable(state);
+	return elem;
+}
+
+static inline void *__qmempool_alloc_softirq(struct qmempool *pool,
+					     gfp_t gfp_mask)
+{
+	return main_qmempool_alloc(pool, gfp_mask);
+}
+
+/* Main free function */
+static inline void __qmempool_free(struct qmempool *pool, void *elem)
+{
+	struct qmempool_percpu *cpu;
+	int num;
+	int state;
+
+	/* NUMA considerations, how do we make sure to avoid caching
+	 * elements from a different NUMA node.
+	 */
+	state = __qmempool_preempt_disable();
+
+	/* 1. attempt to free/return element to local per CPU queue */
+	cpu = this_cpu_ptr(pool->percpu);
+	num = alf_sp_enqueue(cpu->localq, &elem, 1);
+	if (num == 1) /* success: element free'ed by enqueue to localq */
+		goto done;
+
+	/* 2. localq cannot store more elements, need to return some
+	 * from localq to sharedq, to make room. Side-effect can be
+	 * free to SLAB.
+	 */
+	__qmempool_free_to_sharedq(elem, pool, cpu->localq);
+
+done:
+	__qmempool_preempt_enable(state);
+}
+
+/* API users can choose to use "__" prefixed versions for inlining */
+extern void *qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask);
+extern void *qmempool_alloc_softirq(struct qmempool *pool, gfp_t gfp_mask);
+extern void qmempool_free(struct qmempool *pool, void *elem);
+
+#endif /* _LINUX_QMEMPOOL_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 1d1ae6b..abaa94c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -618,3 +618,15 @@ config MAX_STACK_SIZE_MB
 	  changed to a smaller value in which case that is used.
 
 	  A sane initial value is 80 MB.
+
+config QMEMPOOL
+	bool "Quick queue based mempool (qmempool)"
+	default y
+	select ALF_QUEUE
+	help
+	  A mempool designed for faster than SLAB/kmem_cache
+	  reuse/caching of fixed size memory elements.  Works as a
+	  caching layer in-front of existing kmem_cache SLABs.  Speed
+	  is achieved by _bulk_ refilling percpu local cache, from a
+	  Lock-Free queue requiring a single (locked) cmpxchg per bulk
+	  transfer, thus amortizing the cost of the cmpxchg.
diff --git a/mm/Makefile b/mm/Makefile
index 8405eb0..49c1e18 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -69,3 +69,4 @@ obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_QMEMPOOL) += qmempool.o
diff --git a/mm/qmempool.c b/mm/qmempool.c
new file mode 100644
index 0000000..d6debcc
--- /dev/null
+++ b/mm/qmempool.c
@@ -0,0 +1,322 @@
+/*
+ * qmempool - a quick queue based mempool
+ *
+ * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer
+ *  for licensing details see kernel-base/COPYING
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/qmempool.h>
+#include <linux/log2.h>
+
+/* Due to hotplug CPU support, we need access to all qmempools
+ * in-order to cleanup elements in localq for the CPU going offline.
+ *
+ * TODO: implement HOTPLUG_CPU
+#ifdef CONFIG_HOTPLUG_CPU
+static LIST_HEAD(qmempool_list);
+static DEFINE_SPINLOCK(qmempool_list_lock);
+#endif
+ */
+
+void qmempool_destroy(struct qmempool *pool)
+{
+	void *elem = NULL;
+	int j;
+
+	if (pool->percpu) {
+		for_each_possible_cpu(j) {
+			struct qmempool_percpu *cpu =
+				per_cpu_ptr(pool->percpu, j);
+
+			while (alf_mc_dequeue(cpu->localq, &elem, 1) == 1)
+				kmem_cache_free(pool->kmem, elem);
+			BUG_ON(!alf_queue_empty(cpu->localq));
+			alf_queue_free(cpu->localq);
+		}
+		free_percpu(pool->percpu);
+	}
+
+	if (pool->sharedq) {
+		while (alf_mc_dequeue(pool->sharedq, &elem, 1) == 1)
+			kmem_cache_free(pool->kmem, elem);
+		BUG_ON(!alf_queue_empty(pool->sharedq));
+		alf_queue_free(pool->sharedq);
+	}
+
+	kfree(pool);
+}
+EXPORT_SYMBOL(qmempool_destroy);
+
+struct qmempool *
+qmempool_create(uint32_t localq_sz, uint32_t sharedq_sz, uint32_t prealloc,
+		struct kmem_cache *kmem, gfp_t gfp_mask)
+{
+	struct qmempool *pool;
+	int i, j, num;
+	void *elem;
+
+	/* Validate constraints, e.g. due to bulking */
+	if (localq_sz < QMEMPOOL_BULK) {
+		pr_err("%s() localq size(%d) too small for bulking\n",
+		       __func__, localq_sz);
+		return NULL;
+	}
+	if (sharedq_sz < (QMEMPOOL_BULK * QMEMPOOL_REFILL_MULTIPLIER)) {
+		pr_err("%s() sharedq size(%d) too small for bulk refill\n",
+		       __func__, sharedq_sz);
+		return NULL;
+	}
+	if (!is_power_of_2(localq_sz) || !is_power_of_2(sharedq_sz)) {
+		pr_err("%s() queue sizes (%d/%d) must be power-of-2\n",
+		       __func__, localq_sz, sharedq_sz);
+		return NULL;
+	}
+	if (prealloc > sharedq_sz) {
+		pr_err("%s() prealloc(%d) req > sharedq size(%d)\n",
+		       __func__, prealloc, sharedq_sz);
+		return NULL;
+	}
+	if ((prealloc % QMEMPOOL_BULK) != 0) {
+		pr_warn("%s() prealloc(%d) should be div by BULK size(%d)\n",
+			__func__, prealloc, QMEMPOOL_BULK);
+	}
+	if (!kmem) {
+		pr_err("%s() kmem_cache is a NULL ptr\n",  __func__);
+		return NULL;
+	}
+
+	pool = kzalloc(sizeof(*pool), gfp_mask);
+	if (!pool)
+		return NULL;
+	pool->kmem     = kmem;
+	pool->gfp_mask = gfp_mask;
+
+	/* MPMC (Multi-Producer-Multi-Consumer) queue */
+	pool->sharedq = alf_queue_alloc(sharedq_sz, gfp_mask);
+	if (IS_ERR_OR_NULL(pool->sharedq)) {
+		pr_err("%s() failed to create shared queue(%d) ERR_PTR:0x%p\n",
+		       __func__, sharedq_sz, pool->sharedq);
+		qmempool_destroy(pool);
+		return NULL;
+	}
+
+	pool->prealloc = prealloc;
+	for (i = 0; i < prealloc; i++) {
+		elem = kmem_cache_alloc(pool->kmem, gfp_mask);
+		if (!elem) {
+			pr_err("%s() kmem_cache out of memory?!\n",  __func__);
+			qmempool_destroy(pool);
+			return NULL;
+		}
+		/* Could use the SP version given it is not visible yet */
+		num = alf_mp_enqueue(pool->sharedq, &elem, 1);
+		BUG_ON(num <= 0);
+	}
+
+	pool->percpu = alloc_percpu(struct qmempool_percpu);
+	if (pool->percpu == NULL) {
+		pr_err("%s() failed to alloc percpu\n", __func__);
+		qmempool_destroy(pool);
+		return NULL;
+	}
+
+	/* SPSC (Single-Consumer-Single-Producer) queue per CPU */
+	for_each_possible_cpu(j) {
+		struct qmempool_percpu *cpu = per_cpu_ptr(pool->percpu, j);
+
+		cpu->localq = alf_queue_alloc(localq_sz, gfp_mask);
+		if (IS_ERR_OR_NULL(cpu->localq)) {
+			pr_err("%s() failed alloc localq(sz:%d) on cpu:%d\n",
+			       __func__, localq_sz, j);
+			qmempool_destroy(pool);
+			return NULL;
+		}
+	}
+
+	return pool;
+}
+EXPORT_SYMBOL(qmempool_create);
+
+/* Element handling
+ */
+
+/* This function is called when sharedq runs-out of elements.
+ * Thus, sharedq needs to be refilled (enq) with elems from slab.
+ *
+ * Caller must assure this is called in an preemptive safe context due
+ * to alf_mp_enqueue() call.
+ */
+void *__qmempool_alloc_from_slab(struct qmempool *pool, gfp_t gfp_mask)
+{
+	void *elems[QMEMPOOL_BULK]; /* on stack variable */
+	void *elem;
+	int num, i, j;
+
+	/* Cannot use SLAB that can sleep if (gfp_mask & __GFP_WAIT),
+	 * else preemption disable/enable scheme becomes too complicated
+	 */
+	BUG_ON(gfp_mask & __GFP_WAIT);
+
+	elem = kmem_cache_alloc(pool->kmem, gfp_mask);
+	if (elem == NULL) /* slab depleted, no reason to call below allocs */
+		return NULL;
+
+	/* SLAB considerations, we need a kmem_cache interface that
+	 * supports allocating a bulk of elements.
+	 */
+
+	for (i = 0; i < QMEMPOOL_REFILL_MULTIPLIER; i++) {
+		for (j = 0; j < QMEMPOOL_BULK; j++) {
+			elems[j] = kmem_cache_alloc(pool->kmem, gfp_mask);
+			/* Handle if slab gives us NULL elem */
+			if (elems[j] == NULL) {
+				pr_err("%s() ARGH - slab returned NULL",
+				       __func__);
+				num = alf_mp_enqueue(pool->sharedq, elems, j-1);
+				BUG_ON(num == 0); //FIXME handle
+				return elem;
+			}
+		}
+		num = alf_mp_enqueue(pool->sharedq, elems, QMEMPOOL_BULK);
+		/* FIXME: There is a theoretical chance that multiple
+		 * CPU enter here, refilling sharedq at the same time,
+		 * thus we must handle "full" situation, for now die
+		 * hard so someone will need to fix this.
+		 */
+		BUG_ON(num == 0); /* sharedq should have room */
+	}
+
+	/* What about refilling localq here? (else it will happen on
+	 * next cycle, and will cost an extra cmpxchg).
+	 */
+	return elem;
+}
+
+/* This function is called when the localq runs out-of elements.
+ * Thus, localq is refilled (enq) with elements (deq) from sharedq.
+ *
+ * Caller must assure this is called in an preemptive safe context due
+ * to alf_mp_dequeue() call.
+ */
+void *__qmempool_alloc_from_sharedq(struct qmempool *pool, gfp_t gfp_mask,
+				    struct alf_queue *localq)
+{
+	void *elems[QMEMPOOL_BULK]; /* on stack variable */
+	void *elem;
+	int num;
+
+	/* Costs atomic "cmpxchg", but amortize cost by bulk dequeue */
+	num = alf_mc_dequeue(pool->sharedq, elems, QMEMPOOL_BULK);
+	if (likely(num > 0)) {
+		/* Consider prefetching data part of elements here, it
+		 * should be an optimal place to hide memory prefetching.
+		 * Especially given the localq is known to be an empty FIFO
+		 * which guarantees the order objs are accessed in.
+		 */
+		elem = elems[0]; /* extract one element */
+		if (num > 1) {
+			num = alf_sp_enqueue(localq, &elems[1], num-1);
+			/* Refill localq, should be empty, must succeed */
+			BUG_ON(num == 0);
+		}
+		return elem;
+	}
+	/* Use slab if sharedq runs out of elements */
+	elem = __qmempool_alloc_from_slab(pool, gfp_mask);
+	return elem;
+}
+EXPORT_SYMBOL(__qmempool_alloc_from_sharedq);
+
+/* Called when sharedq is full. Thus also make room in sharedq,
+ * besides also freeing the "elems" given.
+ */
+bool __qmempool_free_to_slab(struct qmempool *pool, void **elems, int n)
+{
+	int num, i, j;
+	/* SLAB considerations, we could use kmem_cache interface that
+	 * supports returning a bulk of elements.
+	 */
+
+	/* free these elements for real */
+	for (i = 0; i < n; i++)
+		kmem_cache_free(pool->kmem, elems[i]);
+
+	/* Make room in sharedq for next round */
+	for (i = 0; i < QMEMPOOL_REFILL_MULTIPLIER; i++) {
+		num = alf_mc_dequeue(pool->sharedq, elems, QMEMPOOL_BULK);
+		for (j = 0; j < num; j++)
+			kmem_cache_free(pool->kmem, elems[j]);
+	}
+	return true;
+}
+
+/* This function is called when the localq is full. Thus, elements
+ * from localq needs to be (dequeued) and returned (enqueued) to
+ * sharedq (or if shared is full, need to be free'ed to slab)
+ *
+ * MUST be called from a preemptive safe context.
+ */
+void __qmempool_free_to_sharedq(void *elem, struct qmempool *pool,
+				struct alf_queue *localq)
+{
+	void *elems[QMEMPOOL_BULK]; /* on stack variable */
+	int num_enq, num_deq;
+
+	elems[0] = elem;
+	/* Make room in localq */
+	num_deq = alf_sc_dequeue(localq, &elems[1], QMEMPOOL_BULK-1);
+	if (unlikely(num_deq == 0))
+		goto failed;
+	num_deq++; /* count first 'elem' */
+
+	/* Successful dequeued 'num_deq' elements from localq, "free"
+	 * these elems by enqueuing to sharedq
+	 */
+	num_enq = alf_mp_enqueue(pool->sharedq, elems, num_deq);
+	if (likely(num_enq == num_deq)) /* Success enqueued to sharedq */
+		return;
+
+	/* If sharedq is full (num_enq == 0) dequeue elements will be
+	 * returned directly to the SLAB allocator.
+	 *
+	 * Note: This usage of alf_queue API depend on enqueue is
+	 * fixed, by only enqueueing if all elements could fit, this
+	 * is an API that might change.
+	 */
+
+	__qmempool_free_to_slab(pool, elems, num_deq);
+	return;
+failed:
+	/* dequeing from a full localq should always be possible */
+	BUG();
+}
+EXPORT_SYMBOL(__qmempool_free_to_sharedq);
+
+/* API users can choose to use "__" prefixed versions for inlining */
+void *qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask)
+{
+	return __qmempool_alloc(pool, gfp_mask);
+}
+EXPORT_SYMBOL(qmempool_alloc);
+
+void *qmempool_alloc_softirq(struct qmempool *pool, gfp_t gfp_mask)
+{
+	return __qmempool_alloc_softirq(pool, gfp_mask);
+}
+EXPORT_SYMBOL(qmempool_alloc_softirq);
+
+void qmempool_free(struct qmempool *pool, void *elem)
+{
+	return __qmempool_free(pool, elem);
+}
+EXPORT_SYMBOL(qmempool_free);
+
+MODULE_DESCRIPTION("Quick queue based mempool (qmempool)");
+MODULE_AUTHOR("Jesper Dangaard Brouer <netoptimizer@brouer.com>");
+MODULE_LICENSE("GPL");

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 1/3] lib: adding an Array-based Lock-Free (ALF) queue
From: Jesper Dangaard Brouer @ 2014-12-10 14:15 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, netdev, linux-kernel, linux-mm,
	Christoph Lameter
  Cc: linux-api, Eric Dumazet, David S. Miller, Hannes Frederic Sowa,
	Alexander Duyck, Alexei Starovoitov, Paul E. McKenney,
	Mathieu Desnoyers, Steven Rostedt
In-Reply-To: <20141210141332.31779.56391.stgit@dragon>

This Array-based Lock-Free (ALF) queue, is a very fast bounded
Producer-Consumer queue, supporting bulking.  The MPMC
(Multi-Producer/Multi-Consumer) variant uses a locked cmpxchg, but the
cost can be amorized by utilizing bulk enqueue/dequeue.

Results on x86_64 CPU E5-2695, for variants:
 MPMC = Multi-Producer-Multi-Consumer
 SPSC = Single-Producer-Single-Consumer

(none-bulking):  per element cost MPMC and SPSC
                   MPMC     -- SPSC
 simple         :  9.519 ns -- 1.282 ns
 multi(step:128): 12.905 ns -- 2.240 ns

The majority of the cost is associated with the locked cmpxchg in the
MPMC variant.  Bulking helps amortize this cost:

(bulking) cost per element comparing MPMC -> SPSC:
         MPMC     -- SPSC
 bulk2 : 5.849 ns -- 1.748 ns
 bulk3 : 4.102 ns -- 1.531 ns
 bulk4 : 3.281 ns -- 1.383 ns
 bulk6 : 2.530 ns -- 1.238 ns
 bulk8 : 2.125 ns -- 1.196 ns
 bulk16: 1.552 ns -- 1.109 ns

Joint work with Hannes Frederic Sowa.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
---

Correctness of memory barries on different arch's need to be evaluated.

The API might need some adjustments/discussions regarding:

1) the semantics of enq/deq when doing bulking, choosing what
to do when e.g. a bulk enqueue cannot fit fully.

2) some better way to detect miss-use of API, e.g. using
single-enqueue variant function call on a multi-enqueue variant data
structure.  Now no detection happens.

 include/linux/alf_queue.h |  303 +++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig               |   13 ++
 lib/Makefile              |    2 
 lib/alf_queue.c           |   47 +++++++
 4 files changed, 365 insertions(+), 0 deletions(-)
 create mode 100644 include/linux/alf_queue.h
 create mode 100644 lib/alf_queue.c

diff --git a/include/linux/alf_queue.h b/include/linux/alf_queue.h
new file mode 100644
index 0000000..fb1a774
--- /dev/null
+++ b/include/linux/alf_queue.h
@@ -0,0 +1,303 @@
+#ifndef _LINUX_ALF_QUEUE_H
+#define _LINUX_ALF_QUEUE_H
+/* linux/alf_queue.h
+ *
+ * ALF: Array-based Lock-Free queue
+ *
+ * Queue properties
+ *  - Array based for cache-line optimization
+ *  - Bounded by the array size
+ *  - FIFO Producer/Consumer queue, no queue traversal supported
+ *  - Very fast
+ *  - Designed as a queue for pointers to objects
+ *  - Bulk enqueue and dequeue support
+ *  - Supports combinations of Multi and Single Producer/Consumer
+ *
+ * Copyright (C) 2014, Red Hat, Inc.,
+ *  by Jesper Dangaard Brouer and Hannes Frederic Sowa
+ *  for licensing details see kernel-base/COPYING
+ */
+#include <linux/compiler.h>
+#include <linux/kernel.h>
+
+struct alf_actor {
+	u32 head;
+	u32 tail;
+};
+
+struct alf_queue {
+	u32 size;
+	u32 mask;
+	u32 flags;
+	struct alf_actor producer ____cacheline_aligned_in_smp;
+	struct alf_actor consumer ____cacheline_aligned_in_smp;
+	void *ring[0] ____cacheline_aligned_in_smp;
+};
+
+struct alf_queue *alf_queue_alloc(u32 size, gfp_t gfp);
+void		  alf_queue_free(struct alf_queue *q);
+
+/* Helpers for LOAD and STORE of elements, have been split-out because:
+ *  1. They can be reused for both "Single" and "Multi" variants
+ *  2. Allow us to experiment with (pipeline) optimizations in this area.
+ */
+static inline void
+__helper_alf_enqueue_store(u32 p_head, struct alf_queue *q,
+			   void **ptr, const u32 n)
+{
+	int i, index = p_head;
+
+	for (i = 0; i < n; i++, index++)
+		q->ring[index & q->mask] = ptr[i];
+}
+
+static inline void
+__helper_alf_dequeue_load(u32 c_head, struct alf_queue *q,
+			  void **ptr, const u32 elems)
+{
+	int i, index = c_head;
+
+	for (i = 0; i < elems; i++, index++)
+		ptr[i] = q->ring[index & q->mask];
+}
+
+/* Main Multi-Producer ENQUEUE
+ *
+ * Even-though current API have a "fixed" semantics of aborting if it
+ * cannot enqueue the full bulk size.  Users of this API should check
+ * on the returned number of enqueue elements match, to verify enqueue
+ * was successful.  This allow us to introduce a "variable" enqueue
+ * scheme later.
+ *
+ * Not preemption safe. Multiple CPUs can enqueue elements, but the
+ * same CPU is not allowed to be preempted and access the same
+ * queue. Due to how the tail is updated, this can result in a soft
+ * lock-up. (Same goes for alf_mc_dequeue).
+ */
+static inline int
+alf_mp_enqueue(const u32 n;
+	       struct alf_queue *q, void *ptr[n], const u32 n)
+{
+	u32 p_head, p_next, c_tail, space;
+
+	/* Reserve part of the array for enqueue STORE/WRITE */
+	do {
+		p_head = ACCESS_ONCE(q->producer.head);
+		c_tail = ACCESS_ONCE(q->consumer.tail);
+
+		space = q->size + c_tail - p_head;
+		if (n > space)
+			return 0;
+
+		p_next = p_head + n;
+	}
+	while (unlikely(cmpxchg(&q->producer.head, p_head, p_next) != p_head));
+
+	/* STORE the elems into the queue array */
+	__helper_alf_enqueue_store(p_head, q, ptr, n);
+	smp_wmb(); /* Write-Memory-Barrier matching dequeue LOADs */
+
+	/* Wait for other concurrent preceding enqueues not yet done,
+	 * this part make us none-wait-free and could be problematic
+	 * in case of congestion with many CPUs
+	 */
+	while (unlikely(ACCESS_ONCE(q->producer.tail) != p_head))
+		cpu_relax();
+	/* Mark this enq done and avail for consumption */
+	ACCESS_ONCE(q->producer.tail) = p_next;
+
+	return n;
+}
+
+/* Main Multi-Consumer DEQUEUE */
+static inline int
+alf_mc_dequeue(const u32 n;
+	       struct alf_queue *q, void *ptr[n], const u32 n)
+{
+	u32 c_head, c_next, p_tail, elems;
+
+	/* Reserve part of the array for dequeue LOAD/READ */
+	do {
+		c_head = ACCESS_ONCE(q->consumer.head);
+		p_tail = ACCESS_ONCE(q->producer.tail);
+
+		elems = p_tail - c_head;
+
+		if (elems == 0)
+			return 0;
+		else
+			elems = min(elems, n);
+
+		c_next = c_head + elems;
+	}
+	while (unlikely(cmpxchg(&q->consumer.head, c_head, c_next) != c_head));
+
+	/* LOAD the elems from the queue array.
+	 *   We don't need a smb_rmb() Read-Memory-Barrier here because
+	 *   the above cmpxchg is an implied full Memory-Barrier.
+	 */
+	__helper_alf_dequeue_load(c_head, q, ptr, elems);
+
+	/* Archs with weak Memory Ordering need a memory barrier here.
+	 * As the STORE to q->consumer.tail, must happen after the
+	 * dequeue LOADs. Dequeue LOADs have a dependent STORE into
+	 * ptr, thus a smp_wmb() is enough. Paired with enqueue
+	 * implicit full-MB in cmpxchg.
+	 */
+	smp_wmb();
+
+	/* Wait for other concurrent preceding dequeues not yet done */
+	while (unlikely(ACCESS_ONCE(q->consumer.tail) != c_head))
+		cpu_relax();
+	/* Mark this deq done and avail for producers */
+	ACCESS_ONCE(q->consumer.tail) = c_next;
+
+	return elems;
+}
+
+/* #define ASSERT_DEBUG_SPSC 1 */
+#ifndef ASSERT_DEBUG_SPSC
+#define ASSERT(x) do { } while (0)
+#else
+#define ASSERT(x)							\
+	do {								\
+		if (unlikely(!(x))) {					\
+			pr_crit("Assertion failed %s:%d: \"%s\"\n",	\
+				__FILE__, __LINE__, #x);		\
+			BUG();						\
+		}							\
+	} while (0)
+#endif
+
+/* Main SINGLE Producer ENQUEUE
+ *  caller MUST make sure preemption is disabled
+ */
+static inline int
+alf_sp_enqueue(const u32 n;
+	       struct alf_queue *q, void *ptr[n], const u32 n)
+{
+	u32 p_head, p_next, c_tail, space;
+
+	/* Reserve part of the array for enqueue STORE/WRITE */
+	p_head = q->producer.head;
+	smp_rmb(); /* for consumer.tail write, making sure deq loads are done */
+	c_tail = ACCESS_ONCE(q->consumer.tail);
+
+	space = q->size + c_tail - p_head;
+	if (n > space)
+		return 0;
+
+	p_next = p_head + n;
+	ASSERT(ACCESS_ONCE(q->producer.head) == p_head);
+	q->producer.head = p_next;
+
+	/* STORE the elems into the queue array */
+	__helper_alf_enqueue_store(p_head, q, ptr, n);
+	smp_wmb(); /* Write-Memory-Barrier matching dequeue LOADs */
+
+	/* Assert no other CPU (or same CPU via preemption) changed queue */
+	ASSERT(ACCESS_ONCE(q->producer.tail) == p_head);
+
+	/* Mark this enq done and avail for consumption */
+	ACCESS_ONCE(q->producer.tail) = p_next;
+
+	return n;
+}
+
+/* Main SINGLE Consumer DEQUEUE
+ *  caller MUST make sure preemption is disabled
+ */
+static inline int
+alf_sc_dequeue(const u32 n;
+	       struct alf_queue *q, void *ptr[n], const u32 n)
+{
+	u32 c_head, c_next, p_tail, elems;
+
+	/* Reserve part of the array for dequeue LOAD/READ */
+	c_head = q->consumer.head;
+	p_tail = ACCESS_ONCE(q->producer.tail);
+
+	elems = p_tail - c_head;
+
+	if (elems == 0)
+		return 0;
+	else
+		elems = min(elems, n);
+
+	c_next = c_head + elems;
+	ASSERT(ACCESS_ONCE(q->consumer.head) == c_head);
+	q->consumer.head = c_next;
+
+	smp_rmb(); /* Read-Memory-Barrier matching enq STOREs */
+	__helper_alf_dequeue_load(c_head, q, ptr, elems);
+
+	/* Archs with weak Memory Ordering need a memory barrier here.
+	 * As the STORE to q->consumer.tail, must happen after the
+	 * dequeue LOADs. Dequeue LOADs have a dependent STORE into
+	 * ptr, thus a smp_wmb() is enough.
+	 */
+	smp_wmb();
+
+	/* Assert no other CPU (or same CPU via preemption) changed queue */
+	ASSERT(ACCESS_ONCE(q->consumer.tail) == c_head);
+
+	/* Mark this deq done and avail for producers */
+	ACCESS_ONCE(q->consumer.tail) = c_next;
+
+	return elems;
+}
+
+static inline bool
+alf_queue_empty(struct alf_queue *q)
+{
+	u32 c_tail = ACCESS_ONCE(q->consumer.tail);
+	u32 p_tail = ACCESS_ONCE(q->producer.tail);
+
+	/* The empty (and initial state) is when consumer have reached
+	 * up with producer.
+	 *
+	 * DOUBLE-CHECK: Should we use producer.head, as this indicate
+	 * a producer is in-progress(?)
+	 */
+	return c_tail == p_tail;
+}
+
+static inline int
+alf_queue_count(struct alf_queue *q)
+{
+	u32 c_head = ACCESS_ONCE(q->consumer.head);
+	u32 p_tail = ACCESS_ONCE(q->producer.tail);
+	u32 elems;
+
+	/* Due to u32 arithmetic the values are implicitly
+	 * masked/modulo 32-bit, thus saving one mask operation
+	 */
+	elems = p_tail - c_head;
+	/* Thus, same as:
+	 *  elems = (p_tail - c_head) & q->mask;
+	 */
+	return elems;
+}
+
+static inline int
+alf_queue_avail_space(struct alf_queue *q)
+{
+	u32 p_head = ACCESS_ONCE(q->producer.head);
+	u32 c_tail = ACCESS_ONCE(q->consumer.tail);
+	u32 space;
+
+	/* The max avail space is q->size and
+	 * the empty state is when (consumer == producer)
+	 */
+
+	/* Due to u32 arithmetic the values are implicitly
+	 * masked/modulo 32-bit, thus saving one mask operation
+	 */
+	space = q->size + c_tail - p_head;
+	/* Thus, same as:
+	 *  space = (q->size + c_tail - p_head) & q->mask;
+	 */
+	return space;
+}
+
+#endif /* _LINUX_ALF_QUEUE_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 54cf309..3c0cd58 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -439,6 +439,19 @@ config NLATTR
 	bool
 
 #
+# ALF queue
+#
+config ALF_QUEUE
+	bool "ALF: Array-based Lock-Free (Producer-Consumer) queue"
+	default y
+	help
+	  This Array-based Lock-Free (ALF) queue, is a very fast
+	  bounded Producer-Consumer queue, supporting bulking.  The
+	  MPMC (Multi-Producer/Multi-Consumer) variant uses a locked
+	  cmpxchg, but the cost can be amorized by utilizing bulk
+	  enqueue/dequeue.
+
+#
 # Generic 64-bit atomic support is selected if needed
 #
 config GENERIC_ATOMIC64
diff --git a/lib/Makefile b/lib/Makefile
index 0211d2b..cd3a2d0 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -119,6 +119,8 @@ obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o
 
 obj-$(CONFIG_NLATTR) += nlattr.o
 
+obj-$(CONFIG_ALF_QUEUE) += alf_queue.o
+
 obj-$(CONFIG_LRU_CACHE) += lru_cache.o
 
 obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o
diff --git a/lib/alf_queue.c b/lib/alf_queue.c
new file mode 100644
index 0000000..d6c9b69
--- /dev/null
+++ b/lib/alf_queue.c
@@ -0,0 +1,47 @@
+/*
+ * lib/alf_queue.c
+ *
+ * ALF: Array-based Lock-Free queue
+ *  - Main implementation in: include/linux/alf_queue.h
+ *
+ * Copyright (C) 2014, Red Hat, Inc.,
+ *  by Jesper Dangaard Brouer and Hannes Frederic Sowa
+ *  for licensing details see kernel-base/COPYING
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/slab.h> /* kzalloc */
+#include <linux/alf_queue.h>
+#include <linux/log2.h>
+
+struct alf_queue *alf_queue_alloc(u32 size, gfp_t gfp)
+{
+	struct alf_queue *q;
+	size_t mem_size;
+
+	if (!(is_power_of_2(size)) || size > 65536)
+		return ERR_PTR(-EINVAL);
+
+	/* The ring array is allocated together with the queue struct */
+	mem_size = size * sizeof(void *) + sizeof(struct alf_queue);
+	q = kzalloc(mem_size, gfp);
+	if (!q)
+		return ERR_PTR(-ENOMEM);
+
+	q->size = size;
+	q->mask = size - 1;
+
+	return q;
+}
+EXPORT_SYMBOL_GPL(alf_queue_alloc);
+
+void alf_queue_free(struct alf_queue *q)
+{
+	kfree(q);
+}
+EXPORT_SYMBOL_GPL(alf_queue_free);
+
+MODULE_DESCRIPTION("ALF: Array-based Lock-Free queue");
+MODULE_AUTHOR("Jesper Dangaard Brouer <netoptimizer@brouer.com>");
+MODULE_LICENSE("GPL");

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related

* [RFC PATCH 0/3] Faster than SLAB caching of SKBs with qmempool (backed by alf_queue)
From: Jesper Dangaard Brouer @ 2014-12-10 14:15 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, netdev, linux-kernel, linux-mm,
	Christoph Lameter
  Cc: linux-api, Eric Dumazet, David S. Miller, Hannes Frederic Sowa,
	Alexander Duyck, Alexei Starovoitov, Paul E. McKenney,
	Mathieu Desnoyers, Steven Rostedt
In-Reply-To: <20141210033902.2114.68658.stgit@ahduyck-vm-fedora20>

The network stack have some use-cases that puts some extreme demands
on the memory allocator.  One use-case, 10Gbit/s wirespeed at smallest
packet size[1], requires handling a packet every 67.2 ns (nanosec).

Micro benchmarking[2] the SLUB allocator (with skb size 256bytes
elements), show "fast-path" instant reuse only costs 19 ns, but a
closer to network usage pattern show the cost rise to 45 ns.

This patchset introduce a quick mempool (qmempool), which when used
in-front of the SKB (sk_buff) kmem_cache, saves 12 ns on "fast-path"
drop in iptables "raw" table, but more importantly saves 40 ns with
IP-forwarding, which were hitting the slower SLUB use-case.


One of the building blocks for achieving this speedup is a cmpxchg
based Lock-Free queue that supports bulking, named alf_queue for
Array-based Lock-Free queue.  By bulking elements (pointers) from the
queue, the cost of the cmpxchg (approx 8 ns) is amortized over several
elements.

 Patch1: alf_queue (Lock-Free queue)

 Patch2: qmempool using alf_queue

 Patch3: usage of qmempool for SKB caching


Notice, this patchset depend on introduction of napi_alloc_skb(),
which is part of Alexander Duyck's work patchset [3].

Different correctness tests and micro benchmarks are avail via my
github repo "prototype-kernel"[4], where the alf_queue and qmempool is
also kept in sync with this patchset.

Links:
 [1]: http://netoptimizer.blogspot.dk/2014/05/the-calculations-10gbits-wirespeed.html
 [2]: https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/mm/qmempool_bench.c
 [3]: http://thread.gmane.org/gmane.linux.network/342347
 [4]: https://github.com/netoptimizer/prototype-kernel

---

Jesper Dangaard Brouer (3):
      net: use qmempool in-front of sk_buff kmem_cache
      mm: qmempool - quick queue based memory pool
      lib: adding an Array-based Lock-Free (ALF) queue


 include/linux/alf_queue.h |  303 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/qmempool.h  |  205 +++++++++++++++++++++++++++++
 include/linux/skbuff.h    |    4 -
 lib/Kconfig               |   13 ++
 lib/Makefile              |    2 
 lib/alf_queue.c           |   47 +++++++
 mm/Kconfig                |   12 ++
 mm/Makefile               |    1 
 mm/qmempool.c             |  322 +++++++++++++++++++++++++++++++++++++++++++++
 net/core/dev.c            |    5 +
 net/core/skbuff.c         |   43 +++++-
 11 files changed, 950 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/alf_queue.h
 create mode 100644 include/linux/qmempool.h
 create mode 100644 lib/alf_queue.c
 create mode 100644 mm/qmempool.c

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply

* Re: xen-netback: make feature-rx-notify mandatory -- Breaks stubdoms
From: David Vrabel @ 2014-12-10 14:12 UTC (permalink / raw)
  To: John; +Cc: Xen-devel@lists.xen.org, Ian Campbell, Wei Liu,
	netdev@vger.kernel.org
In-Reply-To: <54884DA8.7030003@nuclearfallout.net>

On 10/12/14 13:42, John wrote:
> David,
> 
> This patch you put into 3.18.0 appears to break the latest version of
> stubdomains. I found this out today when I tried to update a machine to
> 3.18.0 and all of the domUs crashed on start with the dmesg output like
> this:

Cc'ing the lists and relevant netback maintainers.

I guess the stubdoms are using minios's netfront?  This is something I
forgot about when deciding if it was ok to make this feature mandatory.

The patch cannot be reverted as it's a prerequisite for a critical
(security) bug fix.  I am also unconvinced that the no-feature-rx-notify
support worked correctly anyway.

This can be resolved by:

- Fixing minios's netfront to support feature-rx-notify. This should be
easy but wouldn't help existing Xen deployments.

Or:

- Reimplement feature-rx-notify support.  I think the easiest way is to
queue packets on the guest Rx internal queue with a short expiry time.

> [   83.045785] device vif2.0 entered promiscuous mode
> [   83.059220] vif vif-2-0: 22 feature-rx-notify is mandatory
> [   83.060763] vif vif-2-0: 1 mapping in shared page 2047 from domain 2
> [   83.060861] vif vif-2-0: 1 mapping shared-frames 2047/2046 port tx 4
> rx 4
> 
> This is on the very latest patched version of 4.4.

David

^ permalink raw reply

* Re: [PATCH nf-next 0/2] netfilter: conntrack: route cache for forwarded connections
From: Pablo Neira Ayuso @ 2014-12-10 14:13 UTC (permalink / raw)
  To: Florian Westphal; +Cc: netfilter-devel, netdev, brouer, Eric Dumazet
In-Reply-To: <1418052964-4632-1-git-send-email-fw@strlen.de>

On Mon, Dec 08, 2014 at 04:36:02PM +0100, Florian Westphal wrote:
> [ Pablo, in case you deem this too late for -next just let me know
> and I will resend once its open again ]
> 
> This adds an optional forward routing cache extension for netfilter
> connection tracking.
> 
> The memory cost is an additional 32 bytes per conntrack entry
> on x86_64.
> 
> Unlike any other currently implemented connection tracking
> extension the rtcache has no run-time tunables, it is always active.
> 
> Also, unlike other conntrack extensions, it can be built as a module,
> in this case modprobe/rmmod are used to enable/disable the cache.

I expect distributors will provide this a module. I think we should
provide features that can be enable/disable in some way, in this case
it can be modprobe/rmmod.

BTW, did you evaluate Eric's alternative? Any comment on that?

Florian Westphal <fw@strlen.de> wrote:
>> +     if (likely(dst))
>> +             skb_dst_set_noref_force(skb, dst);
>
> Note that Hannes submitted a patch vs. net-next that removes
> skb_dst_set_noref_force().

I refreshed the nf-next tree, this patch is now there.

If the merge window remains open, I'll take the pending patches in
patchwork and send a new batch for David by tomorrow morning.

Let me know, thanks.

^ permalink raw reply

* Re: [PATCH net v5 2/7] cxgb4i: fix credit check for tx_data_wr
From: Sergei Shtylyov @ 2014-12-10 14:09 UTC (permalink / raw)
  To: Karen Xie, linux-scsi, netdev
  Cc: hariprasad, anish, hch, James.Bottomley, michaelc, davem
In-Reply-To: <201412100143.sBA1hSF4024918@localhost6.localdomain6>

Hello.

On 12/10/2014 4:43 AM, Karen Xie wrote:

> [PATCH net v5 2/7] cxgb4i: fix credit check for tx_data_wr

> From: Karen Xie <kxie@chelsio.com>

> make sure any tx credit related checking is done before adding the wr header.

> Signed-off-by: Karen Xie <kxie@chelsio.com>
> ---
>   drivers/scsi/cxgbi/cxgb4i/cxgb4i.c |    9 +++++----
>   1 files changed, 5 insertions(+), 4 deletions(-)

> diff --git a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
> index f119a67..56dbd25 100644
> --- a/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
> +++ b/drivers/scsi/cxgbi/cxgb4i/cxgb4i.c
> @@ -547,15 +547,16 @@ static inline void make_tx_data_wr(struct cxgbi_sock *csk, struct sk_buff *skb,
[...]
>   		req->op_to_immdlen = htonl(FW_WR_OP(FW_OFLD_TX_DATA_WR) |
> -					FW_WR_COMPL(1) |
> -					FW_WR_IMMDLEN(dlen));
> +					   FW_WR_COMPL(1) |
> +					   FW_WR_IMMDLEN(dlen));
>   		req->flowid_len16 = htonl(FW_WR_FLOWID(csk->tid) |
> -						FW_WR_LEN16(credits));
> +					  FW_WR_LEN16(credits));

    The above looks like unrelated cleanup, worth putting in a separate 
net-next patch...

WBR, Sergei


^ permalink raw reply

* Re: [PATCH 1/1] net: dsa: Fix of kernel panic in case of missing PHY.
From: Andrey Volkov @ 2014-12-10 10:06 UTC (permalink / raw)
  To: Florian Fainelli; +Cc: netdev, Brian Norris
In-Reply-To: <5487BCBA.2060403@gmail.com>


Le 10/12/2014 04:23, Florian Fainelli wrote :
> On 09/12/14 09:31, Andrey Volkov wrote:
>> Fix of kernel panic in case of missing PHY.
>>
>> Signed-off-by: Andrey Volkov <andrey.volkov@nexvision.fr>
> 
> Brian has actually been able to reproduce such a crash in this code-path
> today:
> 
> if (!p->phy) {
>                 p->phy = ds->slave_mii_bus->phy_map[p->port];
>                 phy_connect_direct(slave_dev, p->phy,
> 				   dsa_slave_adjust_link,
>                                    p->phy_interface);
> }
> 
> we basically assume here that we have a valid phy pointer out of
> ds->slave_mii_bus->phy_map[p->port] which is not true in all cases,
> especially not if the device is not there.
Yes it's  what really happened in my case: some PHYs were not soldered 
(development version of the board). But in real life, they may be missed for various
reasons, so that the assumption was wrong.

> 
> I will come up with a fix for that, as for propagating the error code
> down to the caller, this can be a separate patch.
Ok, I'll wait for it

> 
> Thanks!
²You are welcome!

> 
>> ---
>>  net/dsa/slave.c |   19 +++++++++++++++----
>>  1 file changed, 15 insertions(+), 4 deletions(-)
>>
>> diff --git a/net/dsa/slave.c b/net/dsa/slave.c
>> index 528380a..6f89caa 100644
>> --- a/net/dsa/slave.c
>> +++ b/net/dsa/slave.c
>> @@ -512,7 +512,7 @@ static int dsa_slave_fixed_link_update(struct net_device *dev,
>>  }
>>  
>>  /* slave device setup *******************************************************/
>> -static void dsa_slave_phy_setup(struct dsa_slave_priv *p,
>> +static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
>>  				struct net_device *slave_dev)
>>  {
>>  	struct dsa_switch *ds = p->parent;
>> @@ -533,7 +533,7 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p,
>>  		ret = of_phy_register_fixed_link(port_dn);
>>  		if (ret) {
>>  			netdev_err(slave_dev, "failed to register fixed PHY\n");
>> -			return;
>> +			return ret;
>>  		}
>>  		phy_is_fixed = true;
>>  		phy_dn = port_dn;
>> @@ -555,12 +555,17 @@ static void dsa_slave_phy_setup(struct dsa_slave_priv *p,
>>  	 */
>>  	if (!p->phy) {
>>  		p->phy = ds->slave_mii_bus->phy_map[p->port];
>> -		phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link,
>> +		if(p->phy)
>> +			phy_connect_direct(slave_dev, p->phy, dsa_slave_adjust_link,
>>  				   p->phy_interface);
>> +		else
>> +			return -ENODEV;
>> +
>>  	} else {
>>  		netdev_info(slave_dev, "attached PHY at address %d [%s]\n",
>>  			    p->phy->addr, p->phy->drv->name);
>>  	}
>> +	return 0;
>>  }
>>  
>>  int dsa_slave_suspend(struct net_device *slave_dev)
>> @@ -653,7 +658,13 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent,
>>  	p->old_link = -1;
>>  	p->old_duplex = -1;
>>  
>> -	dsa_slave_phy_setup(p, slave_dev);
>> +	ret = dsa_slave_phy_setup(p, slave_dev);
>> +	if (ret) {
>> +		netdev_err(master, "error %d registering interface %s\n",
>> +			   ret, slave_dev->name);
>> +		free_netdev(slave_dev);
>> +		return NULL;
>> +	}
>>  
>>  	ret = register_netdev(slave_dev);
>>  	if (ret) {
>>
> 
> 

^ permalink raw reply

* [PATCH net-next v2] r8169:update rtl8168g pcie ephy parameter
From: Chunhao Lin @ 2014-12-10 13:28 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, Chunhao Lin

Add ephy parameter to rtl8168g.
Also change the common function of rtl8168g from "rtl_hw_start_8168g_1" to
 "rtl_hw_start_8168g". And function "rtl_hw_start_8168g_1" is used for
setting rtl8168g hardware parameters.

Following is the explanation of what hardware parameter change for.
rtl8168g may erroneous judge the PCIe signal quality and show the error bit
on PCI configuration space when in PCIe low power mode.
The following ephy parameters are for above issue.
{ 0x00, 0x0000,	0x0008 }
{ 0x0c, 0x37d0,	0x0820 }
{ 0x1e, 0x0000,	0x0001 }

rtl8168g may return to PCIe L0 from PCIe L0s low power mode too slow.
The following ephy parameter is for above issue.
{ 0x19, 0x8000,	0x0000 }

Signed-off-by: Chunhao Lin <hau@realtek.com>
---
 drivers/net/ethernet/realtek/r8169.c | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
index b9c2f33..b77efcb 100644
--- a/drivers/net/ethernet/realtek/r8169.c
+++ b/drivers/net/ethernet/realtek/r8169.c
@@ -5919,7 +5919,7 @@ static void rtl_hw_start_8411(struct rtl8169_private *tp)
 	rtl_w0w1_eri(tp, 0x0d4, ERIAR_MASK_0011, 0x0c00, 0x0000, ERIAR_EXGMAC);
 }
 
-static void rtl_hw_start_8168g_1(struct rtl8169_private *tp)
+static void rtl_hw_start_8168g(struct rtl8169_private *tp)
 {
 	void __iomem *ioaddr = tp->mmio_addr;
 	struct pci_dev *pdev = tp->pci_dev;
@@ -5954,6 +5954,24 @@ static void rtl_hw_start_8168g_1(struct rtl8169_private *tp)
 	rtl_pcie_state_l2l3_enable(tp, false);
 }
 
+static void rtl_hw_start_8168g_1(struct rtl8169_private *tp)
+{
+	void __iomem *ioaddr = tp->mmio_addr;
+	static const struct ephy_info e_info_8168g_1[] = {
+		{ 0x00, 0x0000,	0x0008 },
+		{ 0x0c, 0x37d0,	0x0820 },
+		{ 0x1e, 0x0000,	0x0001 },
+		{ 0x19, 0x8000,	0x0000 }
+	};
+
+	rtl_hw_start_8168g(tp);
+
+	/* disable aspm and clock request before access ephy */
+	RTL_W8(Config2, RTL_R8(Config2) & ~ClkReqEn);
+	RTL_W8(Config5, RTL_R8(Config5) & ~ASPM_en);
+	rtl_ephy_init(tp, e_info_8168g_1, ARRAY_SIZE(e_info_8168g_1));
+}
+
 static void rtl_hw_start_8168g_2(struct rtl8169_private *tp)
 {
 	void __iomem *ioaddr = tp->mmio_addr;
@@ -5964,7 +5982,7 @@ static void rtl_hw_start_8168g_2(struct rtl8169_private *tp)
 		{ 0x1e, 0xffff,	0x20eb }
 	};
 
-	rtl_hw_start_8168g_1(tp);
+	rtl_hw_start_8168g(tp);
 
 	/* disable aspm and clock request before access ephy */
 	RTL_W8(Config2, RTL_R8(Config2) & ~ClkReqEn);
@@ -5983,7 +6001,7 @@ static void rtl_hw_start_8411_2(struct rtl8169_private *tp)
 		{ 0x1e, 0x0000,	0x2000 }
 	};
 
-	rtl_hw_start_8168g_1(tp);
+	rtl_hw_start_8168g(tp);
 
 	/* disable aspm and clock request before access ephy */
 	RTL_W8(Config2, RTL_R8(Config2) & ~ClkReqEn);
-- 
1.9.1

^ permalink raw reply related

* [PATCH 3/5] isdn/gigaset: elliminate unnecessary argument from send_cb()
From: Tilman Schmidt @ 2014-12-10 12:41 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hansjoerg Lipp, Karsten Keil, isdn4linux
In-Reply-To: <cover.1418214588.git.tilman@imap.cc>

No need to pass a member of the cardstate structure as a separate
argument if the entire structure is already passed.

Signed-off-by: Tilman Schmidt <tilman@imap.cc>
---
 drivers/isdn/gigaset/usb-gigaset.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/drivers/isdn/gigaset/usb-gigaset.c b/drivers/isdn/gigaset/usb-gigaset.c
index c31d746..5f306e2 100644
--- a/drivers/isdn/gigaset/usb-gigaset.c
+++ b/drivers/isdn/gigaset/usb-gigaset.c
@@ -293,7 +293,7 @@ static int gigaset_close_bchannel(struct bc_state *bcs)
 }
 
 static int write_modem(struct cardstate *cs);
-static int send_cb(struct cardstate *cs, struct cmdbuf_t *cb);
+static int send_cb(struct cardstate *cs);
 
 
 /* Write tasklet handler: Continue sending current skb, or send command, or
@@ -303,7 +303,6 @@ static void gigaset_modem_fill(unsigned long data)
 {
 	struct cardstate *cs = (struct cardstate *) data;
 	struct bc_state *bcs = &cs->bcs[0]; /* only one channel */
-	struct cmdbuf_t *cb;
 
 	gig_dbg(DEBUG_OUTPUT, "modem_fill");
 
@@ -314,10 +313,9 @@ static void gigaset_modem_fill(unsigned long data)
 
 again:
 	if (!bcs->tx_skb) {	/* no skb is being sent */
-		cb = cs->cmdbuf;
-		if (cb) {	/* commands to send? */
+		if (cs->cmdbuf) {	/* commands to send? */
 			gig_dbg(DEBUG_OUTPUT, "modem_fill: cb");
-			if (send_cb(cs, cb) < 0) {
+			if (send_cb(cs) < 0) {
 				gig_dbg(DEBUG_OUTPUT,
 					"modem_fill: send_cb failed");
 				goto again; /* no callback will be called! */
@@ -425,9 +423,9 @@ static void gigaset_write_bulk_callback(struct urb *urb)
 	spin_unlock_irqrestore(&cs->lock, flags);
 }
 
-static int send_cb(struct cardstate *cs, struct cmdbuf_t *cb)
+static int send_cb(struct cardstate *cs)
 {
-	struct cmdbuf_t *tcb;
+	struct cmdbuf_t *cb = cs->cmdbuf;
 	unsigned long flags;
 	int count;
 	int status = -ENOENT;
@@ -435,26 +433,27 @@ static int send_cb(struct cardstate *cs, struct cmdbuf_t *cb)
 
 	do {
 		if (!cb->len) {
-			tcb = cb;
-
 			spin_lock_irqsave(&cs->cmdlock, flags);
 			cs->cmdbytes -= cs->curlen;
 			gig_dbg(DEBUG_OUTPUT, "send_cb: sent %u bytes, %u left",
 				cs->curlen, cs->cmdbytes);
-			cs->cmdbuf = cb = cb->next;
-			if (cb) {
-				cb->prev = NULL;
-				cs->curlen = cb->len;
+			cs->cmdbuf = cb->next;
+			if (cs->cmdbuf) {
+				cs->cmdbuf->prev = NULL;
+				cs->curlen = cs->cmdbuf->len;
 			} else {
 				cs->lastcmdbuf = NULL;
 				cs->curlen = 0;
 			}
 			spin_unlock_irqrestore(&cs->cmdlock, flags);
 
-			if (tcb->wake_tasklet)
-				tasklet_schedule(tcb->wake_tasklet);
-			kfree(tcb);
+			if (cb->wake_tasklet)
+				tasklet_schedule(cb->wake_tasklet);
+			kfree(cb);
+
+			cb = cs->cmdbuf;
 		}
+
 		if (cb) {
 			count = min(cb->len, ucs->bulk_out_size);
 			gig_dbg(DEBUG_OUTPUT, "send_cb: send %d bytes", count);
-- 
1.9.2.459.g68773ac

^ permalink raw reply related

* [PATCH 2/5] isdn/gigaset: clarify gigaset_modem_fill control structure
From: Tilman Schmidt @ 2014-12-10 12:41 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hansjoerg Lipp, Karsten Keil, isdn4linux
In-Reply-To: <cover.1418214588.git.tilman@imap.cc>

Replace the flag-controlled retry loop by explicit goto statements
in the error branches to make the control structure easier to
understand.

Signed-off-by: Tilman Schmidt <tilman@imap.cc>
---
 drivers/isdn/gigaset/usb-gigaset.c | 52 ++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/drivers/isdn/gigaset/usb-gigaset.c b/drivers/isdn/gigaset/usb-gigaset.c
index a8e652d..c31d746 100644
--- a/drivers/isdn/gigaset/usb-gigaset.c
+++ b/drivers/isdn/gigaset/usb-gigaset.c
@@ -304,7 +304,6 @@ static void gigaset_modem_fill(unsigned long data)
 	struct cardstate *cs = (struct cardstate *) data;
 	struct bc_state *bcs = &cs->bcs[0]; /* only one channel */
 	struct cmdbuf_t *cb;
-	int again;
 
 	gig_dbg(DEBUG_OUTPUT, "modem_fill");
 
@@ -313,36 +312,33 @@ static void gigaset_modem_fill(unsigned long data)
 		return;
 	}
 
-	do {
-		again = 0;
-		if (!bcs->tx_skb) { /* no skb is being sent */
-			cb = cs->cmdbuf;
-			if (cb) { /* commands to send? */
-				gig_dbg(DEBUG_OUTPUT, "modem_fill: cb");
-				if (send_cb(cs, cb) < 0) {
-					gig_dbg(DEBUG_OUTPUT,
-						"modem_fill: send_cb failed");
-					again = 1; /* no callback will be
-						      called! */
-				}
-			} else { /* skbs to send? */
-				bcs->tx_skb = skb_dequeue(&bcs->squeue);
-				if (bcs->tx_skb)
-					gig_dbg(DEBUG_INTR,
-						"Dequeued skb (Adr: %lx)!",
-						(unsigned long) bcs->tx_skb);
-			}
-		}
-
-		if (bcs->tx_skb) {
-			gig_dbg(DEBUG_OUTPUT, "modem_fill: tx_skb");
-			if (write_modem(cs) < 0) {
+again:
+	if (!bcs->tx_skb) {	/* no skb is being sent */
+		cb = cs->cmdbuf;
+		if (cb) {	/* commands to send? */
+			gig_dbg(DEBUG_OUTPUT, "modem_fill: cb");
+			if (send_cb(cs, cb) < 0) {
 				gig_dbg(DEBUG_OUTPUT,
-					"modem_fill: write_modem failed");
-				again = 1; /* no callback will be called! */
+					"modem_fill: send_cb failed");
+				goto again; /* no callback will be called! */
 			}
+			return;
 		}
-	} while (again);
+
+		/* skbs to send? */
+		bcs->tx_skb = skb_dequeue(&bcs->squeue);
+		if (!bcs->tx_skb)
+			return;
+
+		gig_dbg(DEBUG_INTR, "Dequeued skb (Adr: %lx)!",
+			(unsigned long) bcs->tx_skb);
+	}
+
+	gig_dbg(DEBUG_OUTPUT, "modem_fill: tx_skb");
+	if (write_modem(cs) < 0) {
+		gig_dbg(DEBUG_OUTPUT, "modem_fill: write_modem failed");
+		goto again;	/* no callback will be called! */
+	}
 }
 
 /*
-- 
1.9.2.459.g68773ac

^ permalink raw reply related

* [PATCH 4/5] isdn/gigaset: enable Kernel CAPI support by default
From: Tilman Schmidt @ 2014-12-10 12:41 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hansjoerg Lipp, Karsten Keil, isdn4linux
In-Reply-To: <cover.1418214588.git.tilman@imap.cc>

Kernel CAPI has been the recommended ISDN subsystem for the Gigaset
driver since kernel release 2.6.34.2. It provides full backwards
compatibility to the old I4L subsystem thanks to the capidrv module.
I4L has been marked as deprecated for more than seven years.

Signed-off-by: Tilman Schmidt <tilman@imap.cc>
---
 drivers/isdn/gigaset/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/isdn/gigaset/Kconfig b/drivers/isdn/gigaset/Kconfig
index dde5e09..83f62b8 100644
--- a/drivers/isdn/gigaset/Kconfig
+++ b/drivers/isdn/gigaset/Kconfig
@@ -20,7 +20,7 @@ if ISDN_DRV_GIGASET
 config GIGASET_CAPI
 	bool "Gigaset CAPI support"
 	depends on ISDN_CAPI='y'||(ISDN_CAPI='m'&&ISDN_DRV_GIGASET='m')
-	default ISDN_I4L='n'
+	default 'y'
 	help
 	  Build the Gigaset driver as a CAPI 2.0 driver interfacing with
 	  the Kernel CAPI subsystem. To use it with the old ISDN4Linux
-- 
1.9.2.459.g68773ac

^ permalink raw reply related

* [PATCH 1/5] isdn/gigaset: drop duplicate declaration
From: Tilman Schmidt @ 2014-12-10 12:41 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hansjoerg Lipp, Karsten Keil, isdn4linux
In-Reply-To: <cover.1418214588.git.tilman@imap.cc>

Function gigaset_skb_sent was declared twice, identically, in gigaset.h.

Signed-off-by: Tilman Schmidt <tilman@imap.cc>
---
 drivers/isdn/gigaset/gigaset.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/isdn/gigaset/gigaset.h b/drivers/isdn/gigaset/gigaset.h
index eb63a0f..166537e 100644
--- a/drivers/isdn/gigaset/gigaset.h
+++ b/drivers/isdn/gigaset/gigaset.h
@@ -751,9 +751,6 @@ void gigaset_stop(struct cardstate *cs);
 /* Tell common.c that the driver is being unloaded. */
 int gigaset_shutdown(struct cardstate *cs);
 
-/* Tell common.c that an skb has been sent. */
-void gigaset_skb_sent(struct bc_state *bcs, struct sk_buff *skb);
-
 /* Append event to the queue.
  * Returns NULL on failure or a pointer to the event on success.
  * ptr must be kmalloc()ed (and not be freed by the caller).
-- 
1.9.2.459.g68773ac

^ permalink raw reply related

* [PATCH 5/5] isdn/capi: correct argument types of command_2_index
From: Tilman Schmidt @ 2014-12-10 12:41 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hansjoerg Lipp, Karsten Keil, isdn4linux
In-Reply-To: <cover.1418214588.git.tilman@imap.cc>

Utility function command_2_index is always called with arguments of
type u8. Adapt its declaration accordingly.

Signed-off-by: Tilman Schmidt <tilman@imap.cc>
---
 drivers/isdn/capi/capiutil.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/isdn/capi/capiutil.c b/drivers/isdn/capi/capiutil.c
index 36c1b37..9846d82 100644
--- a/drivers/isdn/capi/capiutil.c
+++ b/drivers/isdn/capi/capiutil.c
@@ -201,7 +201,7 @@ static unsigned char *cpars[] =
 #define structTRcpyovl(x, y, l) memmove(y, x, l)
 
 /*-------------------------------------------------------*/
-static unsigned command_2_index(unsigned c, unsigned sc)
+static unsigned command_2_index(u8 c, u8 sc)
 {
 	if (c & 0x80)
 		c = 0x9 + (c & 0x0f);
-- 
1.9.2.459.g68773ac

^ permalink raw reply related

* [PATCH 0/5] ISDN patches for net-next
From: Tilman Schmidt @ 2014-12-10 12:41 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hansjoerg Lipp, Karsten Keil, isdn4linux

Here's a series of patches for the Gigaset ISDN driver and one for
the ISDN CAPI subsystem.  Please merge as appropriate.

Thanks,
Tilman

Tilman Schmidt (5):
  isdn/gigaset: drop duplicate declaration
  isdn/gigaset: clarify gigaset_modem_fill control structure
  isdn/gigaset: elliminate unnecessary argument from send_cb()
  isdn/gigaset: enable Kernel CAPI support by default
  isdn/capi: correct argument types of command_2_index

 drivers/isdn/capi/capiutil.c       |  2 +-
 drivers/isdn/gigaset/Kconfig       |  2 +-
 drivers/isdn/gigaset/gigaset.h     |  3 --
 drivers/isdn/gigaset/usb-gigaset.c | 77 ++++++++++++++++++--------------------
 4 files changed, 38 insertions(+), 46 deletions(-)

-- 
1.9.2.459.g68773ac

^ permalink raw reply

* Re: PROBLEM: bonding status file in /proc not removed when using bond-device as a slave
From: Peter Schmitt @ 2014-12-10 11:49 UTC (permalink / raw)
  To: Andy Gospodarek; +Cc: netdev
In-Reply-To: <20141209135812.GI797@gospo.home.greyhouse.net>

Hi Andy and everyone else!

> > Hi everyone,
> > 
> > I want to create a master-backup bond that has two LACP bonds as slaves. Both
> 
> There is no reason to do this.  The bonding code you are running
> supports the ability to connect a single 802.3ad bond to different
> switches and should do exactly what you need.
> 
> When you put all ports in the same bond you will notice that ports that
> go to one switch will be listed with a particular aggregator ID and all
> ports going to another switch will have a different aggregator ID.  You
> will also see that the bond will list only one active aggregator.  This
> should give you the behaviour you desire.
> 
> Additionally look at 'ad_select' option in the bonding documentation to
> help tune when you switch from one link to another.

Thank you very much for your answer. I did not know about this behaviour. 
The ad_select feature is nice, but you can't explicitly control which switch
(which aggregator id) should be used. So I need some links between those two
switches to handle situations where one machine uses the first switch and the
other uses the second. The Active-Backup mode let's me configure exactly this:
Which bond should be active. With uplinks between the switches, the LACP feature
is indeed very handy.

I wonder what the purpose of the Active-Backup mode is then? Is this the low-budget
solution to the failover problem? Is the setup with Active-Backup and LACP bonds
as slaves supported? This seems to work quite well in my testsetup.

However, the described problem remains. One can create a setup where such a file is left
in the proc-fs and a cat on this file crashes the machine.

Thank you very much for your help.

Best regards,
Peter

> > LACP slaves should be connected to different switches so that I have
> > connectivity even if one switch fails.
> > While experimenting with this setup on the recent LTS kernel 3.14.26 I have found the following behaviour:
> > When I add a bond device (by default an LACP bond, mode 4) to a master-backup
> > bond (mode 1) and then remove it, the corresponding status file for the bond remains in
> > /proc/net/bonding/ and when I do a cat on this file, the machine crashes or I get a
> > general protection fault.
> > 
> > The following snippet creates such a scenario:
> > 
> > #!/bin/bash
> > echo +bond1 > /sys/class/net/bonding_masters
> > echo 1 > /sys/class/net/bond1/bonding/mode
> > echo +bond2 > /sys/class/net/bonding_masters
> > echo +bond2 > /sys/class/net/bond1/bonding/slaves
> > echo -bond2 > /sys/class/net/bond1/bonding/slaves
> > echo -bond2 > /sys/class/net/bonding_masters
> > 
> > After this is executed, the file /proc/net/bonding/bond2 still exists
> > while /sys/class/net/bonding_masters only shows bond1:
> > 
> > > ls -lah /proc/net/bonding/bond*
> > r--r--r-- 1 root root 0 Dec  8 16:53 /proc/net/bonding/bond1
> > r--r--r-- 1 root root 0 Dec  8 16:53 /proc/net/bonding/bond2
> > 
> > > cat /sys/class/net/bonding_masters
> > bond1
> > 
> > When I now make a "cat" on the file in /proc, I get a general protection fault
> > or even worse, the machine just crashes and is unresponsive and it can only be
> > fixed with a power-cycle.
> > 
> > > uname -a
> > Linux bondingtest 3.14.26-x86 #1 SMP Sun Dec 7 11:29:36 CET 2014 i686 GNU/Linux
> > 
> > The bonding module is loaded with the following options:
> > modprobe bonding miimon=100 max_bonds=0 mode=4 lacp_rate=1 xmit_hash_policy=layer2+3
> > 
> > 
> > > cat /proc/net/bonding/bond2
> > general protection fault: 0000 [#1] SMP·
> > Modules linked in: w83627hf hwmon_vid coretemp hwmon ip_set iptable_nat nf_nat_ipv4 ipt_REJECT nf_nat_irc nf_conntrack_irc nf_nat_ftp nf_nat nf_conntrack_ftp msr ipmi_devintf ipmi_msghandler ip_gre gre bonding pcspkr i3200_edac edac_core uhci_hcd ehci_pci ehci_hcd lpc_ich mfd_core pata_acpi ata_generic shpchp e1000e ptp pps_core [last unloaded: cpuid]
> > CPU: 0 PID: 31747 Comm: cat Not tainted 3.14.26-x86_64 #1
> > Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS 080015  06/29/2009
> > task: ffff8800d61d77b0 ti: ffff8800d6068000 task.ti: ffff8800d6068000
> > RIP: 0010:[<ffffffffc00f7ef0>]  [<ffffffffc00f7ef0>] bond_info_seq_show+0x2d0/0x5e0 [bonding]
> > RSP: 0018:ffff8800d6069e08  EFLAGS: 00010212
> > RAX: 5f7367705f656c62 RBX: ffff880198570380 RCX: 0000000000000001
> > RDX: ffffffffc00f9547 RSI: ffffffffc00f954f RDI: ffff880198570380
> > RBP: ffff8800d6069e48 R08: ffffffff9413ed60 R09: ffff8800dba15cd4
> > R10: 0000000000000001 R11: 0000000000000000 R12: ffff8800d60917c0
> > R13: 656b636f6c6e756d R14: ffff880197d469c0 R15: ffff8800d6069e90
> > FS:  0000000000000000(0000) GS:ffff88019fc00000(0063) knlGS:00000000f761c8d0
> > CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
> > CR2: 000000000804ce10 CR3: 00000000db8cd000 CR4: 00000000000407f0
> > Stack:
> > ffff8800d6069e48 ffffffffc00f8318 ffff8801986ba9f0 ffff880197d469c0
> > ffff880198570380 0000000000000001 ffff880197d469c0 ffff8800d6069e90
> > ffff8800d6069ec8 ffffffff9413eed1 ffff880198289a58 0000000008213000
> > Call Trace:
> > [<ffffffffc00f8318>] ? bond_info_seq_start+0x28/0xa8 [bonding]
> > [<ffffffff9413eed1>] seq_read+0x171/0x3f0
> > [<ffffffff94175a7e>] proc_reg_read+0x3e/0x70
> > [<ffffffff9411e0e1>] vfs_read+0xa1/0x160
> > [<ffffffff9411e281>] SyS_read+0x51/0xc0
> > [<ffffffff94039c9c>] ? do_page_fault+0xc/0x10
> > [<ffffffff94516cdf>] sysenter_dispatch+0x7/0x1e
> > Code: 04 49 8b 55 00 48 c7 c6 2a 95 0f c0 48 89 df 31 c0 e8 d5 6c 04 d4 49 8b 04 24 48 c7 c2 47 95 0f c0 48 c7 c6 4f 95 0f c0 48 89 df <48> 8b 40 48 a8 04 48 c7 c0 4c 95 0f c0 48 0f 44 d0 31 c0 e8 a8·
> > RIP  [<ffffffffc00f7ef0>] bond_info_seq_show+0x2d0/0x5e0 [bonding]
> > RSP <ffff8800d6069e08>
> > ---[ end trace 96fae3d9de6068c7 ]---
> > Segmentation fault
> > 
> > ver_linux:
> > Linux bondingtest 3.14.26-x86 #1 SMP Sun Dec 7 11:29:36 CET 2014 i686 GNU/Linux
> > 
> > Gnu C                  4.4.3
> > Gnu make              3.81
> > binutils              2.20.1
> > util-linux            2.17.2
> > mount                  support
> > module-init-tools      found
> > e2fsprogs              1.42.11
> > PPP                    2.4.5
> > Linux C Library        2.11.1
> > Dynamic linker (ldd)  2.11.1
> > Procps                3.2.8
> > Net-tools              1.60
> > Kbd                    1.15
> > Sh-utils              found
> > Modules Loaded        xt_TPROXY xt_set xt_socket nf_defrag_ipv6 xt_REDIRECT ip_set_hash_ip hwmon_vid hwmon bridge ip_set iptable_nat nf_nat_ipv4 ipt_REJECT nf_nat_irc nf_conntrack_irc nf_nat_ftp nf_nat nf_conntrack_ftp msr ipmi_devintf ipmi_msghandler ip_gre gre bonding pcspkr shpchp uhci_hcd ehci_pci ehci_hcd lpc_ich mfd_core pata_acpi ata_generic
> > 
> > If you have any questions or need more information or tests, I will gladly help you with that.
> > 
> > Thank you in advance.
> > 
> > Best regards,
> > Peter Schmitt
> > --
> > To unsubscribe from this list: send the line "unsubscribe netdev" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

^ permalink raw reply

* Re: [PATCH] bridge: Remove BR_PROXYARP flooding check code
From: Jouni Malinen @ 2014-12-10 11:39 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev, Kyeyoon Park
In-Reply-To: <20141209142158.7e513dbf@urahara>

On Tue, Dec 09, 2014 at 02:21:58PM -0800, Stephen Hemminger wrote:
> On Mon,  8 Dec 2014 17:27:40 +0200
> Jouni Malinen <jouni@codeaurora.org> wrote:
> > diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
> > @@ -185,10 +185,6 @@ static void br_flood(struct net_bridge *br, struct sk_buff *skb,
> >  		if (unicast && !(p->flags & BR_FLOOD))
> >  			continue;
> >  
> > -		/* Do not flood to ports that enable proxy ARP */
> > -		if (p->flags & BR_PROXYARP)
> > -			continue;
> > -
> >  		prev = maybe_deliver(prev, p, skb, __packet_hook);

> Aren't you at risk of duplicate ARP responses in some cases.
> You can't assume user will run netfilter.

This is only for the case where BR_PROXYARP has been enabled by the
user, but yes, it would be convenient to handle cases better without
requiring netfilter and skip flooding to BR_PROXYARP port more
selectively here. Would there be some convenient means for
br_do_proxy_arp() to mark the skb that it has replied to in br_input.c
and then use that here in br_forward.c to not flood an ARP request that
has already been replied to? Or should this simply skip flooding of all
ARP packets with something like following?

 		if (unicast && !(p->flags & BR_FLOOD))
 			continue;
 
-		/* Do not flood to ports that enable proxy ARP */
-		if (p->flags & BR_PROXYARP)
+		/* Do not flood ARP to ports that enable proxy ARP */
+		if (p->flags & BR_PROXYARP &&
+		    skb->protocol == htons(ETH_P_ARP))
 			continue;
 
 		prev = maybe_deliver(prev, p, skb, __packet_hook);

-- 
Jouni Malinen                                            PGP id EFC895FA

^ permalink raw reply

* RE: [PATCH 2/2] gianfar: handle map error in gfar_start_xmit()
From: David Laight @ 2014-12-10 11:03 UTC (permalink / raw)
  To: 'David Miller', asolokha@kb.kras.ru
  Cc: claudiu.manoil@freescale.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20141209.153903.1672471330337681259.davem@davemloft.net>

From: David Miller
> From: Arseny Solokha <asolokha@kb.kras.ru>
> Date: Fri,  5 Dec 2014 17:37:54 +0700
> 
> > @@ -2296,6 +2296,12 @@ static int gfar_start_xmit(struct sk_buff *skb, struct net_device *dev)
> >  						   0,
> >  						   frag_len,
> >  						   DMA_TO_DEVICE);
> > +			if (unlikely(dma_mapping_error(priv->dev, bufaddr))) {
> > +				/* As DMA mapping failed, pretend the TX path
> > +				 * is busy to retry later
> > +				 */
> > +				return NETDEV_TX_BUSY;
> > +			}
> 
> You are not "busy", you are dropping the packet due to insufficient system
> resources.
> 
> Therefore the appropriate thing to do is to free the SKB, increment
> the drop statistical counter, and return NETDEV_TX_OK.

Plausibly the error action could depend on the number of messages
in the transmit ring.

If the ring is empty you definitely want to drop the packet.

If mapping a ring full of packets takes more dma map space than
the system has available you may want to be "busy" - otherwise you
get systemic packet loss when transmitting large burst of data.

This could be a problem if all the available dma mapping resources
have been allocated to receive buffers.

Do any common systems actually have limited dma space (apart from
limited bounce buffers)?
If people are only testing on systems with unlimited dma space (eg x86)
then these paths will never be exercised unless an artificial limit
is applied.

	David

^ permalink raw reply

* Re: [PATCH] stmmac: platform: adjust messages and move to dev level
From: Andy Shevchenko @ 2014-12-10 10:47 UTC (permalink / raw)
  To: David Miller; +Cc: peppe.cavallaro, netdev
In-Reply-To: <20141209.182837.2218983730769727387.davem@davemloft.net>

On Tue, 2014-12-09 at 18:28 -0500, David Miller wrote:
> From: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
> Date: Tue,  9 Dec 2014 11:59:13 +0200
> 
> > This patch amendes error and warning messages across the platform driver. It
> > includes the following changes:
> >  - remove unneccessary message when no memory is available
> >  - change info level to warn in the validation functions
> >  - append \n to the end of messages
> >  - change pr_* macros to dev_*
> > 
> > Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
> 
> This does not apply to the net-next tree, please respin.

Yeah, maybe because the fix (*) you mentioned in previos mail is not in
net-next by some reason?

(*) I refer to commit 28603d13997e2ef47f18589cc9a44553aad49c86

-- 
Andy Shevchenko <andriy.shevchenko@intel.com>
Intel Finland Oy

^ permalink raw reply

* Re: [PATCH] brcmsmac: don't leak kernel memory via printk()
From: Arend van Spriel @ 2014-12-10 10:30 UTC (permalink / raw)
  To: Brian Norris, John W. Linville
  Cc: Brett Rudley, Franky (Zhenhui) Lin, Hante Meuleman,
	linux-wireless, brcm80211-dev-list, netdev
In-Reply-To: <1418204358-8357-1-git-send-email-computersforpeace@gmail.com>

On 10-12-14 10:39, Brian Norris wrote:
> Debug code prints the fifo name via custom dev_warn() wrappers. The
> fifo_names array is only non-zero when debugging is manually enabled,
> which is all well and good. However, it's *not* good that this array
> uses zero-length arrays in the non-debug case, and so it doesn't
> actually have any memory allocated to it. This means that as far as we
> know, fifo_names[i] actually points to garbage memory.
> 
> I've seen this in my log:
> 
> [ 4601.205511] brcmsmac bcma0:1: wl0: brcms_c_d11hdrs_mac80211: �GeL txop exceeded phylen 137/256 dur 1602/1504
> 
> So let's give this array space enough to fill it with a NULL byte.
> 

+ Acked-by: Arend van Spriel <arend@broadcom.com>
> Signed-off-by: Brian Norris <computersforpeace@gmail.com>
> Cc: Brett Rudley <brudley@broadcom.com>
- Cc: Arend van Spriel <arend@broadcom.com>
> Cc: "Franky (Zhenhui) Lin" <frankyl@broadcom.com>
> Cc: Hante Meuleman <meuleman@broadcom.com>
> Cc: "John W. Linville" <linville@tuxdriver.com>
> Cc: linux-wireless@vger.kernel.org
> Cc: brcm80211-dev-list@broadcom.com
> Cc: netdev@vger.kernel.org
> ---
>  drivers/net/wireless/brcm80211/brcmsmac/main.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/wireless/brcm80211/brcmsmac/main.c b/drivers/net/wireless/brcm80211/brcmsmac/main.c
> index 1b474828d5b8..aed0c948dce8 100644
> --- a/drivers/net/wireless/brcm80211/brcmsmac/main.c
> +++ b/drivers/net/wireless/brcm80211/brcmsmac/main.c
> @@ -316,7 +316,7 @@ static const u16 xmtfifo_sz[][NFIFO] = {
>  static const char * const fifo_names[] = {
>  	"AC_BK", "AC_BE", "AC_VI", "AC_VO", "BCMC", "ATIM" };
>  #else
> -static const char fifo_names[6][0];
> +static const char fifo_names[6][1];
>  #endif
>  
>  #ifdef DEBUG
> 

^ permalink raw reply

* [PATCH net-next 3/3] net: fec: only enable mdio interrupt before phy device link up
From: Fugang Duan @ 2014-12-10  9:54 UTC (permalink / raw)
  To: davem; +Cc: netdev, bhutchings, stephen, b38611
In-Reply-To: <1418205289-7730-1-git-send-email-b38611@freescale.com>

Before phy device link up, we only enable FEC mdio interrupt, which
is more reasonable.

Signed-off-by: Fugang Duan <B38611@freescale.com>
---
 drivers/net/ethernet/freescale/fec_main.c |    5 ++++-
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index c045f67..ea71bc8 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -1075,7 +1075,10 @@ fec_restart(struct net_device *ndev)
 		fec_ptp_start_cyclecounter(ndev);
 
 	/* Enable interrupts we wish to service */
-	writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
+	if (fep->link)
+		writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
+	else
+		writel(FEC_ENET_MII, fep->hwp + FEC_IMASK);
 
 	/* Init the interrupt coalescing */
 	fec_enet_itr_coal_init(ndev);
-- 
1.7.8

^ permalink raw reply related

* [PATCH net-next 2/3] net: fec: clear all interrupt events to support i.MX6SX
From: Fugang Duan @ 2014-12-10  9:54 UTC (permalink / raw)
  To: davem; +Cc: netdev, bhutchings, stephen, b38611
In-Reply-To: <1418205289-7730-1-git-send-email-b38611@freescale.com>

For i.MX6SX FEC controller, there have interrupt mask and event
field extension. To support all SOCs FEC, we clear all interrupt
events during MAVC initial process.

Signed-off-by: Fugang Duan <B38611@freescale.com>
---
 drivers/net/ethernet/freescale/fec_main.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index e8c7c82..c045f67 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -940,7 +940,7 @@ fec_restart(struct net_device *ndev)
 	}
 
 	/* Clear any outstanding interrupt. */
-	writel(0xffc00000, fep->hwp + FEC_IEVENT);
+	writel(0xffffffff, fep->hwp + FEC_IEVENT);
 
 	fec_enet_bd_init(ndev);
 
-- 
1.7.8

^ permalink raw reply related

* [PATCH net-next 1/3] net: fec: reset fep link status in suspend function
From: Fugang Duan @ 2014-12-10  9:54 UTC (permalink / raw)
  To: davem; +Cc: netdev, bhutchings, stephen, b38611
In-Reply-To: <1418205289-7730-1-git-send-email-b38611@freescale.com>

On some i.MX6 serial boards, phy power and refrence clock are supplied
or controlled by SOC. When do suspend/resume test, the power and clock
are disabled, so phy device link down.

For current driver, fep->link is still up status, which cause extra operation
like below code. To avoid the dumy operation, we set fep->link to down when
phy device is real down.
...
if (fep->link) {
	napi_disable(&fep->napi);
	netif_tx_lock_bh(ndev);
	fec_stop(ndev);
	netif_tx_unlock_bh(ndev);
	napi_enable(&fep->napi);
	fep->link = phy_dev->link;
	status_change = 1;
}
...

Signed-off-by: Fugang Duan <B38611@freescale.com>
---
 drivers/net/ethernet/freescale/fec_main.c |    7 +++++++
 1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
index fee2afe..e8c7c82 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3332,6 +3332,13 @@ static int __maybe_unused fec_suspend(struct device *dev)
 	if (fep->reg_phy)
 		regulator_disable(fep->reg_phy);
 
+	/*
+	 * SOC supply clock to phy, when clock is disabled, phy link down
+	 * SOC control phy regulator, when regulator is disabled, phy link down
+	 */
+	if (fep->clk_enet_out || fep->reg_phy)
+		fep->link = 0;
+
 	return 0;
 }
 
-- 
1.7.8

^ permalink raw reply related

* [PATCH net-next 0/3] net: fec: driver code clean and bug fix
From: Fugang Duan @ 2014-12-10  9:54 UTC (permalink / raw)
  To: davem; +Cc: netdev, bhutchings, stephen, b38611

The patch serial include code clean and bug fix:
Patch#1: avoid dummy operation during suspend/resume test.
Patch#2: bug fix for i.MX6SX SOC that clean all interrupt events during MAC initial process.
Patch#3: before phy device link status is up, only enable MDIO bus interrupt.

Fugang Duan (3):
  net: fec: reset fep link status in suspend function
  net: fec: clear all interrupt events to support i.MX6SX
  net: fec: only enable mdio interrupt before phy device link up

 drivers/net/ethernet/freescale/fec_main.c |   14 ++++++++++++--
 1 files changed, 12 insertions(+), 2 deletions(-)

-- 
1.7.8

^ permalink raw reply

* Re: linux-next: build failure after merge of the net-next tree
From: Hariprasad S @ 2014-12-10 10:14 UTC (permalink / raw)
  To: Stephen Rothwell; +Cc: David Miller, netdev, linux-next, linux-kernel
In-Reply-To: <20141210195405.5af40846@canb.auug.org.au>

On Wed, Dec 10, 2014 at 19:54:05 +1100, Stephen Rothwell wrote:
> Hi all,
> 
> After merging the net-next tree, today's linux-next build (powerpc
> allyesconfig) failed like this:
> 
> drivers/net/ethernet/chelsio/cxgb4vf/built-in.o:(.opd+0x630): multiple definition of `t4_bar2_sge_qregs'
> drivers/net/ethernet/chelsio/cxgb4/built-in.o:(.opd+0x14d0): first defined here
> drivers/net/ethernet/chelsio/cxgb4vf/built-in.o: In function `.t4_bar2_sge_qregs':
> (.text+0x9220): multiple definition of `.t4_bar2_sge_qregs'
> drivers/net/ethernet/chelsio/cxgb4/built-in.o:(.text+0x24e24): first defined here
> 
> Caused by commit e85c9a7abfa4 ("cxgb4/cxgb4vf: Add code to calculate T5
> BAR2 Offsets for SGE Queue Registers") which added both versions.  :-(
> 
> I have applied this fir patch for today (including the subject
> typo :-)):
> 
My bad. Thanks for the fix.

^ permalink raw reply

* Re: [PATCH net-next v2 3/4] bridge: offload bridge port attributes to switch asic if feature flag set
From: Jiri Pirko @ 2014-12-10  9:59 UTC (permalink / raw)
  To: roopa
  Cc: sfeldma, jhs, bcrl, tgraf, john.fastabend, stephen, linville,
	vyasevic, netdev, davem, shm, gospo
In-Reply-To: <1418202320-19491-4-git-send-email-roopa@cumulusnetworks.com>

Wed, Dec 10, 2014 at 10:05:19AM CET, roopa@cumulusnetworks.com wrote:
>From: Roopa Prabhu <roopa@cumulusnetworks.com>
>
>This patch adds support to set/del bridge port attributes in hardware from
>the bridge driver.
>
>When the user sends a bridge setlink message, it will come in with 'master',
>   - go to the bridge driver ndo_bridge_setlink handler,
>   - set settings in the kernel
>   - if offload mode is set on the port, also call the swicthdev api to
>     propagate the attrs to the switchdev hardware
>
>   If you want to act on the hw alone, you can still use the self flag to
>   go to the switch hw or switch port driver directly.
>
>This is done in the bridge driver to rollback kernel settings
>on hw programming failure if required in the future.
>
>With this, it also makes sure a notification goes out only after the
>attributes are set both in the kernel and hw.
>
>The patch calls switchdev api only if BRIDGE_FLAGS_SELF is not set.
>This is because the offload cases with BRIDGE_FLAGS_SELF are handled in
>the caller (in rtnetlink.c). This needed flags (IFLA_BRIDGE_FLAGS) to be
>passed to bridge setlink and dellink functions, to avoid
>another call to parse of IFLA_AF_SPEC in br_setlink/br_getlink respectively.
>
>Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
>---
> drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |    2 +-
> drivers/net/ethernet/rocker/rocker.c          |    2 +-
> include/linux/netdevice.h                     |    4 +--
> net/bridge/br_netlink.c                       |   38 +++++++++++++++++++++----
> net/bridge/br_private.h                       |    4 +--
> net/core/rtnetlink.c                          |    8 +++---
> 6 files changed, 43 insertions(+), 15 deletions(-)
>
>diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
>index 9afa167..0b8cf39 100644
>--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
>+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
>@@ -7721,7 +7721,7 @@ static int ixgbe_ndo_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
> }
> 
> static int ixgbe_ndo_bridge_setlink(struct net_device *dev,
>-				    struct nlmsghdr *nlh)
>+				    struct nlmsghdr *nlh, u16 flags)
> {
> 	struct ixgbe_adapter *adapter = netdev_priv(dev);
> 	struct nlattr *attr, *br_spec;
>diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
>index fded127..377979c 100644
>--- a/drivers/net/ethernet/rocker/rocker.c
>+++ b/drivers/net/ethernet/rocker/rocker.c
>@@ -3696,7 +3696,7 @@ skip:
> }
> 
> static int rocker_port_bridge_setlink(struct net_device *dev,
>-				      struct nlmsghdr *nlh)
>+				      struct nlmsghdr *nlh, u16 flags)
> {
> 	struct rocker_port *rocker_port = netdev_priv(dev);
> 	struct nlattr *protinfo;
>diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
>index 29c92ee..a9c2ce2 100644
>--- a/include/linux/netdevice.h
>+++ b/include/linux/netdevice.h
>@@ -1151,13 +1151,13 @@ struct net_device_ops {
> 						int idx);
> 
> 	int			(*ndo_bridge_setlink)(struct net_device *dev,
>-						      struct nlmsghdr *nlh);
>+						struct nlmsghdr *nlh, u16 flags);
> 	int			(*ndo_bridge_getlink)(struct sk_buff *skb,
> 						      u32 pid, u32 seq,
> 						      struct net_device *dev,
> 						      u32 filter_mask);
> 	int			(*ndo_bridge_dellink)(struct net_device *dev,
>-						      struct nlmsghdr *nlh);
>+						struct nlmsghdr *nlh, u16 flags);
> 	int			(*ndo_change_carrier)(struct net_device *dev,
> 						      bool new_carrier);
> 	int			(*ndo_get_phys_port_id)(struct net_device *dev,
>diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
>index 9f5eb55..b4299d1 100644
>--- a/net/bridge/br_netlink.c
>+++ b/net/bridge/br_netlink.c
>@@ -16,6 +16,7 @@
> #include <net/rtnetlink.h>
> #include <net/net_namespace.h>
> #include <net/sock.h>
>+#include <net/switchdev.h>
> #include <uapi/linux/if_bridge.h>
> 
> #include "br_private.h"
>@@ -359,13 +360,13 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
> }
> 
> /* Change state and parameters on port. */
>-int br_setlink(struct net_device *dev, struct nlmsghdr *nlh)
>+int br_setlink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
> {
> 	struct nlattr *protinfo;
> 	struct nlattr *afspec;
> 	struct net_bridge_port *p;
> 	struct nlattr *tb[IFLA_BRPORT_MAX + 1];
>-	int err = 0;
>+	int err = 0, ret_offload = 0;
		     ^^^^^^^^^^^ you can reuse "err" for this.

> 
> 	protinfo = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_PROTINFO);
> 	afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
>@@ -407,19 +408,32 @@ int br_setlink(struct net_device *dev, struct nlmsghdr *nlh)
> 				afspec, RTM_SETLINK);
> 	}
> 
>+	if (!(flags & BRIDGE_FLAGS_SELF)) {
>+		/*
		  ^^^^^^^^^^^^^^^^^ Comment should start here.
		  This is also something that should be
		  discovered by scripts/checkpatch.pl
>+		 * set bridge attributes in hardware if supported
>+		 */
>+		ret_offload = netdev_switch_port_bridge_setlink(dev, nlh, flags);
>+		if (ret_offload && ret_offload != -EOPNOTSUPP) {
>+			/*
			  ^^^^^^^^^ same here

>+			 * XXX Fix this in the future to rollback
>+			 * kernel settings and return error

		How hard would it be to do the rollback now?


>+			 */
>+			br_warn(p->br, "error hw set of bridge attributes on port %u(%s)\n",					(unsigned int) p->port_no, p->dev->name);

											    ^^^^
											    missing
											    "\n"
											    here?

>+		}
>+	}
>+
> 	if (err == 0)
> 		br_ifinfo_notify(RTM_NEWLINK, p);
>-
> out:
> 	return err;
> }
> 
> /* Delete port information */
>-int br_dellink(struct net_device *dev, struct nlmsghdr *nlh)
>+int br_dellink(struct net_device *dev, struct nlmsghdr *nlh, u16 flags)
> {
> 	struct nlattr *afspec;
> 	struct net_bridge_port *p;
>-	int err;
>+	int err = 0, ret_offload = 0;
> 
> 	afspec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
> 	if (!afspec)
>@@ -433,6 +447,20 @@ int br_dellink(struct net_device *dev, struct nlmsghdr *nlh)
> 	err = br_afspec((struct net_bridge *)netdev_priv(dev), p,
> 			afspec, RTM_DELLINK);
> 
>+	if (!(flags & BRIDGE_FLAGS_SELF)) {
>+		/*
		  ^^^ and here



>+		 * del bridge attributes in hardware
>+		 */
>+		ret_offload = netdev_switch_port_bridge_dellink(dev, nlh, flags);
>+		if (ret_offload && ret_offload != -EOPNOTSUPP) {
>+			/*
			  ^^^ and here


>+			 * XXX Fix this in the future to rollback
>+			 * kernel settings and return error
>+			 */
>+			br_warn(p->br, "error hw delete of bridge port attributes on port %u(%s)\n", (unsigned int) p->port_no, p->dev->name);
>+		}
>+	}
>+
> 	return err;
> }
> static int br_validate(struct nlattr *tb[], struct nlattr *data[])
>diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
>index aea3d13..0ebad7c 100644
>--- a/net/bridge/br_private.h
>+++ b/net/bridge/br_private.h
>@@ -815,8 +815,8 @@ extern struct rtnl_link_ops br_link_ops;
> int br_netlink_init(void);
> void br_netlink_fini(void);
> void br_ifinfo_notify(int event, struct net_bridge_port *port);
>-int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg);
>-int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg);
>+int br_setlink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
>+int br_dellink(struct net_device *dev, struct nlmsghdr *nlmsg, u16 flags);
> int br_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev,
> 	       u32 filter_mask);
> 
>diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
>index 61cb7e7..3c48d97 100644
>--- a/net/core/rtnetlink.c
>+++ b/net/core/rtnetlink.c
>@@ -2922,7 +2922,7 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
> 			goto out;
> 		}
> 
>-		err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
>+		err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags);
> 		if (err)
> 			goto out;
> 
>@@ -2933,7 +2933,7 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh)
> 		if (!dev->netdev_ops->ndo_bridge_setlink)
> 			err = -EOPNOTSUPP;
> 		else
>-			err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh);
>+			err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags);
> 
> 		if (!err)
> 			flags &= ~BRIDGE_FLAGS_SELF;
>@@ -2995,7 +2995,7 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
> 			goto out;
> 		}
> 
>-		err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh);
>+		err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags);
> 		if (err)
> 			goto out;
> 
>@@ -3006,7 +3006,7 @@ static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh)
> 		if (!dev->netdev_ops->ndo_bridge_dellink)
> 			err = -EOPNOTSUPP;
> 		else
>-			err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh);
>+			err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags);
> 
> 		if (!err)
> 			flags &= ~BRIDGE_FLAGS_SELF;
>-- 
>1.7.10.4
>

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox