Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next 02/10] net/mlx4_core: Change resource tracking ID to be 64 bit
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Hadar Hen Zion, Or Gerlitz
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.co.il>

Currently the IDs used by the resource tracker are of type u32. So far this was
ok since all the different resources we were tracking could be encoded in 32 bits.

As a preparation step for tracking resources whose IDs need more than 32 bits,
such as network flow steering rules, which are 64 bits in size, move to 64-bit
resource IDs.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |    2 +-
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |   28 ++++++++++----------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index 1a2f372..a425a98 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -1033,7 +1033,7 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port);
 /* resource tracker functions*/
 int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
 				    enum mlx4_resource resource_type,
-				    int resource_id, int *slave);
+				    u64 resource_id, int *slave);
 void mlx4_delete_all_resources_for_slave(struct mlx4_dev *dev, int slave_id);
 int mlx4_init_resource_tracker(struct mlx4_dev *dev);
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 6f89d44..b8e8969 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -58,7 +58,7 @@ struct mac_res {
 struct res_common {
 	struct list_head	list;
 	struct rb_node		node;
-	u32		        res_id;
+	u64		        res_id;
 	int			owner;
 	int			state;
 	int			from_state;
@@ -315,7 +315,7 @@ static int mpt_mask(struct mlx4_dev *dev)
 	return dev->caps.num_mpts - 1;
 }
 
-static struct res_common *find_res(struct mlx4_dev *dev, int res_id,
+static struct res_common *find_res(struct mlx4_dev *dev, u64 res_id,
 				   enum mlx4_resource type)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
@@ -324,7 +324,7 @@ static struct res_common *find_res(struct mlx4_dev *dev, int res_id,
 				  res_id);
 }
 
-static int get_res(struct mlx4_dev *dev, int slave, int res_id,
+static int get_res(struct mlx4_dev *dev, int slave, u64 res_id,
 		   enum mlx4_resource type,
 		   void *res)
 {
@@ -350,7 +350,7 @@ static int get_res(struct mlx4_dev *dev, int slave, int res_id,
 
 	r->from_state = r->state;
 	r->state = RES_ANY_BUSY;
-	mlx4_dbg(dev, "res %s id 0x%x to busy\n",
+	mlx4_dbg(dev, "res %s id 0x%llx to busy\n",
 		 ResourceType(type), r->res_id);
 
 	if (res)
@@ -363,7 +363,7 @@ exit:
 
 int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
 				    enum mlx4_resource type,
-				    int res_id, int *slave)
+				    u64 res_id, int *slave)
 {
 
 	struct res_common *r;
@@ -384,7 +384,7 @@ int mlx4_get_slave_from_resource_id(struct mlx4_dev *dev,
 	return err;
 }
 
-static void put_res(struct mlx4_dev *dev, int slave, int res_id,
+static void put_res(struct mlx4_dev *dev, int slave, u64 res_id,
 		    enum mlx4_resource type)
 {
 	struct res_common *r;
@@ -516,7 +516,7 @@ static struct res_common *alloc_xrcdn_tr(int id)
 	return &ret->com;
 }
 
-static struct res_common *alloc_tr(int id, enum mlx4_resource type, int slave,
+static struct res_common *alloc_tr(u64 id, enum mlx4_resource type, int slave,
 				   int extra)
 {
 	struct res_common *ret;
@@ -558,7 +558,7 @@ static struct res_common *alloc_tr(int id, enum mlx4_resource type, int slave,
 	return ret;
 }
 
-static int add_res_range(struct mlx4_dev *dev, int slave, int base, int count,
+static int add_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
 			 enum mlx4_resource type, int extra)
 {
 	int i;
@@ -727,10 +727,10 @@ static int remove_ok(struct res_common *res, enum mlx4_resource type, int extra)
 	}
 }
 
-static int rem_res_range(struct mlx4_dev *dev, int slave, int base, int count,
+static int rem_res_range(struct mlx4_dev *dev, int slave, u64 base, int count,
 			 enum mlx4_resource type, int extra)
 {
-	int i;
+	u64 i;
 	int err;
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
@@ -784,7 +784,7 @@ static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
 	else {
 		switch (state) {
 		case RES_QP_BUSY:
-			mlx4_dbg(dev, "%s: failed RES_QP, 0x%x\n",
+			mlx4_dbg(dev, "%s: failed RES_QP, 0x%llx\n",
 				 __func__, r->com.res_id);
 			err = -EBUSY;
 			break;
@@ -793,7 +793,7 @@ static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
 			if (r->com.state == RES_QP_MAPPED && !alloc)
 				break;
 
-			mlx4_dbg(dev, "failed RES_QP, 0x%x\n", r->com.res_id);
+			mlx4_dbg(dev, "failed RES_QP, 0x%llx\n", r->com.res_id);
 			err = -EINVAL;
 			break;
 
@@ -802,7 +802,7 @@ static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
 			    r->com.state == RES_QP_HW)
 				break;
 			else {
-				mlx4_dbg(dev, "failed RES_QP, 0x%x\n",
+				mlx4_dbg(dev, "failed RES_QP, 0x%llx\n",
 					  r->com.res_id);
 				err = -EINVAL;
 			}
@@ -2796,7 +2796,7 @@ static int _move_all_busy(struct mlx4_dev *dev, int slave,
 				if (r->state == RES_ANY_BUSY) {
 					if (print)
 						mlx4_dbg(dev,
-							 "%s id 0x%x is busy\n",
+							 "%s id 0x%llx is busy\n",
 							  ResourceType(type),
 							  r->res_id);
 					++busy;
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 01/10] net/mlx4_core: Change resource tracking mechanism to use red-black tree
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Hadar Hen Zion, Or Gerlitz
In-Reply-To: <1341135823-29039-1-git-send-email-ogerlitz@mellanox.com>

From: Hadar Hen Zion <hadarh@mellanox.co.il>

Change the data structure used for managing the SRIOV resource tracking
mechanism from a radix tree to a red-black tree. This is a preparation step
for supporting resource IDs which are 64 bits long, such as network flow
steering rules. Such IDs can't be used as radix-tree keys on 32-bit
architectures, hence the change.

Signed-off-by: Hadar Hen Zion <hadarh@mellanox.co.il>
Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |    3 +-
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |  112 ++++++++++++++------
 2 files changed, 81 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
index e5d2022..1a2f372 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h
@@ -39,6 +39,7 @@
 
 #include <linux/mutex.h>
 #include <linux/radix-tree.h>
+#include <linux/rbtree.h>
 #include <linux/timer.h>
 #include <linux/semaphore.h>
 #include <linux/workqueue.h>
@@ -509,7 +510,7 @@ struct slave_list {
 struct mlx4_resource_tracker {
 	spinlock_t lock;
 	/* tree for each resources */
-	struct radix_tree_root res_tree[MLX4_NUM_OF_RESOURCE_TYPE];
+	struct rb_root res_tree[MLX4_NUM_OF_RESOURCE_TYPE];
 	/* num_of_slave's lists, one per slave */
 	struct slave_list *slave_list;
 };
diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 766b8c5..6f89d44 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -57,6 +57,7 @@ struct mac_res {
 
 struct res_common {
 	struct list_head	list;
+	struct rb_node		node;
 	u32		        res_id;
 	int			owner;
 	int			state;
@@ -189,6 +190,49 @@ struct res_xrcdn {
 	int			port;
 };
 
+static struct res_common *res_tracker_lookup(struct rb_root *root, u64 res_id)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct res_common *res = container_of(node, struct res_common,
+						      node);
+
+		if (res_id < res->res_id)
+			node = node->rb_left;
+		else if (res_id > res->res_id)
+			node = node->rb_right;
+		else
+			return res;
+	}
+	return NULL;
+}
+
+static int res_tracker_insert(struct rb_root *root, struct res_common *res)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/* Figure out where to put new node */
+	while (*new) {
+		struct res_common *this = container_of(*new, struct res_common,
+						       node);
+
+		parent = *new;
+		if (res->res_id < this->res_id)
+			new = &((*new)->rb_left);
+		else if (res->res_id > this->res_id)
+			new = &((*new)->rb_right);
+		else
+			return -EEXIST;
+	}
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&res->node, parent, new);
+	rb_insert_color(&res->node, root);
+
+	return 0;
+}
+
 /* For Debug uses */
 static const char *ResourceType(enum mlx4_resource rt)
 {
@@ -228,8 +272,7 @@ int mlx4_init_resource_tracker(struct mlx4_dev *dev)
 	mlx4_dbg(dev, "Started init_resource_tracker: %ld slaves\n",
 		 dev->num_slaves);
 	for (i = 0 ; i < MLX4_NUM_OF_RESOURCE_TYPE; i++)
-		INIT_RADIX_TREE(&priv->mfunc.master.res_tracker.res_tree[i],
-				GFP_ATOMIC|__GFP_NOWARN);
+		priv->mfunc.master.res_tracker.res_tree[i] = RB_ROOT;
 
 	spin_lock_init(&priv->mfunc.master.res_tracker.lock);
 	return 0 ;
@@ -272,13 +315,13 @@ static int mpt_mask(struct mlx4_dev *dev)
 	return dev->caps.num_mpts - 1;
 }
 
-static void *find_res(struct mlx4_dev *dev, int res_id,
-		      enum mlx4_resource type)
+static struct res_common *find_res(struct mlx4_dev *dev, int res_id,
+				   enum mlx4_resource type)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 
-	return radix_tree_lookup(&priv->mfunc.master.res_tracker.res_tree[type],
-				 res_id);
+	return res_tracker_lookup(&priv->mfunc.master.res_tracker.res_tree[type],
+				  res_id);
 }
 
 static int get_res(struct mlx4_dev *dev, int slave, int res_id,
@@ -523,7 +566,7 @@ static int add_res_range(struct mlx4_dev *dev, int slave, int base, int count,
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct res_common **res_arr;
 	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
-	struct radix_tree_root *root = &tracker->res_tree[type];
+	struct rb_root *root = &tracker->res_tree[type];
 
 	res_arr = kzalloc(count * sizeof *res_arr, GFP_KERNEL);
 	if (!res_arr)
@@ -546,7 +589,7 @@ static int add_res_range(struct mlx4_dev *dev, int slave, int base, int count,
 			err = -EEXIST;
 			goto undo;
 		}
-		err = radix_tree_insert(root, base + i, res_arr[i]);
+		err = res_tracker_insert(root, res_arr[i]);
 		if (err)
 			goto undo;
 		list_add_tail(&res_arr[i]->list,
@@ -559,7 +602,7 @@ static int add_res_range(struct mlx4_dev *dev, int slave, int base, int count,
 
 undo:
 	for (--i; i >= base; --i)
-		radix_tree_delete(&tracker->res_tree[type], i);
+		rb_erase(&res_arr[i]->node, root);
 
 	spin_unlock_irq(mlx4_tlock(dev));
 
@@ -695,7 +738,7 @@ static int rem_res_range(struct mlx4_dev *dev, int slave, int base, int count,
 
 	spin_lock_irq(mlx4_tlock(dev));
 	for (i = base; i < base + count; ++i) {
-		r = radix_tree_lookup(&tracker->res_tree[type], i);
+		r = res_tracker_lookup(&tracker->res_tree[type], i);
 		if (!r) {
 			err = -ENOENT;
 			goto out;
@@ -710,8 +753,8 @@ static int rem_res_range(struct mlx4_dev *dev, int slave, int base, int count,
 	}
 
 	for (i = base; i < base + count; ++i) {
-		r = radix_tree_lookup(&tracker->res_tree[type], i);
-		radix_tree_delete(&tracker->res_tree[type], i);
+		r = res_tracker_lookup(&tracker->res_tree[type], i);
+		rb_erase(&r->node, &tracker->res_tree[type]);
 		list_del(&r->list);
 		kfree(r);
 	}
@@ -733,7 +776,7 @@ static int qp_res_start_move_to(struct mlx4_dev *dev, int slave, int qpn,
 	int err = 0;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[RES_QP], qpn);
+	r = (struct res_qp *)res_tracker_lookup(&tracker->res_tree[RES_QP], qpn);
 	if (!r)
 		err = -ENOENT;
 	else if (r->com.owner != slave)
@@ -797,7 +840,8 @@ static int mr_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
 	int err = 0;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[RES_MPT], index);
+	r = (struct res_mpt *)res_tracker_lookup(&tracker->res_tree[RES_MPT],
+					     index);
 	if (!r)
 		err = -ENOENT;
 	else if (r->com.owner != slave)
@@ -850,7 +894,7 @@ static int eq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
 	int err = 0;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[RES_EQ], index);
+	r = (struct res_eq *)res_tracker_lookup(&tracker->res_tree[RES_EQ], index);
 	if (!r)
 		err = -ENOENT;
 	else if (r->com.owner != slave)
@@ -898,7 +942,7 @@ static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn,
 	int err;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[RES_CQ], cqn);
+	r = (struct res_cq *)res_tracker_lookup(&tracker->res_tree[RES_CQ], cqn);
 	if (!r)
 		err = -ENOENT;
 	else if (r->com.owner != slave)
@@ -952,7 +996,8 @@ static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
 	int err = 0;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[RES_SRQ], index);
+	r = (struct res_srq *)res_tracker_lookup(&tracker->res_tree[RES_SRQ],
+					     index);
 	if (!r)
 		err = -ENOENT;
 	else if (r->com.owner != slave)
@@ -1001,7 +1046,7 @@ static void res_abort_move(struct mlx4_dev *dev, int slave,
 	struct res_common *r;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[type], id);
+	r = res_tracker_lookup(&tracker->res_tree[type], id);
 	if (r && (r->owner == slave))
 		r->state = r->from_state;
 	spin_unlock_irq(mlx4_tlock(dev));
@@ -1015,7 +1060,7 @@ static void res_end_move(struct mlx4_dev *dev, int slave,
 	struct res_common *r;
 
 	spin_lock_irq(mlx4_tlock(dev));
-	r = radix_tree_lookup(&tracker->res_tree[type], id);
+	r = res_tracker_lookup(&tracker->res_tree[type], id);
 	if (r && (r->owner == slave))
 		r->state = r->to_state;
 	spin_unlock_irq(mlx4_tlock(dev));
@@ -2817,8 +2862,8 @@ static void rem_slave_qps(struct mlx4_dev *dev, int slave)
 				switch (state) {
 				case RES_QP_RESERVED:
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_QP],
-							  qp->com.res_id);
+					rb_erase(&qp->com.node,
+						 &tracker->res_tree[RES_QP]);
 					list_del(&qp->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(qp);
@@ -2888,8 +2933,8 @@ static void rem_slave_srqs(struct mlx4_dev *dev, int slave)
 				case RES_SRQ_ALLOCATED:
 					__mlx4_srq_free_icm(dev, srqn);
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_SRQ],
-							  srqn);
+					rb_erase(&srq->com.node,
+						 &tracker->res_tree[RES_SRQ]);
 					list_del(&srq->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(srq);
@@ -2954,8 +2999,8 @@ static void rem_slave_cqs(struct mlx4_dev *dev, int slave)
 				case RES_CQ_ALLOCATED:
 					__mlx4_cq_free_icm(dev, cqn);
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_CQ],
-							  cqn);
+					rb_erase(&cq->com.node,
+						 &tracker->res_tree[RES_CQ]);
 					list_del(&cq->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(cq);
@@ -3017,8 +3062,8 @@ static void rem_slave_mrs(struct mlx4_dev *dev, int slave)
 				case RES_MPT_RESERVED:
 					__mlx4_mr_release(dev, mpt->key);
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_MPT],
-							  mptn);
+					rb_erase(&mpt->com.node,
+						 &tracker->res_tree[RES_MPT]);
 					list_del(&mpt->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(mpt);
@@ -3086,8 +3131,8 @@ static void rem_slave_mtts(struct mlx4_dev *dev, int slave)
 					__mlx4_free_mtt_range(dev, base,
 							      mtt->order);
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_MTT],
-							  base);
+					rb_erase(&mtt->com.node,
+						 &tracker->res_tree[RES_MTT]);
 					list_del(&mtt->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(mtt);
@@ -3133,8 +3178,8 @@ static void rem_slave_eqs(struct mlx4_dev *dev, int slave)
 				switch (state) {
 				case RES_EQ_RESERVED:
 					spin_lock_irq(mlx4_tlock(dev));
-					radix_tree_delete(&tracker->res_tree[RES_EQ],
-							  eqn);
+					rb_erase(&eq->com.node,
+						 &tracker->res_tree[RES_EQ]);
 					list_del(&eq->com.list);
 					spin_unlock_irq(mlx4_tlock(dev));
 					kfree(eq);
@@ -3191,7 +3236,8 @@ static void rem_slave_counters(struct mlx4_dev *dev, int slave)
 	list_for_each_entry_safe(counter, tmp, counter_list, com.list) {
 		if (counter->com.owner == slave) {
 			index = counter->com.res_id;
-			radix_tree_delete(&tracker->res_tree[RES_COUNTER], index);
+			rb_erase(&counter->com.node,
+				 &tracker->res_tree[RES_COUNTER]);
 			list_del(&counter->com.list);
 			kfree(counter);
 			__mlx4_counter_free(dev, index);
@@ -3220,7 +3266,7 @@ static void rem_slave_xrcdns(struct mlx4_dev *dev, int slave)
 	list_for_each_entry_safe(xrcd, tmp, xrcdn_list, com.list) {
 		if (xrcd->com.owner == slave) {
 			xrcdn = xrcd->com.res_id;
-			radix_tree_delete(&tracker->res_tree[RES_XRCD], xrcdn);
+			rb_erase(&xrcd->com.node, &tracker->res_tree[RES_XRCD]);
 			list_del(&xrcd->com.list);
 			kfree(xrcd);
 			__mlx4_xrcd_free(dev, xrcdn);
-- 
1.7.1

^ permalink raw reply related

* [PATCH net-next 00/10] net/mlx4: Add flow-steering support
From: Or Gerlitz @ 2012-07-01  9:43 UTC (permalink / raw)
  To: davem; +Cc: roland, yevgenyp, oren, netdev, Or Gerlitz, Hadar Hen Zion

Hi Dave, 

This patch series from Hadar adds code to manage L2/L3/L4 network 
flow steering rules, a feature which is supported by the ConnectX-3 device.

The series is built as follows:

The first two patches deal with SRIOV resource tracker, whose mechanism 
is changed to use red-black tree instead of radix tree. The reason for 
this change is that the coming steering patches use flow IDs which are 64 
bits in size, where radix tree keys can't be 64bit on 32bit architecture, 
while RB tree can do that.

Patch #3 is a small re-design of the Ethernet driver multicast attachments
flow to be more efficient and robust.

The fourth patch does a re-org of the checks that deal with the current 
"older" steering modes such that we can easily add soon the new steering 
mode and the code remains easy to manage.

Patch #5 adds the firmware commands for the new steering mode, which is
called "device managed flow steering".

Patch 6 is the main patch of this series. It adds support for device-managed flow 
steering all across the place. We had to have this patch also to touch the mlx4 
IB driver, since the steering mode is global to the HCA -- so when being enabled, 
multicast attachment calls done by the IB driver into the mlx4 core driver, 
are now routed to the flow steering firmware commands whose API is a bit different,
something that the IB driver had to be aware of. Following that, the 7th patch
adds resource tracking for device-managed flow steering rules.

The 8th patch adds promiscuous mode support under device-managed flow steering,
next, the 9th patch adds implementation for the ethtool APIs for attaching 
L2/L3/L4 based flow steering rules, and the last patch adds support for drop 
action through ethtool.

Or.

Hadar Hen Zion (9):
  net/mlx4_core: Change resource tracking mechanism to use red-black tree
  net/mlx4_core: Change resource tracking ID to be 64 bit
  net/mlx4: Set steering mode according to device capabilities
  net/mlx4_core: Add firmware commands to support device managed flow steering
  {NET,IB}/mlx4: Add device managed flow steering firmware API
  net/mlx4_core: Add resource tracking for device managed flow steering rules
  net/mlx4: Implement promiscuous mode with device managed flow-steering
  net/mlx4_en: Manage flow steering rules with ethtool
  net/mlx4_en: Add support for drop action through ethtool

Yevgeny Petrilin (1):
  net/mlx4_en: Re-design multicast attachments flow

 drivers/infiniband/hw/mlx4/main.c                  |   62 ++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h               |    1 +
 drivers/infiniband/hw/mlx4/qp.c                    |    1 +
 drivers/net/ethernet/mellanox/mlx4/cmd.c           |   19 +
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c    |  373 ++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c     |  313 +++++++++---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c         |   30 ++
 drivers/net/ethernet/mellanox/mlx4/fw.c            |   90 +++-
 drivers/net/ethernet/mellanox/mlx4/fw.h            |    3 +
 drivers/net/ethernet/mellanox/mlx4/main.c          |   60 ++-
 drivers/net/ethernet/mellanox/mlx4/mcg.c           |  524 ++++++++++++++++++--
 drivers/net/ethernet/mellanox/mlx4/mlx4.h          |   29 +-
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h       |   28 +-
 drivers/net/ethernet/mellanox/mlx4/port.c          |  111 +++--
 drivers/net/ethernet/mellanox/mlx4/profile.c       |   12 +-
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  |  284 +++++++++--
 include/linux/mlx4/cmd.h                           |    4 +
 include/linux/mlx4/device.h                        |  136 +++++-
 18 files changed, 1848 insertions(+), 232 deletions(-)

-- 
1.7.1

Cc: Hadar Hen Zion <hadarh@mellanox.co.il>

^ permalink raw reply

* [PATCH v5] bonding support for IPv6 transmit hashing
From: John Eaglesham @ 2012-07-01  8:07 UTC (permalink / raw)
  To: netdev; +Cc: John Eaglesham
In-Reply-To: <cover.1341125875.git.linux@8192.net>

Currently the "bonding" driver does not support load balancing outgoing
traffic in LACP mode for IPv6 traffic. IPv4 (and TCP or UDP over IPv4)
are currently supported; this patch adds transmit hashing for IPv6 (and
TCP or UDP over IPv6), bringing IPv6 up to par with IPv4 support in the
bonding driver.

The algorithm (xor'ing the bottom three quads and then xor'ing
the bottom three bytes of that) was chosen after testing almost 400,000
unique IPv6 addresses harvested from server logs. This algorithm had the
most even distribution for both big- and little-endian architectures while
still using few instructions.

The IPv6 flow label was intentionally not included in the hash as it appears
to be unset in the vast majority of IPv6 traffic sampled, and the current
algorithm not using the flow label already offers a very even distribution.

Fragmented IPv6 packets are handled the same way as fragmented IPv4 packets,
ie, they are not balanced based on layer 4 information. Additionally,
IPv6 packets with intermediate headers are not balanced based on layer
4 information. In practice these intermediate headers are not common and
this should not cause any problems, and the alternative (a packet-parsing
loop and look-up table) seemed slow and complicated for little gain.

This is an update to a prior patch I submitted. This version includes
a clarified description, thorough bounds checking, updates functions to
call bond_xmit_hash_policy_l2 rather than re-implement the same logic,
incorporates Jay's style suggestions, patches against net-next, and
squashes the documentation and code patch into one. Patch has been tested
and performs as expected.

John Eaglesham

---
 Documentation/networking/bonding.txt | 31 ++++++++++--
 drivers/net/bonding/bond_main.c      | 91 +++++++++++++++++++++++++-----------
 2 files changed, 90 insertions(+), 32 deletions(-)

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index bfea8a3..5db14fe 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -752,12 +752,22 @@ xmit_hash_policy
 		protocol information to generate the hash.
 
 		Uses XOR of hardware MAC addresses and IP addresses to
-		generate the hash.  The formula is
+		generate the hash.  The IPv4 formula is
 
 		(((source IP XOR dest IP) AND 0xffff) XOR
 			( source MAC XOR destination MAC ))
 				modulo slave count
 
+		The IPv6 formula is
+
+		iphash =
+			(source ip quad 2 XOR dest IP quad 2) XOR
+			(source ip quad 3 XOR dest IP quad 3) XOR
+			(source ip quad 4 XOR dest IP quad 4)
+
+		((iphash >> 16) XOR (iphash >> 8) XOR iphash)
+			modulo slave count
+
 		This algorithm will place all traffic to a particular
 		network peer on the same slave.  For non-IP traffic,
 		the formula is the same as for the layer2 transmit
@@ -778,19 +788,30 @@ xmit_hash_policy
 		slaves, although a single connection will not span
 		multiple slaves.
 
-		The formula for unfragmented TCP and UDP packets is
+		The formula for unfragmented IPv4 TCP and UDP packets is
 
 		((source port XOR dest port) XOR
 			 ((source IP XOR dest IP) AND 0xffff)
 				modulo slave count
 
-		For fragmented TCP or UDP packets and all other IP
-		protocol traffic, the source and destination port
+		The formula for unfragmented IPv6 TCP and UDP packets is
+
+		iphash =
+			(source ip quad 2 XOR dest IP quad 2) XOR
+			(source ip quad 3 XOR dest IP quad 3) XOR
+			(source ip quad 4 XOR dest IP quad 4)
+
+		((source port XOR dest port) XOR
+			(iphash >> 16) XOR (iphash >> 8) XOR iphash)
+				modulo slave count
+
+		For fragmented TCP or UDP packets and all other IPv4 and
+		IPv6 protocol traffic, the source and destination port
 		information is omitted.  For non-IP traffic, the
 		formula is the same as for the layer2 transmit hash
 		policy.
 
-		This policy is intended to mimic the behavior of
+		The IPv4 policy is intended to mimic the behavior of
 		certain switches, notably Cisco switches with PFC2 as
 		well as some Foundry and IBM products.
 
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f5a40b9..b138d84 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3345,56 +3345,93 @@ static struct notifier_block bond_netdev_notifier = {
 /*---------------------------- Hashing Policies -----------------------------*/
 
 /*
+ * Hash for the output device based upon layer 2 data
+ */
+static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count)
+{
+	struct ethhdr *data = (struct ethhdr *)skb->data;
+
+	if (skb_headlen(skb) >= offsetof(struct ethhdr, h_proto))
+		return (data->h_dest[5] ^ data->h_source[5]) % count;
+
+	return 0;
+}
+
+/*
  * Hash for the output device based upon layer 2 and layer 3 data. If
- * the packet is not IP mimic bond_xmit_hash_policy_l2()
+ * the packet is not IP, fall back on bond_xmit_hash_policy_l2()
  */
 static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
 {
 	struct ethhdr *data = (struct ethhdr *)skb->data;
-	struct iphdr *iph = ip_hdr(skb);
+	struct iphdr *iph;
+	struct ipv6hdr *ipv6h;
+	u32 v6hash;
 
-	if (skb->protocol == htons(ETH_P_IP)) {
+	if (skb->protocol == htons(ETH_P_IP) &&
+		skb_network_header_len(skb) >= sizeof(struct iphdr)) {
+		iph = ip_hdr(skb);
 		return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^
 			(data->h_dest[5] ^ data->h_source[5])) % count;
-	}
-
-	return (data->h_dest[5] ^ data->h_source[5]) % count;
+	} else if (skb->protocol == htons(ETH_P_IPV6) &&
+		skb_network_header_len(skb) >= sizeof(struct ipv6hdr)) {
+		ipv6h = ipv6_hdr(skb);
+		v6hash =
+			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
+			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
+			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3]);
+		v6hash = (v6hash >> 16) ^ (v6hash >> 8) ^ v6hash;
+		return (v6hash ^ data->h_dest[5] ^ data->h_source[5]) % count;
+	}
+
+	return bond_xmit_hash_policy_l2(skb, count);
 }
 
 /*
  * Hash for the output device based upon layer 3 and layer 4 data. If
  * the packet is a frag or not TCP or UDP, just use layer 3 data.  If it is
- * altogether not IP, mimic bond_xmit_hash_policy_l2()
+ * altogether not IP, fall back on bond_xmit_hash_policy_l2()
  */
 static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
 {
-	struct ethhdr *data = (struct ethhdr *)skb->data;
-	struct iphdr *iph = ip_hdr(skb);
-	__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
-	int layer4_xor = 0;
+	u32 layer4_xor = 0;
+	struct iphdr *iph;
+	struct ipv6hdr *ipv6h;
 
 	if (skb->protocol == htons(ETH_P_IP)) {
+		iph = ip_hdr(skb);
 		if (!ip_is_fragment(iph) &&
-		    (iph->protocol == IPPROTO_TCP ||
-		     iph->protocol == IPPROTO_UDP)) {
+			(iph->protocol == IPPROTO_TCP ||
+			iph->protocol == IPPROTO_UDP)) {
+			__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
+			if (iph->ihl * sizeof(u32) + sizeof(__be16) * 2 >
+				skb_headlen(skb) - skb_network_offset(skb))
+				goto short_header;
 			layer4_xor = ntohs((*layer4hdr ^ *(layer4hdr + 1)));
+		} else if (skb_network_header_len(skb) < sizeof(struct iphdr)) {
+			goto short_header;
 		}
-		return (layer4_xor ^
-			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
-
+		return (layer4_xor ^ ((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ipv6h = ipv6_hdr(skb);
+		if (ipv6h->nexthdr == IPPROTO_TCP || ipv6h->nexthdr == IPPROTO_UDP) {
+			__be16 *layer4hdrv6 = (__be16 *)((u8 *)ipv6h + sizeof(struct ipv6hdr));
+			if (sizeof(struct ipv6hdr) + sizeof(__be16) * 2 >
+				skb_headlen(skb) - skb_network_offset(skb))
+				goto short_header;
+			layer4_xor = (*layer4hdrv6 ^ *(layer4hdrv6 + 1));
+		} else if (skb_network_header_len(skb) < sizeof(struct ipv6hdr)) {
+			goto short_header;
+		}
+		layer4_xor ^=
+			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
+			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
+			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3]);
+		return ((layer4_xor >> 16) ^ (layer4_xor >> 8) ^ layer4_xor) % count;
 	}
 
-	return (data->h_dest[5] ^ data->h_source[5]) % count;
-}
-
-/*
- * Hash for the output device based upon layer 2 data
- */
-static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count)
-{
-	struct ethhdr *data = (struct ethhdr *)skb->data;
-
-	return (data->h_dest[5] ^ data->h_source[5]) % count;
+short_header:
+	return bond_xmit_hash_policy_l2(skb, count);
 }
 
 /*-------------------------- Device entry points ----------------------------*/
-- 
1.7.11

^ permalink raw reply related

* Re: [PATCH v4 2/2] Update bonding driver documentation to include IPv6 transmit hashing algorithm.
From: John Eaglesham @ 2012-07-01  7:42 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20120701.003417.890911456733055306.davem@davemloft.net>

On 7/1/2012 12:34 AM, David Miller wrote:
>
> I think you should combine this into the first patch, there is no
> reason to separate these two changes.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

Thanks, I will do that.

John

^ permalink raw reply

* Re: [PATCH v4 2/2] Update bonding driver documentation to include IPv6 transmit hashing algorithm.
From: David Miller @ 2012-07-01  7:34 UTC (permalink / raw)
  To: linux; +Cc: netdev
In-Reply-To: <f9bff286392a4778369fa35dac2fafaff3d3a1b8.1341125875.git.linux@8192.net>


I think you should combine this into the first patch, there is no
reason to separate these two changes.

^ permalink raw reply

* Re: [PATCH v4 1/2] Add support for IPv6 and bounds checking to transmit hashing functions.
From: David Miller @ 2012-07-01  7:33 UTC (permalink / raw)
  To: linux; +Cc: netdev
In-Reply-To: <9814878c9da75aaadfd70e0ea45b29c3ee8869a0.1341125875.git.linux@8192.net>

From: John Eaglesham <linux@8192.net>
Date: Sun,  1 Jul 2012 00:01:38 -0700

> -	if (skb->protocol == htons(ETH_P_IP)) {
> +	if (skb->protocol == htons(ETH_P_IP) &&
> +		skb_network_header_len(skb) >= sizeof(struct iphdr)) {

This is not indented properly, the goal isn't to use only TAB
characters to indent, the goal is to line things up right after
the opening parenthesis on the previous line, so this should
be:

	if (skb->protocol == htons(ETH_P_IP) &&
	    skb_network_header_len(skb) >= sizeof(struct iphdr)) {

> +	} else if (skb->protocol == htons(ETH_P_IPV6) &&
> +		skb_network_header_len(skb) >= sizeof(struct ipv6hdr)) {

Likewise, in this case you even under-indented it.

> +		v6hash =
> +			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
> +			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
> +			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3]);

This is ridiculous, just put &ipv6h->saddr into a local pointer named
's' and then you won't have to use such gymnastics to indent the code.

>  		if (!ip_is_fragment(iph) &&
> -		    (iph->protocol == IPPROTO_TCP ||
> -		     iph->protocol == IPPROTO_UDP)) {
> +			(iph->protocol == IPPROTO_TCP ||
> +			iph->protocol == IPPROTO_UDP)) {

This is what _REALLY_ bothers me.  You took an existing conditional
which _WAS_ indented properly and you erroneously re-indented it.

In fact your only change here is to break the indentation.

Just remove these changes entirely.

> +			if (iph->ihl * sizeof(u32) + sizeof(__be16) * 2 >
> +				skb_headlen(skb) - skb_network_offset(skb))

Similarly, this is indented improperly.

> +		} else if (skb_network_header_len(skb) < sizeof(struct iphdr)) {
> +			goto short_header;
>  		}

Single line basic blocks do not use opening and closing braces.

> -		return (layer4_xor ^
> -			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
> -
> +		return (layer4_xor ^ ((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;

Don't reindent code unless you can do it properly, now this line is way
over 80 columns.  The previous code was perfectly fine, leave it alone.

> +		if (ipv6h->nexthdr == IPPROTO_TCP || ipv6h->nexthdr == IPPROTO_UDP) {

Line is too long, write it as:

		if (ipv6h->nexthdr == IPPROTO_TCP ||
		    ipv6h->nexthdr == IPPROTO_UDP) {

> +			__be16 *layer4hdrv6 = (__be16 *)((u8 *)ipv6h + sizeof(struct ipv6hdr));

Likewise, line is too long.  Just do the variable declaration separate from the
assignment:
			__be16 *layer4hdrv6;

			layer4hdrv6 = BLAH BLAH BLAH;

> +			if (sizeof(struct ipv6hdr) + sizeof(__be16) * 2 >
> +				skb_headlen(skb) - skb_network_offset(skb))

Improperly indented, fix.

> +		} else if (skb_network_header_len(skb) < sizeof(struct ipv6hdr)) {
> +			goto short_header;
> +		}

Single line basic block, no braces.

> +		layer4_xor ^=
> +			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
> +			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
> +			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3]);

This is gross, do as I said above using a local pointer variable.

^ permalink raw reply

* Re: [net] e1000e: remove use of IP payload checksum
From: David Miller @ 2012-07-01  7:26 UTC (permalink / raw)
  To: jeffrey.t.kirsher; +Cc: bruce.w.allan, netdev, gospo, sassmann, stable
In-Reply-To: <1341122562-17382-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Sat, 30 Jun 2012 23:02:42 -0700

> From: Bruce Allan <bruce.w.allan@intel.com>
> 
> Currently only used when packet split mode is enabled with jumbo frames,
> IP payload checksum (for fragmented UDP packets) is mutually exclusive with
> receive hashing offload since the hardware uses the same space in the
> receive descriptor for the hardware-provided packet checksum and the RSS
> hash, respectively.  Users currently must disable jumbos when receive
> hashing offload is enabled, or vice versa, because of this incompatibility.
> Since testing has shown that IP payload checksum does not provide any real
> benefit, just remove it so that there is no longer a choice between jumbos
> or receive hashing offload but not both as done in other Intel GbE drivers
> (e.g. e1000, igb).
> 
> Also, add a missing check for IP checksum error reported by the hardware;
> let the stack verify the checksum when this happens.
> 
> CC: stable <stable@vger.kernel.org> [3.4]
> Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
> Tested-by: Aaron Brown <aaron.f.brown@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Applied, thanks Jeff.

^ permalink raw reply

* [PATCH v4 2/2] Update bonding driver documentation to include IPv6 transmit hashing algorithm.
From: John Eaglesham @ 2012-07-01  7:01 UTC (permalink / raw)
  To: netdev; +Cc: John Eaglesham
In-Reply-To: <cover.1341125875.git.linux@8192.net>

---
 Documentation/networking/bonding.txt | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index bfea8a3..5db14fe 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -752,12 +752,22 @@ xmit_hash_policy
 		protocol information to generate the hash.
 
 		Uses XOR of hardware MAC addresses and IP addresses to
-		generate the hash.  The formula is
+		generate the hash.  The IPv4 formula is
 
 		(((source IP XOR dest IP) AND 0xffff) XOR
 			( source MAC XOR destination MAC ))
 				modulo slave count
 
+		The IPv6 formula is
+
+		iphash =
+			(source ip quad 2 XOR dest IP quad 2) XOR
+			(source ip quad 3 XOR dest IP quad 3) XOR
+			(source ip quad 4 XOR dest IP quad 4)
+
+		((iphash >> 16) XOR (iphash >> 8) XOR iphash)
+			modulo slave count
+
 		This algorithm will place all traffic to a particular
 		network peer on the same slave.  For non-IP traffic,
 		the formula is the same as for the layer2 transmit
@@ -778,19 +788,30 @@ xmit_hash_policy
 		slaves, although a single connection will not span
 		multiple slaves.
 
-		The formula for unfragmented TCP and UDP packets is
+		The formula for unfragmented IPv4 TCP and UDP packets is
 
 		((source port XOR dest port) XOR
 			 ((source IP XOR dest IP) AND 0xffff)
 				modulo slave count
 
-		For fragmented TCP or UDP packets and all other IP
-		protocol traffic, the source and destination port
+		The formula for unfragmented IPv6 TCP and UDP packets is
+
+		iphash =
+			(source ip quad 2 XOR dest IP quad 2) XOR
+			(source ip quad 3 XOR dest IP quad 3) XOR
+			(source ip quad 4 XOR dest IP quad 4)
+
+		((source port XOR dest port) XOR
+			(iphash >> 16) XOR (iphash >> 8) XOR iphash)
+				modulo slave count
+
+		For fragmented TCP or UDP packets and all other IPv4 and
+		IPv6 protocol traffic, the source and destination port
 		information is omitted.  For non-IP traffic, the
 		formula is the same as for the layer2 transmit hash
 		policy.
 
-		This policy is intended to mimic the behavior of
+		The IPv4 policy is intended to mimic the behavior of
 		certain switches, notably Cisco switches with PFC2 as
 		well as some Foundry and IBM products.
 
-- 
1.7.11

^ permalink raw reply related

* [PATCH v4 1/2] Add support for IPv6 and bounds checking to transmit hashing functions.
From: John Eaglesham @ 2012-07-01  7:01 UTC (permalink / raw)
  To: netdev; +Cc: John Eaglesham
In-Reply-To: <cover.1341125875.git.linux@8192.net>

---
 drivers/net/bonding/bond_main.c | 91 +++++++++++++++++++++++++++++------------
 1 file changed, 64 insertions(+), 27 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f5a40b9..b138d84 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3345,56 +3345,93 @@ static struct notifier_block bond_netdev_notifier = {
 /*---------------------------- Hashing Policies -----------------------------*/
 
 /*
+ * Hash for the output device based upon layer 2 data
+ */
+static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count)
+{
+	struct ethhdr *data = (struct ethhdr *)skb->data;
+
+	if (skb_headlen(skb) >= offsetof(struct ethhdr, h_proto))
+		return (data->h_dest[5] ^ data->h_source[5]) % count;
+
+	return 0;
+}
+
+/*
  * Hash for the output device based upon layer 2 and layer 3 data. If
- * the packet is not IP mimic bond_xmit_hash_policy_l2()
+ * the packet is not IP, fall back on bond_xmit_hash_policy_l2()
  */
 static int bond_xmit_hash_policy_l23(struct sk_buff *skb, int count)
 {
 	struct ethhdr *data = (struct ethhdr *)skb->data;
-	struct iphdr *iph = ip_hdr(skb);
+	struct iphdr *iph;
+	struct ipv6hdr *ipv6h;
+	u32 v6hash;
 
-	if (skb->protocol == htons(ETH_P_IP)) {
+	if (skb->protocol == htons(ETH_P_IP) &&
+		skb_network_header_len(skb) >= sizeof(struct iphdr)) {
+		iph = ip_hdr(skb);
 		return ((ntohl(iph->saddr ^ iph->daddr) & 0xffff) ^
 			(data->h_dest[5] ^ data->h_source[5])) % count;
-	}
-
-	return (data->h_dest[5] ^ data->h_source[5]) % count;
+	} else if (skb->protocol == htons(ETH_P_IPV6) &&
+		skb_network_header_len(skb) >= sizeof(struct ipv6hdr)) {
+		ipv6h = ipv6_hdr(skb);
+		v6hash =
+			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
+			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
+			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3]);
+		v6hash = (v6hash >> 16) ^ (v6hash >> 8) ^ v6hash;
+		return (v6hash ^ data->h_dest[5] ^ data->h_source[5]) % count;
+	}
+
+	return bond_xmit_hash_policy_l2(skb, count);
 }
 
 /*
  * Hash for the output device based upon layer 3 and layer 4 data. If
  * the packet is a frag or not TCP or UDP, just use layer 3 data.  If it is
- * altogether not IP, mimic bond_xmit_hash_policy_l2()
+ * altogether not IP, fall back on bond_xmit_hash_policy_l2()
  */
 static int bond_xmit_hash_policy_l34(struct sk_buff *skb, int count)
 {
-	struct ethhdr *data = (struct ethhdr *)skb->data;
-	struct iphdr *iph = ip_hdr(skb);
-	__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
-	int layer4_xor = 0;
+	u32 layer4_xor = 0;
+	struct iphdr *iph;
+	struct ipv6hdr *ipv6h;
 
 	if (skb->protocol == htons(ETH_P_IP)) {
+		iph = ip_hdr(skb);
 		if (!ip_is_fragment(iph) &&
-		    (iph->protocol == IPPROTO_TCP ||
-		     iph->protocol == IPPROTO_UDP)) {
+			(iph->protocol == IPPROTO_TCP ||
+			iph->protocol == IPPROTO_UDP)) {
+			__be16 *layer4hdr = (__be16 *)((u32 *)iph + iph->ihl);
+			if (iph->ihl * sizeof(u32) + sizeof(__be16) * 2 >
+				skb_headlen(skb) - skb_network_offset(skb))
+				goto short_header;
 			layer4_xor = ntohs((*layer4hdr ^ *(layer4hdr + 1)));
+		} else if (skb_network_header_len(skb) < sizeof(struct iphdr)) {
+			goto short_header;
 		}
-		return (layer4_xor ^
-			((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
-
+		return (layer4_xor ^ ((ntohl(iph->saddr ^ iph->daddr)) & 0xffff)) % count;
+	} else if (skb->protocol == htons(ETH_P_IPV6)) {
+		ipv6h = ipv6_hdr(skb);
+		if (ipv6h->nexthdr == IPPROTO_TCP || ipv6h->nexthdr == IPPROTO_UDP) {
+			__be16 *layer4hdrv6 = (__be16 *)((u8 *)ipv6h + sizeof(struct ipv6hdr));
+			if (sizeof(struct ipv6hdr) + sizeof(__be16) * 2 >
+				skb_headlen(skb) - skb_network_offset(skb))
+				goto short_header;
+			layer4_xor = (*layer4hdrv6 ^ *(layer4hdrv6 + 1));
+		} else if (skb_network_header_len(skb) < sizeof(struct ipv6hdr)) {
+			goto short_header;
+		}
+		layer4_xor ^=
+			(ipv6h->saddr.s6_addr32[1] ^ ipv6h->daddr.s6_addr32[1]) ^
+			(ipv6h->saddr.s6_addr32[2] ^ ipv6h->daddr.s6_addr32[2]) ^
+			(ipv6h->saddr.s6_addr32[3] ^ ipv6h->daddr.s6_addr32[3]);
+		return ((layer4_xor >> 16) ^ (layer4_xor >> 8) ^ layer4_xor) % count;
 	}
 
-	return (data->h_dest[5] ^ data->h_source[5]) % count;
-}
-
-/*
- * Hash for the output device based upon layer 2 data
- */
-static int bond_xmit_hash_policy_l2(struct sk_buff *skb, int count)
-{
-	struct ethhdr *data = (struct ethhdr *)skb->data;
-
-	return (data->h_dest[5] ^ data->h_source[5]) % count;
+short_header:
+	return bond_xmit_hash_policy_l2(skb, count);
 }
 
 /*-------------------------- Device entry points ----------------------------*/
-- 
1.7.11

^ permalink raw reply related

* [PATCH v4 0/2] bonding support for IPv6 transmit hashing
From: John Eaglesham @ 2012-07-01  7:01 UTC (permalink / raw)
  To: netdev; +Cc: John Eaglesham

Currently the "bonding" driver does not support load balancing outgoing
traffic in LACP mode for IPv6 traffic. IPv4 (and TCP or UDP over IPv4)
are currently supported; this patch adds transmit hashing for IPv6 (and
TCP or UDP over IPv6), bringing IPv6 up to par with IPv4 support in the
bonding driver.

The algorithm chosen (xor'ing the bottom three quads and then xor'ing
the bottom three bytes of that) was chosen after testing almost 400,000
unique IPv6 addresses harvested from server logs. This algorithm had the
most even distribution for both big- and little-endian architectures while
still using few instructions.

The IPv6 flow label was intentionally not included in the hash as it appears
to be unset in the vast majority of IPv6 traffic sampled, and the current
algorithm not using the flow label already offers a very even distribution.

Fragmented IPv6 packets are handled the same way as fragmented IPv4 packets,
ie, they are not balanced based on layer 4 information. Additionally,
IPv6 packets with intermediate headers are not balanced based on layer
4 information. In practice these intermediate headers are not common and
this should not cause any problems, and the alternative (a packet-parsing
loop and look-up table) seemed slow and complicated for little gain.

This is an update to a prior patch I submitted. This version includes
a clarified description, thorough bounds checking, updates functions to
call bond_xmit_hash_policy_l2 rather than re-implement the same logic,
incorporates Jay's style suggestions, and patches against net-next. Patch
has been tested and performs as expected.

John Eaglesham (2):
  Add support for IPv6 and bounds checking to transmit hashing
    functions.
  Update bonding driver documentation to include IPv6 transmit hashing
    algorithm.

 Documentation/networking/bonding.txt | 31 ++++++++++--
 drivers/net/bonding/bond_main.c      | 91 +++++++++++++++++++++++++-----------
 2 files changed, 90 insertions(+), 32 deletions(-)

-- 
1.7.11

^ permalink raw reply

* [net] e1000e: remove use of IP payload checksum
From: Jeff Kirsher @ 2012-07-01  6:02 UTC (permalink / raw)
  To: davem; +Cc: Bruce Allan, netdev, gospo, sassmann, stable, Jeff Kirsher

From: Bruce Allan <bruce.w.allan@intel.com>

Currently only used when packet split mode is enabled with jumbo frames,
IP payload checksum (for fragmented UDP packets) is mutually exclusive with
receive hashing offload since the hardware uses the same space in the
receive descriptor for the hardware-provided packet checksum and the RSS
hash, respectively.  Users currently must disable jumbos when receive
hashing offload is enabled, or vice versa, because of this incompatibility.
Since testing has shown that IP payload checksum does not provide any real
benefit, just remove it so that there is no longer a choice between jumbos
or receive hashing offload but not both as done in other Intel GbE drivers
(e.g. e1000, igb).

Also, add a missing check for IP checksum error reported by the hardware;
let the stack verify the checksum when this happens.

CC: stable <stable@vger.kernel.org> [3.4]
Signed-off-by: Bruce Allan <bruce.w.allan@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/e1000e/defines.h |    1 +
 drivers/net/ethernet/intel/e1000e/netdev.c  |   75 +++++----------------------
 2 files changed, 15 insertions(+), 61 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/defines.h b/drivers/net/ethernet/intel/e1000e/defines.h
index 351a409..76edbc1 100644
--- a/drivers/net/ethernet/intel/e1000e/defines.h
+++ b/drivers/net/ethernet/intel/e1000e/defines.h
@@ -103,6 +103,7 @@
 #define E1000_RXD_ERR_SEQ       0x04    /* Sequence Error */
 #define E1000_RXD_ERR_CXE       0x10    /* Carrier Extension Error */
 #define E1000_RXD_ERR_TCPE      0x20    /* TCP/UDP Checksum Error */
+#define E1000_RXD_ERR_IPE       0x40    /* IP Checksum Error */
 #define E1000_RXD_ERR_RXE       0x80    /* Rx Data Error */
 #define E1000_RXD_SPC_VLAN_MASK 0x0FFF  /* VLAN ID is in lower 12 bits */
 
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 31d37a2..623e30b 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -496,7 +496,7 @@ static void e1000_receive_skb(struct e1000_adapter *adapter,
  * @sk_buff: socket buffer with received data
  **/
 static void e1000_rx_checksum(struct e1000_adapter *adapter, u32 status_err,
-			      __le16 csum, struct sk_buff *skb)
+			      struct sk_buff *skb)
 {
 	u16 status = (u16)status_err;
 	u8 errors = (u8)(status_err >> 24);
@@ -511,8 +511,8 @@ static void e1000_rx_checksum(struct e1000_adapter *adapter, u32 status_err,
 	if (status & E1000_RXD_STAT_IXSM)
 		return;
 
-	/* TCP/UDP checksum error bit is set */
-	if (errors & E1000_RXD_ERR_TCPE) {
+	/* TCP/UDP checksum error bit or IP checksum error bit is set */
+	if (errors & (E1000_RXD_ERR_TCPE | E1000_RXD_ERR_IPE)) {
 		/* let the stack verify checksum errors */
 		adapter->hw_csum_err++;
 		return;
@@ -523,19 +523,7 @@ static void e1000_rx_checksum(struct e1000_adapter *adapter, u32 status_err,
 		return;
 
 	/* It must be a TCP or UDP packet with a valid checksum */
-	if (status & E1000_RXD_STAT_TCPCS) {
-		/* TCP checksum is good */
-		skb->ip_summed = CHECKSUM_UNNECESSARY;
-	} else {
-		/*
-		 * IP fragment with UDP payload
-		 * Hardware complements the payload checksum, so we undo it
-		 * and then put the value in host order for further stack use.
-		 */
-		__sum16 sum = (__force __sum16)swab16((__force u16)csum);
-		skb->csum = csum_unfold(~sum);
-		skb->ip_summed = CHECKSUM_COMPLETE;
-	}
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
 	adapter->hw_csum_good++;
 }
 
@@ -954,8 +942,7 @@ static bool e1000_clean_rx_irq(struct e1000_ring *rx_ring, int *work_done,
 		skb_put(skb, length);
 
 		/* Receive Checksum Offload */
-		e1000_rx_checksum(adapter, staterr,
-				  rx_desc->wb.lower.hi_dword.csum_ip.csum, skb);
+		e1000_rx_checksum(adapter, staterr, skb);
 
 		e1000_rx_hash(netdev, rx_desc->wb.lower.hi_dword.rss, skb);
 
@@ -1341,8 +1328,7 @@ copydone:
 		total_rx_bytes += skb->len;
 		total_rx_packets++;
 
-		e1000_rx_checksum(adapter, staterr,
-				  rx_desc->wb.lower.hi_dword.csum_ip.csum, skb);
+		e1000_rx_checksum(adapter, staterr, skb);
 
 		e1000_rx_hash(netdev, rx_desc->wb.lower.hi_dword.rss, skb);
 
@@ -1512,9 +1498,8 @@ static bool e1000_clean_jumbo_rx_irq(struct e1000_ring *rx_ring, int *work_done,
 			}
 		}
 
-		/* Receive Checksum Offload XXX recompute due to CRC strip? */
-		e1000_rx_checksum(adapter, staterr,
-				  rx_desc->wb.lower.hi_dword.csum_ip.csum, skb);
+		/* Receive Checksum Offload */
+		e1000_rx_checksum(adapter, staterr, skb);
 
 		e1000_rx_hash(netdev, rx_desc->wb.lower.hi_dword.rss, skb);
 
@@ -3098,19 +3083,10 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
 
 	/* Enable Receive Checksum Offload for TCP and UDP */
 	rxcsum = er32(RXCSUM);
-	if (adapter->netdev->features & NETIF_F_RXCSUM) {
+	if (adapter->netdev->features & NETIF_F_RXCSUM)
 		rxcsum |= E1000_RXCSUM_TUOFL;
-
-		/*
-		 * IPv4 payload checksum for UDP fragments must be
-		 * used in conjunction with packet-split.
-		 */
-		if (adapter->rx_ps_pages)
-			rxcsum |= E1000_RXCSUM_IPPCSE;
-	} else {
+	else
 		rxcsum &= ~E1000_RXCSUM_TUOFL;
-		/* no need to clear IPPCSE as it defaults to 0 */
-	}
 	ew32(RXCSUM, rxcsum);
 
 	if (adapter->hw.mac.type == e1000_pch2lan) {
@@ -5241,22 +5217,10 @@ static int e1000_change_mtu(struct net_device *netdev, int new_mtu)
 	int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN;
 
 	/* Jumbo frame support */
-	if (max_frame > ETH_FRAME_LEN + ETH_FCS_LEN) {
-		if (!(adapter->flags & FLAG_HAS_JUMBO_FRAMES)) {
-			e_err("Jumbo Frames not supported.\n");
-			return -EINVAL;
-		}
-
-		/*
-		 * IP payload checksum (enabled with jumbos/packet-split when
-		 * Rx checksum is enabled) and generation of RSS hash is
-		 * mutually exclusive in the hardware.
-		 */
-		if ((netdev->features & NETIF_F_RXCSUM) &&
-		    (netdev->features & NETIF_F_RXHASH)) {
-			e_err("Jumbo frames cannot be enabled when both receive checksum offload and receive hashing are enabled.  Disable one of the receive offload features before enabling jumbos.\n");
-			return -EINVAL;
-		}
+	if ((max_frame > ETH_FRAME_LEN + ETH_FCS_LEN) &&
+	    !(adapter->flags & FLAG_HAS_JUMBO_FRAMES)) {
+		e_err("Jumbo Frames not supported.\n");
+		return -EINVAL;
 	}
 
 	/* Supported frame sizes */
@@ -6030,17 +5994,6 @@ static int e1000_set_features(struct net_device *netdev,
 			 NETIF_F_RXALL)))
 		return 0;
 
-	/*
-	 * IP payload checksum (enabled with jumbos/packet-split when Rx
-	 * checksum is enabled) and generation of RSS hash is mutually
-	 * exclusive in the hardware.
-	 */
-	if (adapter->rx_ps_pages &&
-	    (features & NETIF_F_RXCSUM) && (features & NETIF_F_RXHASH)) {
-		e_err("Enabling both receive checksum offload and receive hashing is not possible with jumbo frames.  Disable jumbos or enable only one of the receive offload features.\n");
-		return -EINVAL;
-	}
-
 	if (changed & NETIF_F_RXFCS) {
 		if (features & NETIF_F_RXFCS) {
 			adapter->flags2 &= ~FLAG2_CRC_STRIPPING;
-- 
1.7.10.4

^ permalink raw reply related

* Re: [PATCH v6] sctp: be more restrictive in transport selection on bundled sacks
From: David Miller @ 2012-07-01  5:44 UTC (permalink / raw)
  To: vyasevich; +Cc: nhorman, netdev, linux-sctp
In-Reply-To: <a1fb36e6-783a-4a89-9771-a7010c2da4fb@email.android.com>

From: Vlad Yasevich <vyasevich@gmail.com>
Date: Sat, 30 Jun 2012 23:17:52 -0400

> David Miller <davem@davemloft.net> wrote:
> 
>>Once this has Vlad's ACK I'll apply it.
> 
> Acked-by: Vlad Yasevich <vyasevich@gmail.com>

Applied, thanks everyone.

^ permalink raw reply

* Re: [PATCH] ipv4: Elide fib_validate_source() completely when possible.
From: David Miller @ 2012-07-01  5:39 UTC (permalink / raw)
  To: ja; +Cc: netdev
In-Reply-To: <alpine.LFD.2.00.1206301300530.1593@ja.ssi.bg>

From: Julian Anastasov <ja@ssi.bg>
Date: Sat, 30 Jun 2012 13:45:52 +0300 (EEST)

> 	If we really want a change in behavior we should
> at least update the accept_local info in
> Documentation/networking/ip-sysctl.txt ?

Thanks for pointing this out, that's what I will do.

====================
ipv4: Clarify in docs that accept_local requires rp_filter.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt |   11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 99d0e05..47b6c79 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -857,9 +857,14 @@ accept_source_route - BOOLEAN
 		FALSE (host)
 
 accept_local - BOOLEAN
-	Accept packets with local source addresses. In combination with
-	suitable routing, this can be used to direct packets between two
-	local interfaces over the wire and have them accepted properly.
+	Accept packets with local source addresses. In combination
+	with suitable routing, this can be used to direct packets
+	between two local interfaces over the wire and have them
+	accepted properly.
+
+	rp_filter must be set to a non-zero value in order for
+	accept_local to have an effect.
+
 	default FALSE
 
 route_localnet - BOOLEAN
-- 
1.7.10.4

^ permalink raw reply related

* Re: [net-next] e1000e: remove use of IP payload checksum
From: Jeff Kirsher @ 2012-07-01  5:32 UTC (permalink / raw)
  To: David Miller; +Cc: ben, bruce.w.allan, netdev, gospo, sassmann
In-Reply-To: <20120630.173752.1993136000245136259.davem@davemloft.net>

[-- Attachment #1: Type: text/plain, Size: 1889 bytes --]

On Sat, 2012-06-30 at 17:37 -0700, David Miller wrote:
> From: Ben Hutchings <ben@decadent.org.uk>
> Date: Sat, 30 Jun 2012 22:36:36 +0100
> 
> > On Sat, 2012-06-30 at 03:35 -0700, Jeff Kirsher wrote:
> >> From: Bruce Allan <bruce.w.allan@intel.com>
> >> 
> >> Currently only used when packet split mode is enabled with jumbo frames,
> >> IP payload checksum (for fragmented UDP packets) is mutually exclusive with
> >> receive hashing offload since the hardware uses the same space in the
> >> receive descriptor for the hardware-provided packet checksum and the RSS
> >> hash, respectively.  Users currently must disable jumbos when receive
> >> hashing offload is enabled, or vice versa, because of this incompatibility.
> >> Since testing has shown that IP payload checksum does not provide any real
> >> benefit, just remove it so that there is no longer a choice between jumbos
> >> or receive hashing offload but not both as done in other Intel GbE drivers
> >> (e.g. e1000, igb).
> >> 
> >> Also, add a missing check for IP checksum error reported by the hardware;
> >> let the stack verify the checksum when this happens.
> > [...]
> > 
> > The change to enable RX hashing in 3.4, with this odd restriction seems
> > to have broken most existing systems using jumbo MTU on e1000e.  None of
> > the distro scripts or network management daemons will automatically
> > change offload configuration before MTU; how could they know?
> > 
> > Therefore this needs to be fixed in 3.5 and 3.4.y, not net-next.
> 
> Agreed.

Ok, I will prepare it for net and stable 3.4.  I know it will require a
backported patch for stable 3.4.y since the current patch only applied
to net & net-next.

Bruce was wanting to have it applied to net & stable, and I was not sure
based on the patch content and description, so that is why I submitted
it for net-next.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 836 bytes --]

^ permalink raw reply

* Re: [PATCH V3 2/2] bonding support for IPv6 transmit hashing
From: Hannes Frederic Sowa @ 2012-07-01  3:57 UTC (permalink / raw)
  To: John; +Cc: netdev
In-Reply-To: <4FEF55A7.6070502@8192.net>

On Sat, Jun 30, 2012 at 9:38 PM, John <linux@8192.net> wrote:
> On 6/30/2012 4:59 AM, Hannes Frederic Sowa wrote:
>> On Sat, Jun 30, 2012 at 8:17 AM, John <linux@8192.net> wrote:
>>>
>>> diff --git a/Documentation/networking/bonding.txt
>>> b/Documentation/networking/bonding.txt
>>> index bfea8a3..5db14fe 100644
>>> --- a/Documentation/networking/bonding.txt
>>> +++ b/Documentation/networking/bonding.txt
>>> @@ -752,12 +752,22 @@ xmit_hash_policy
>>>                  protocol information to generate the hash.
>>>
>>>                  Uses XOR of hardware MAC addresses and IP addresses to
>>> -               generate the hash.  The formula is
>>> +               generate the hash.  The IPv4 formula is
>>>
>>>                  (((source IP XOR dest IP) AND 0xffff) XOR
>>>                          ( source MAC XOR destination MAC ))
>>>                                  modulo slave count
>>>
>>> +               The IPv6 forumla is
>>> +
>>> +               iphash =
>>> +                       (source ip quad 2 XOR dest IP quad 2) XOR
>>> +                       (source ip quad 3 XOR dest IP quad 3) XOR
>>> +                       (source ip quad 4 XOR dest IP quad 4)
>>> +
>>> +               ((iphash >> 16) XOR (iphash >> 8) XOR iphash)
>>> +                       modulo slave count
>>> +
>>
>>
>> Wouldn't it be beneficial to include the ipv6 flow label in the hash
>> calculation?
>
> Hannes,
>
> In all of the traffic I inspected I don't believe I saw a single flow label
> set. Even if it were set 100% of the time by Linux, any packets routed or
> bridged from another operating system wouldn't see any benefit. The current
> algorithm distributes the traffic very well, I don't believe adding the flow
> label would be beneficial even if it were set more frequently.
>
> If you feel strongly about its inclusion, though, I am willing to
> reconsider.

It would definitely help to load balance tunnelled traffic over a
bonded interface. But as I currently don't use such a setup, I don't
have a strong opinion on that.

Greetings,

  Hannes

^ permalink raw reply

* Re: [PATCH v6] sctp: be more restrictive in transport selection on bundled sacks
From: Vlad Yasevich @ 2012-07-01  3:17 UTC (permalink / raw)
  To: David Miller, nhorman; +Cc: netdev, linux-sctp
In-Reply-To: <20120630.173945.173993639982489712.davem@davemloft.net>

David Miller <davem@davemloft.net> wrote:

>From: Neil Horman <nhorman@tuxdriver.com>
>Date: Sat, 30 Jun 2012 09:04:26 -0400
>
>> It was noticed recently that when we send data on a transport, its
>possible that
>> we might bundle a sack that arrived on a different transport.  While
>this isn't
>> a major problem, it does go against the SHOULD requirement in section
>6.4 of RFC
>> 2960:
>> 
>>  An endpoint SHOULD transmit reply chunks (e.g., SACK, HEARTBEAT ACK,
>>    etc.) to the same destination transport address from which it
>>    received the DATA or control chunk to which it is replying.  This
>>    rule should also be followed if the endpoint is bundling DATA
>chunks
>>    together with the reply chunk.
>> 
>> This patch seeks to correct that.  It restricts the bundling of sack
>operations
>> to only those transports which have moved the ctsn of the association
>forward
>> since the last sack.  By doing this we guarantee that we only bundle
>outbound
>> sacks on a transport that has received a chunk since the last sack. 
>This brings
>> us into stricter compliance with the RFC.
>> 
>> Vlad had initially suggested that we strictly allow only sack
>bundling on the
>> transport that last moved the ctsn forward.  While this makes sense,
>I was
>> concerned that doing so prevented us from bundling in the case where
>we had
>> received chunks that moved the ctsn on multiple transports.  In those
>cases, the
>> RFC allows us to select any of the transports having received chunks
>to bundle
>> the sack on.  so I've modified the approach to allow for that, by
>adding a state
>> variable to each transport that tracks whether it has moved the ctsn
>since the
>> last sack.  This I think keeps our behavior (and performance), close
>enough to
>> our current profile that I think we can do this without a sysctl knob
>to
>> enable/disable it.
>> 
>> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
>> CC: Vlad Yaseivch <vyasevich@gmail.com>
>> CC: David S. Miller <davem@davemloft.net>
>> CC: linux-sctp@vger.kernel.org
>> Reported-by: Michele Baldessari <michele@redhat.com>
>> Reported-by: sorin serban <sserban@redhat.com>
>
>Once this has Vlad's ACK I'll apply it.
>

Acked-by: Vlad Yasevich <vyasevich@gmail.com>

Sorry for the delay.

-vlad

>There has to be a better way to handle this situation, wherein the
>responsible party has ACK'd the patch but I just ask for a few coding
>style fixups and whatnot.  As it stands now I have to twiddle my
>thumbs waiting for the new ACK.


-- 
Sent from my Android phone with SkitMail. Please excuse my brevity.

^ permalink raw reply

* Re: [PATCH net-next 06/15] netfilter: Add NFPROTO_BUS hook constant for AF_BUS socket family
From: Jan Engelhardt @ 2012-07-01  2:15 UTC (permalink / raw)
  To: Vincent Sanders
  Cc: netdev, linux-kernel, David S. Miller, Javier Martinez Canillas
In-Reply-To: <1340988354-26981-7-git-send-email-vincent.sanders@collabora.co.uk>

On Friday 2012-06-29 18:45, Vincent Sanders wrote:

>AF_BUS sockets add a netfilter NF_HOOK() on the packet sending path.
>This allows packet to be mangled by registered netfilter hooks.

If you do touch netfilter, consider adding that mailing list as well.

>diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
>index c613cf0..0698924 100644
>--- a/include/linux/netfilter.h
>+++ b/include/linux/netfilter.h
>@@ -67,6 +67,7 @@ enum {
> 	NFPROTO_BRIDGE =  7,
> 	NFPROTO_IPV6   = 10,
> 	NFPROTO_DECNET = 12,
>+	NFPROTO_BUS,
> 	NFPROTO_NUMPROTO,
> };

Make use of the holes that were left.

^ permalink raw reply

* Re: [net] igbvf: fix divide by zero
From: David Miller @ 2012-07-01  0:41 UTC (permalink / raw)
  To: jeffrey.t.kirsher
  Cc: mitch.a.williams, netdev, gospo, sassmann, stable, daahern
In-Reply-To: <1341051799-8824-1-git-send-email-jeffrey.t.kirsher@intel.com>

From: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Date: Sat, 30 Jun 2012 03:23:19 -0700

> From: Mitch A Williams <mitch.a.williams@intel.com>
> 
> Using ethtool -C ethX rx-usecs 0 crashes with a divide by zero.
> Refactor this function to fix this issue and make it more clear
> what the intent of each conditional is. Add comment regarding
> using a setting of zero.
> 
> CC: stable <stable@vger.kernel.org> [3.3+]
> CC: David Ahern <daahern@cisco.com>
> Signed-off-by: Mitch Williams <mitch.a.williams@intel.com>
> Tested-by: Aaron Brown <aaron.f.brown@intel.com>
> Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>

Applied, thanks.

^ permalink raw reply

* Re: [PATCH v6] sctp: be more restrictive in transport selection on bundled sacks
From: David Miller @ 2012-07-01  0:39 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, vyasevich, linux-sctp
In-Reply-To: <1341061466-4186-1-git-send-email-nhorman@tuxdriver.com>

From: Neil Horman <nhorman@tuxdriver.com>
Date: Sat, 30 Jun 2012 09:04:26 -0400

> It was noticed recently that when we send data on a transport, it's possible that
> we might bundle a sack that arrived on a different transport.  While this isn't
> a major problem, it does go against the SHOULD requirement in section 6.4 of RFC
> 2960:
> 
>  An endpoint SHOULD transmit reply chunks (e.g., SACK, HEARTBEAT ACK,
>    etc.) to the same destination transport address from which it
>    received the DATA or control chunk to which it is replying.  This
>    rule should also be followed if the endpoint is bundling DATA chunks
>    together with the reply chunk.
> 
> This patch seeks to correct that.  It restricts the bundling of sack operations
> to only those transports which have moved the ctsn of the association forward
> since the last sack.  By doing this we guarantee that we only bundle outbound
> sacks on a transport that has received a chunk since the last sack.  This brings
> us into stricter compliance with the RFC.
> 
> Vlad had initially suggested that we strictly allow only sack bundling on the
> transport that last moved the ctsn forward.  While this makes sense, I was
> concerned that doing so prevented us from bundling in the case where we had
> received chunks that moved the ctsn on multiple transports.  In those cases, the
> RFC allows us to select any of the transports having received chunks to bundle
> the sack on.  So I've modified the approach to allow for that, by adding a state
> variable to each transport that tracks whether it has moved the ctsn since the
> last sack.  This I think keeps our behavior (and performance), close enough to
> our current profile that I think we can do this without a sysctl knob to
> enable/disable it.
> 
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: Vlad Yaseivch <vyasevich@gmail.com>
> CC: David S. Miller <davem@davemloft.net>
> CC: linux-sctp@vger.kernel.org
> Reported-by: Michele Baldessari <michele@redhat.com>
> Reported-by: sorin serban <sserban@redhat.com>

Once this has Vlad's ACK I'll apply it.

There has to be a better way to handle this situation, wherein the
responsible party has ACK'd the patch but I just ask for a few coding
style fixups and whatnot.  As it stands now I have to twiddle my
thumbs waiting for the new ACK.

^ permalink raw reply

* Re: [PATCH v5] sctp: be more restrictive in transport selection on bundled sacks
From: David Miller @ 2012-07-01  0:38 UTC (permalink / raw)
  To: nhorman; +Cc: netdev, vyasevich, linux-sctp
In-Reply-To: <20120630122647.GA22647@neilslaptop.think-freely.org>

From: Neil Horman <nhorman@tuxdriver.com>
Date: Sat, 30 Jun 2012 08:26:47 -0400

> This is wrong.  It's a counter that increments every time we call sctp_make_sack,
> so that we can create a unique generation identifier for use in tagging which
> transports move ctsn in a given generation.  It saves us from having to iterate
> over a list every time we send a sack. 

Sorry, I missed the counter bump.

^ permalink raw reply

* Re: [net-next] e1000e: remove use of IP payload checksum
From: David Miller @ 2012-07-01  0:37 UTC (permalink / raw)
  To: ben; +Cc: jeffrey.t.kirsher, bruce.w.allan, netdev, gospo, sassmann
In-Reply-To: <1341092196.4852.43.camel@deadeye.wl.decadent.org.uk>

From: Ben Hutchings <ben@decadent.org.uk>
Date: Sat, 30 Jun 2012 22:36:36 +0100

> On Sat, 2012-06-30 at 03:35 -0700, Jeff Kirsher wrote:
>> From: Bruce Allan <bruce.w.allan@intel.com>
>> 
>> Currently only used when packet split mode is enabled with jumbo frames,
>> IP payload checksum (for fragmented UDP packets) is mutually exclusive with
>> receive hashing offload since the hardware uses the same space in the
>> receive descriptor for the hardware-provided packet checksum and the RSS
>> hash, respectively.  Users currently must disable jumbos when receive
>> hashing offload is enabled, or vice versa, because of this incompatibility.
>> Since testing has shown that IP payload checksum does not provide any real
>> benefit, just remove it so that there is no longer a choice between jumbos
>> or receive hashing offload but not both as done in other Intel GbE drivers
>> (e.g. e1000, igb).
>> 
>> Also, add a missing check for IP checksum error reported by the hardware;
>> let the stack verify the checksum when this happens.
> [...]
> 
> The change to enable RX hashing in 3.4, with this odd restriction seems
> to have broken most existing systems using jumbo MTU on e1000e.  None of
> the distro scripts or network management daemons will automatically
> change offload configuration before MTU; how could they know?
> 
> Therefore this needs to be fixed in 3.5 and 3.4.y, not net-next.

Agreed.

^ permalink raw reply

* Re: AF_BUS socket address family
From: David Miller @ 2012-07-01  0:33 UTC (permalink / raw)
  To: alan; +Cc: vincent.sanders, netdev, linux-kernel
In-Reply-To: <20120630141222.60df95a5@pyramind.ukuu.org.uk>

From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Sat, 30 Jun 2012 14:12:22 +0100

> In fact if you look up the stack you'll find a large number of multicast
> messaging systems which do reliable transport built on top of IP. In fact
> Red Hat provides a high level messaging cluster service that does exactly
> this. (as well as dbus which does it on the desktop level) plus a ton of
> stuff on top of that (JGroups etc)
> 
> Everybody at the application level has been using these 'receiver
> reliable'  multicast services for years (Websphere MQ, TIBCO, RTPGM,
> OpenPGM, MS-PGM, you name it). There are even accelerators for PGM based
> protocols in things like Cisco routers and Solarflare can do much of it
> on the card for 10Gbit.

The issue is that what to do when a receiver goes deaf is a policy
issue.

^ permalink raw reply

* Re: [patch -next] netfilter: use kfree_skb() not kfree()
From: David Miller @ 2012-07-01  0:27 UTC (permalink / raw)
  To: dan.carpenter
  Cc: netfilter, coreteam, netdev, bridge, kernel-janitors,
	bart.de.schuymer, netfilter-devel, shemminger, pablo
In-Reply-To: <20120630114853.GA22767@elgon.mountain>

From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Sat, 30 Jun 2012 14:48:53 +0300

> This should be a kfree_skb() here to free the sk_buff pointer.
> 
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

My bad, applied, thanks Dan.

^ permalink raw reply

* Re: [net-next] e1000e: remove use of IP payload checksum
From: Ben Hutchings @ 2012-06-30 21:36 UTC (permalink / raw)
  To: Jeff Kirsher; +Cc: davem, Bruce Allan, netdev, gospo, sassmann
In-Reply-To: <1341052528-2444-1-git-send-email-jeffrey.t.kirsher@intel.com>

[-- Attachment #1: Type: text/plain, Size: 1435 bytes --]

On Sat, 2012-06-30 at 03:35 -0700, Jeff Kirsher wrote:
> From: Bruce Allan <bruce.w.allan@intel.com>
> 
> Currently only used when packet split mode is enabled with jumbo frames,
> IP payload checksum (for fragmented UDP packets) is mutually exclusive with
> receive hashing offload since the hardware uses the same space in the
> receive descriptor for the hardware-provided packet checksum and the RSS
> hash, respectively.  Users currently must disable jumbos when receive
> hashing offload is enabled, or vice versa, because of this incompatibility.
> Since testing has shown that IP payload checksum does not provide any real
> benefit, just remove it so that there is no longer a choice between jumbos
> or receive hashing offload but not both as done in other Intel GbE drivers
> (e.g. e1000, igb).
> 
> Also, add a missing check for IP checksum error reported by the hardware;
> let the stack verify the checksum when this happens.
[...]

The change to enable RX hashing in 3.4, with this odd restriction seems
to have broken most existing systems using jumbo MTU on e1000e.  None of
the distro scripts or network management daemons will automatically
change offload configuration before MTU; how could they know?

Therefore this needs to be fixed in 3.5 and 3.4.y, not net-next.

Ben.

-- 
Ben Hutchings
Lowery's Law:
             If it jams, force it. If it breaks, it needed replacing anyway.

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 828 bytes --]

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox