Netdev List
 help / color / mirror / Atom feed
* [PATCHv2 RFC 3/4] virtio_net: limit xmit polling
From: Michael S. Tsirkin @ 2011-06-02 15:43 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: Krishna Kumar, Carsten Otte, lguest-uLR06cmDAlY/bJ5BZ2RsiQ,
	Shirley Ma, kvm-u79uwXL29TY76Z2rM5mHXA,
	linux-s390-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	habanero-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8, Heiko Carstens,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	steved-r/Jw6+rmf7HQT0dZR+AlfA, Christian Borntraeger,
	Tom Lendacky, Martin Schwidefsky, linux390-tA70FqPdS9bQT0dZR+AlfA
In-Reply-To: <cover.1307029008.git.mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

Current code might introduce a lot of latency variation
if there are many pending bufs at the time we
attempt to transmit a new one. This is bad for
real-time applications and can't be good for TCP either.

Free up just enough to both clean up all buffers
eventually and to be able to xmit the next packet.

Signed-off-by: Michael S. Tsirkin <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
 drivers/net/virtio_net.c |  106 +++++++++++++++++++++++++++++-----------------
 1 files changed, 67 insertions(+), 39 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index a0ee78d..b25db1c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -509,17 +509,33 @@ again:
 	return received;
 }
 
-static void free_old_xmit_skbs(struct virtnet_info *vi)
+static bool free_old_xmit_skb(struct virtnet_info *vi)
 {
 	struct sk_buff *skb;
 	unsigned int len;
 
-	while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
-		pr_debug("Sent skb %p\n", skb);
-		vi->dev->stats.tx_bytes += skb->len;
-		vi->dev->stats.tx_packets++;
-		dev_kfree_skb_any(skb);
-	}
+	skb = virtqueue_get_buf(vi->svq, &len);
+	if (unlikely(!skb))
+		return false;
+	pr_debug("Sent skb %p\n", skb);
+	vi->dev->stats.tx_bytes += skb->len;
+	vi->dev->stats.tx_packets++;
+	dev_kfree_skb_any(skb);
+	return true;
+}
+
+/* Check capacity and try to free enough pending old buffers to enable queueing
+ * new ones. Return true if we can guarantee that a following
+ * virtqueue_add_buf will succeed. */
+static bool free_xmit_capacity(struct virtnet_info *vi)
+{
+	struct sk_buff *skb;
+	unsigned int len;
+
+	while (virtqueue_min_capacity(vi->svq) < MAX_SKB_FRAGS + 2)
+		if (unlikely(!free_old_xmit_skb))
+			return false;
+	return true;
 }
 
 static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
@@ -572,30 +588,34 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
-	int capacity;
-
-	/* Free up any pending old buffers before queueing new ones. */
-	free_old_xmit_skbs(vi);
-
-	/* Try to transmit */
-	capacity = xmit_skb(vi, skb);
-
-	/* This can happen with OOM and indirect buffers. */
-	if (unlikely(capacity < 0)) {
-		if (net_ratelimit()) {
-			if (likely(capacity == -ENOMEM)) {
-				dev_warn(&dev->dev,
-					 "TX queue failure: out of memory\n");
-			} else {
-				dev->stats.tx_fifo_errors++;
+	int ret, i;
+
+	/* We normally do have space in the ring, so try to queue the skb as
+	 * fast as possible. */
+	ret = xmit_skb(vi, skb);
+	if (unlikely(ret < 0)) {
+		/* This triggers on the first xmit after ring full condition.
+		 * We need to free up some skbs first. */
+		if (likely(free_xmit_capacity(vi))) {
+			ret = xmit_skb(vi, skb);
+			/* This should never fail. Check, just in case. */
+			if (unlikely(ret < 0)) {
 				dev_warn(&dev->dev,
 					 "Unexpected TX queue failure: %d\n",
-					 capacity);
+					 ret);
+				dev->stats.tx_fifo_errors++;
+				dev->stats.tx_dropped++;
+				kfree_skb(skb);
+				return NETDEV_TX_OK;
 			}
+		} else {
+			/* Ring full: it might happen if we get a callback while
+			 * the queue is still mostly full. This should be
+			 * extremely rare. */
+			dev->stats.tx_dropped++;
+			kfree_skb(skb);
+			goto stop;
 		}
-		dev->stats.tx_dropped++;
-		kfree_skb(skb);
-		return NETDEV_TX_OK;
 	}
 	virtqueue_kick(vi->svq);
 
@@ -603,18 +623,26 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb_orphan(skb);
 	nf_reset(skb);
 
-	/* Apparently nice girls don't return TX_BUSY; stop the queue
-	 * before it gets out of hand.  Naturally, this wastes entries. */
-	if (capacity < 2+MAX_SKB_FRAGS) {
-		netif_stop_queue(dev);
-		if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
-			/* More just got used, free them then recheck. */
-			free_old_xmit_skbs(vi);
-			capacity = virtqueue_min_capacity(vi->svq);
-			if (capacity >= 2+MAX_SKB_FRAGS) {
-				netif_start_queue(dev);
-				virtqueue_disable_cb(vi->svq);
-			}
+	/* We transmit one skb, so try to free at least two pending skbs.
+	 * This is so that we don't hog the skb memory unnecessarily. *
+	 * Doing this after kick means there's a chance we'll free
+	 * the skb we have just sent, which is hot in cache. */
+	for (i = 0; i < 2; i++)
+		free_old_xmit_skb(v);
+
+	if (likely(free_xmit_capacity(vi)))
+		return NETDEV_TX_OK;
+
+stop:
+	/* Apparently nice girls don't return TX_BUSY; check capacity and stop
+	 * the queue before it gets out of hand.
+	 * Naturally, this wastes entries. */
+	netif_stop_queue(dev);
+	if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
+		/* More just got used, free them and recheck. */
+		if (free_xmit_capacity(vi)) {
+			netif_start_queue(dev);
+			virtqueue_disable_cb(vi->svq);
 		}
 	}
 
-- 
1.7.5.53.gc233e

^ permalink raw reply related

* [PATCHv2 RFC 4/4] Revert "virtio: make add_buf return capacity remaining:
From: Michael S. Tsirkin @ 2011-06-02 15:43 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: Krishna Kumar, Carsten Otte, lguest-uLR06cmDAlY/bJ5BZ2RsiQ,
	Shirley Ma, kvm-u79uwXL29TY76Z2rM5mHXA,
	linux-s390-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	habanero-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8, Heiko Carstens,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	steved-r/Jw6+rmf7HQT0dZR+AlfA, Christian Borntraeger,
	Tom Lendacky, Martin Schwidefsky, linux390-tA70FqPdS9bQT0dZR+AlfA
In-Reply-To: <cover.1307029008.git.mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

This reverts commit 3c1b27d5043086a485f8526353ae9fe37bfa1065.
The only user was virtio_net, and it switched to
min_capacity instead.

Signed-off-by: Michael S. Tsirkin <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
 drivers/virtio/virtio_ring.c |    2 +-
 include/linux/virtio.h       |    2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index 23422f1..a6c21eb 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -233,7 +233,7 @@ add_head:
 	pr_debug("Added buffer head %i to %p\n", head, vq);
 	END_USE(vq);
 
-	return vq->num_free;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(virtqueue_add_buf_gfp);
 
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 209220d..63c4908 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -34,7 +34,7 @@ struct virtqueue {
  *	in_num: the number of sg which are writable (after readable ones)
  *	data: the token identifying the buffer.
  *	gfp: how to do memory allocations (if necessary).
- *      Returns remaining capacity of queue (sg segments) or a negative error.
+ *      Returns 0 on success or a negative error.
  * virtqueue_kick: update after add_buf
  *	vq: the struct virtqueue
  *	After one or more add_buf calls, invoke this to kick the other side.
-- 
1.7.5.53.gc233e

^ permalink raw reply related

* Re: [PATCH RFC 3/3] virtio_net: limit xmit polling
From: Michael S. Tsirkin @ 2011-06-02 15:44 UTC (permalink / raw)
  To: Krishna Kumar2
  Cc: habanero-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	lguest-uLR06cmDAlY/bJ5BZ2RsiQ, Shirley Ma,
	kvm-u79uwXL29TY76Z2rM5mHXA, Carsten Otte,
	linux-s390-u79uwXL29TY76Z2rM5mHXA, Heiko Carstens,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	steved-r/Jw6+rmf7HQT0dZR+AlfA, Christian Borntraeger,
	Tom Lendacky, netdev-u79uwXL29TY76Z2rM5mHXA, Martin Schwidefsky,
	linux390-tA70FqPdS9bQT0dZR+AlfA
In-Reply-To: <OFFF109303.D838CAE2-ON652578A3.0053357D-652578A3.00549366-xthvdsQ13ZrQT0dZR+AlfA@public.gmane.org>

On Thu, Jun 02, 2011 at 08:56:42PM +0530, Krishna Kumar2 wrote:
> Return value in free_old_xmit() was wrong.

Could you check my latest RFC pls?

-- 
MST

^ permalink raw reply

* [PATCH] bridge: Forward EAPOL when STP off
From: Nick Carter @ 2011-06-02 15:59 UTC (permalink / raw)
  To: netdev; +Cc: benjamin.poirier, davem, shemminger

Signed-off-by: Nick Carter <ncarter100@gmail.com>

If STP is disabled then forward frames destined to the 802.1X PAE group
address (01-80-C2-00-00-03)

This change is required to support virtual machines running an 802.1X
supplicant and bridged to an ethernet interface.

This change has been tested and works fine with a range of supplicants.

I don't think this change will break 802.3ad bonding inside of a bridge.
[See commit f01cb5fbea1c1613621f9f32f385e12c1a29dde0
 Revert "bridge: Forward reserved group addresses if !STP"]
Bonding uses the IEEE Std 802.3ad Slow_Protocols_Multicast address
"#define MULTICAST_LACPDU_ADDR    {0x01, 0x80, 0xC2, 0x00, 0x00, 0x02}"
Which will not be caught by this patch.
---
 net/bridge/br_input.c |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index f3ac1e8..d6b4479 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -165,7 +165,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
 			goto drop;

 		/* If STP is turned off, then forward */
-		if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0)
+		if (p->br->stp_enabled == BR_NO_STP &&
+		    (dest[5] == 0 || /* Bridge group address */
+		     dest[5] == 3))  /* 802.1X PAE address */
 			goto forward;

 		if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev,
-- 
1.7.4.1

^ permalink raw reply related

* Re: [PATCHv2 RFC 0/4] virtio and vhost-net capacity handling
From: Michael S. Tsirkin @ 2011-06-02 17:17 UTC (permalink / raw)
  To: linux-kernel-u79uwXL29TY76Z2rM5mHXA
  Cc: Krishna Kumar, Carsten Otte, lguest-uLR06cmDAlY/bJ5BZ2RsiQ,
	Shirley Ma, kvm-u79uwXL29TY76Z2rM5mHXA,
	linux-s390-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	habanero-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8, Heiko Carstens,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	steved-r/Jw6+rmf7HQT0dZR+AlfA, Christian Borntraeger,
	Tom Lendacky, Martin Schwidefsky, linux390-tA70FqPdS9bQT0dZR+AlfA
In-Reply-To: <cover.1307029008.git.mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>

On Thu, Jun 02, 2011 at 06:42:35PM +0300, Michael S. Tsirkin wrote:
> OK, here's a new attempt to use the new capacity api.  I also added more
> comments to clarify the logic.  Hope this is more readable.  Let me know
> pls.
> 
> This is on top of the patches applied by Rusty.
> 
> Warning: untested. Posting now to give people chance to
> comment on the API.
> 
> Changes from v1:
> - fix comment in patch 2 to correct confusion noted by Rusty
> - rewrite patch 3 along the lines suggested by Rusty
>   note: it's not exactly the same but I hope it's close
>   enough, the main difference is that mine does limited
>   polling even in the unlikely xmit failure case.
> - added a patch to not return capacity from add_buf
>   it always looked like a weird hack
> 
> Michael S. Tsirkin (4):
>   virtio_ring: add capacity check API
>   virtio_net: fix tx capacity checks using new API
>   virtio_net: limit xmit polling
>   Revert "virtio: make add_buf return capacity remaining:
> 
>  drivers/net/virtio_net.c     |  111 ++++++++++++++++++++++++++----------------
>  drivers/virtio/virtio_ring.c |   10 +++-
>  include/linux/virtio.h       |    7 ++-
>  3 files changed, 84 insertions(+), 44 deletions(-)
> 
> -- 
> 1.7.5.53.gc233e


And just FYI, here's a patch (on top) that I considered but
decided against. This does a single get_buf before
xmit. I thought it's not really needed as the capacity
check in add_buf is relatively cheap, and we removed
the kick in xmit_skb. But the point is that the loop
will now be easy to move around if someone manages
to show this benefits speed (which I doubt).

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b25db1c..75af5be 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -590,6 +590,9 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct virtnet_info *vi = netdev_priv(dev);
 	int ret, i;
 
+	/* Try to pop an skb, to increase the chance we can add this one. */
+	free_old_xmit_skb(v);
+
 	/* We normally do have space in the ring, so try to queue the skb as
 	 * fast as possible. */
 	ret = xmit_skb(vi, skb);
@@ -627,8 +630,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * This is so that we don't hog the skb memory unnecessarily. *
 	 * Doing this after kick means there's a chance we'll free
 	 * the skb we have just sent, which is hot in cache. */
-	for (i = 0; i < 2; i++)
-		free_old_xmit_skb(v);
+	free_old_xmit_skb(v);
 
 	if (likely(free_xmit_capacity(vi)))
 		return NETDEV_TX_OK;

^ permalink raw reply related

* [PATCH] bonding: reset queue mapping prior to transmission to physical device
From: Neil Horman @ 2011-06-02 18:03 UTC (permalink / raw)
  To: netdev; +Cc: Neil Horman, Jay Vosburgh, Andy Gospodarek, David S. Miller

The bonding driver is multiqueue enabled, in which each queue represents a slave
to enable optional steering of output frames to given slaves against the default
output policy.  However, it needs to reset the skb->queue_mapping prior to
queuing to the physical device or the physical slave (if it is multiqueue) could
wind up transmitting on an unintended tx queue (one that was reserved for
specific traffic classes for instance)

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Jay Vosburgh <fubar@us.ibm.com>
CC: Andy Gospodarek <andy@greyhouse.net>
CC: "David S. Miller" <davem@davemloft.net>
---
 drivers/net/bonding/bond_main.c |    5 +++++
 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 17b4dd9..812c70c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4200,6 +4200,11 @@ static inline int bond_slave_override(struct bonding *bond,
 	/* If the slave isn't UP, use default transmit policy. */
 	if (slave && slave->queue_id && IS_UP(slave->dev) &&
 	    (slave->link == BOND_LINK_UP)) {
+		/*
+ 		 * Reset the queue mapping to allow the underlying slave	
+ 		 * the chance to re-select an appropriate tx queue
+ 		 */
+		skb_set_queue_mapping(skb, 0);
 		res = bond_dev_queue_xmit(bond, skb, slave->dev);
 	}
 
-- 
1.7.3.4


^ permalink raw reply related

* Re: [PATCHv2 RFC 3/4] virtio_net: limit xmit polling
From: Sridhar Samudrala @ 2011-06-02 18:09 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel, Rusty Russell, Carsten Otte, Christian Borntraeger,
	linux390, Martin Schwidefsky, Heiko Carstens, Shirley Ma, lguest,
	virtualization, netdev, linux-s390, kvm, Krishna Kumar,
	Tom Lendacky, steved, habanero
In-Reply-To: <a80199422de16ae355e56ee1b2abc9b2bf91a7f6.1307029009.git.mst@redhat.com>

On Thu, 2011-06-02 at 18:43 +0300, Michael S. Tsirkin wrote:
> Current code might introduce a lot of latency variation
> if there are many pending bufs at the time we
> attempt to transmit a new one. This is bad for
> real-time applications and can't be good for TCP either.
> 
> Free up just enough to both clean up all buffers
> eventually and to be able to xmit the next packet.
> 
> Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>  drivers/net/virtio_net.c |  106 +++++++++++++++++++++++++++++-----------------
>  1 files changed, 67 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index a0ee78d..b25db1c 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -509,17 +509,33 @@ again:
>  	return received;
>  }
> 
> -static void free_old_xmit_skbs(struct virtnet_info *vi)
> +static bool free_old_xmit_skb(struct virtnet_info *vi)
>  {
>  	struct sk_buff *skb;
>  	unsigned int len;
> 
> -	while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
> -		pr_debug("Sent skb %p\n", skb);
> -		vi->dev->stats.tx_bytes += skb->len;
> -		vi->dev->stats.tx_packets++;
> -		dev_kfree_skb_any(skb);
> -	}
> +	skb = virtqueue_get_buf(vi->svq, &len);
> +	if (unlikely(!skb))
> +		return false;
> +	pr_debug("Sent skb %p\n", skb);
> +	vi->dev->stats.tx_bytes += skb->len;
> +	vi->dev->stats.tx_packets++;
> +	dev_kfree_skb_any(skb);
> +	return true;
> +}
> +
> +/* Check capacity and try to free enough pending old buffers to enable queueing
> + * new ones. Return true if we can guarantee that a following
> + * virtqueue_add_buf will succeed. */
> +static bool free_xmit_capacity(struct virtnet_info *vi)
> +{
> +	struct sk_buff *skb;
> +	unsigned int len;
> +
> +	while (virtqueue_min_capacity(vi->svq) < MAX_SKB_FRAGS + 2)
> +		if (unlikely(!free_old_xmit_skb))
> +			return false;
If we are using INDIRECT descriptors, 1 descriptor entry is good enough
to guarantee that an skb can be queued unless we run out of memory.
Is it worth checking if 'indirect' is set on the svq and then only free
1 descriptor? Otherwise, we will be dropping the packet if there are
less than 18 free descriptors although we ony need 1.


> +	return true;
>  }
> 
>  static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
> @@ -572,30 +588,34 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
>  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct virtnet_info *vi = netdev_priv(dev);
> -	int capacity;
> -
> -	/* Free up any pending old buffers before queueing new ones. */
> -	free_old_xmit_skbs(vi);
> -
> -	/* Try to transmit */
> -	capacity = xmit_skb(vi, skb);
> -
> -	/* This can happen with OOM and indirect buffers. */
> -	if (unlikely(capacity < 0)) {
> -		if (net_ratelimit()) {
> -			if (likely(capacity == -ENOMEM)) {
> -				dev_warn(&dev->dev,
> -					 "TX queue failure: out of memory\n");
> -			} else {
> -				dev->stats.tx_fifo_errors++;
> +	int ret, i;
> +
> +	/* We normally do have space in the ring, so try to queue the skb as
> +	 * fast as possible. */
> +	ret = xmit_skb(vi, skb);
> +	if (unlikely(ret < 0)) {
> +		/* This triggers on the first xmit after ring full condition.
> +		 * We need to free up some skbs first. */
> +		if (likely(free_xmit_capacity(vi))) {
> +			ret = xmit_skb(vi, skb);
> +			/* This should never fail. Check, just in case. */
> +			if (unlikely(ret < 0)) {
>  				dev_warn(&dev->dev,
>  					 "Unexpected TX queue failure: %d\n",
> -					 capacity);
> +					 ret);
> +				dev->stats.tx_fifo_errors++;
> +				dev->stats.tx_dropped++;
> +				kfree_skb(skb);
> +				return NETDEV_TX_OK;
>  			}
> +		} else {
> +			/* Ring full: it might happen if we get a callback while
> +			 * the queue is still mostly full. This should be
> +			 * extremely rare. */
> +			dev->stats.tx_dropped++;
> +			kfree_skb(skb);
> +			goto stop;
>  		}
> -		dev->stats.tx_dropped++;
> -		kfree_skb(skb);
> -		return NETDEV_TX_OK;
>  	}
>  	virtqueue_kick(vi->svq);
> 
> @@ -603,18 +623,26 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	skb_orphan(skb);
>  	nf_reset(skb);
> 
> -	/* Apparently nice girls don't return TX_BUSY; stop the queue
> -	 * before it gets out of hand.  Naturally, this wastes entries. */
> -	if (capacity < 2+MAX_SKB_FRAGS) {
> -		netif_stop_queue(dev);
> -		if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
> -			/* More just got used, free them then recheck. */
> -			free_old_xmit_skbs(vi);
> -			capacity = virtqueue_min_capacity(vi->svq);
> -			if (capacity >= 2+MAX_SKB_FRAGS) {
> -				netif_start_queue(dev);
> -				virtqueue_disable_cb(vi->svq);
> -			}
> +	/* We transmit one skb, so try to free at least two pending skbs.
> +	 * This is so that we don't hog the skb memory unnecessarily. *
> +	 * Doing this after kick means there's a chance we'll free
> +	 * the skb we have just sent, which is hot in cache. */
> +	for (i = 0; i < 2; i++)
> +		free_old_xmit_skb(v);
> +
> +	if (likely(free_xmit_capacity(vi)))
> +		return NETDEV_TX_OK;
> +
> +stop:
> +	/* Apparently nice girls don't return TX_BUSY; check capacity and stop
> +	 * the queue before it gets out of hand.
> +	 * Naturally, this wastes entries. */
> +	netif_stop_queue(dev);
> +	if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
> +		/* More just got used, free them and recheck. */
> +		if (free_xmit_capacity(vi)) {
> +			netif_start_queue(dev);
> +			virtqueue_disable_cb(vi->svq);
>  		}
>  	}
> 

^ permalink raw reply

* [PATCHv5] ethtool: Added FW dump support
From: Anirban Chakraborty @ 2011-06-02 18:27 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev, davem, Anirban Chakraborty

Ben,

I've updated the patch with your review comments. Please apply.

Thanks,
Anirban

---------------
Added support to take FW dump via ethtool.

Changes from v4:
Removed updates to ethtool-copy.h file
Added check to compare length of data to copy vs. actual data length copied
in dump path.
Fixed return values in error path.
Fixed documentation issues

Changes since v3:
Updated documentation for ethtool_dump data structure

Changes from v2:
Folded get dump flag and data into one option
Added man page support

Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
---
 ethtool.c |  119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 118 insertions(+), 1 deletions(-)

diff --git a/ethtool.c b/ethtool.c
index b6fa6bd..699a24c 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -28,6 +28,7 @@
 #include <stdlib.h>
 #include <sys/stat.h>
 #include <stdio.h>
+#include <stddef.h>
 #include <errno.h>
 #include <sys/utsname.h>
 #include <limits.h>
@@ -96,6 +97,8 @@ static int do_srxclsrule(int fd, struct ifreq *ifr);
 static int do_grxclsrule(int fd, struct ifreq *ifr);
 static int do_flash(int fd, struct ifreq *ifr);
 static int do_permaddr(int fd, struct ifreq *ifr);
+static int do_getfwdump(int fd, struct ifreq *ifr);
+static int do_setfwdump(int fd, struct ifreq *ifr);
 
 static int send_ioctl(int fd, struct ifreq *ifr);
 
@@ -128,6 +131,8 @@ static enum {
 	MODE_GCLSRULE,
 	MODE_FLASHDEV,
 	MODE_PERMADDR,
+	MODE_SET_DUMP,
+	MODE_GET_DUMP,
 } mode = MODE_GSET;
 
 static struct option {
@@ -255,6 +260,12 @@ static struct option {
 		"		[ rule %d ]\n"},
     { "-P", "--show-permaddr", MODE_PERMADDR,
 		"Show permanent hardware address" },
+    { "-w", "--get-dump", MODE_GET_DUMP,
+		"Get dump flag, data",
+		"		[ data FILENAME ]\n" },
+    { "-W", "--set-dump", MODE_SET_DUMP,
+		"Set dump flag of the device",
+		"		N\n"},
     { "-h", "--help", MODE_HELP, "Show this help" },
     { NULL, "--version", MODE_VERSION, "Show version number" },
     {}
@@ -385,6 +396,8 @@ static int flash_region = -1;
 static int msglvl_changed;
 static u32 msglvl_wanted = 0;
 static u32 msglvl_mask = 0;
+static u32 dump_flag;
+static char *dump_file = NULL;
 
 static int rx_class_rule_get = -1;
 static int rx_class_rule_del = -1;
@@ -777,7 +790,9 @@ static void parse_cmdline(int argc, char **argp)
 			    (mode == MODE_GCLSRULE) ||
 			    (mode == MODE_PHYS_ID) ||
 			    (mode == MODE_FLASHDEV) ||
-			    (mode == MODE_PERMADDR)) {
+			    (mode == MODE_PERMADDR) ||
+			    (mode == MODE_SET_DUMP) ||
+			    (mode == MODE_GET_DUMP)) {
 				devname = argp[i];
 				break;
 			}
@@ -799,6 +814,9 @@ static void parse_cmdline(int argc, char **argp)
 				flash_file = argp[i];
 				flash = 1;
 				break;
+			} else if (mode == MODE_SET_DUMP) {
+				dump_flag = get_u32(argp[i], 0);
+				break;
 			}
 			/* fallthrough */
 		default:
@@ -974,6 +992,21 @@ static void parse_cmdline(int argc, char **argp)
 				}
 				break;
 			}
+			if (mode == MODE_GET_DUMP) {
+				if (argc != i + 2) {
+					exit_bad_args();
+					break;
+				}
+				if (!strcmp(argp[i++], "data"))
+					dump_flag = ETHTOOL_GET_DUMP_DATA;
+				else {
+					exit_bad_args();
+					break;
+				}
+				dump_file = argp[i];
+				i = argc;
+				break;
+			}
 			if (mode != MODE_SSET)
 				exit_bad_args();
 			if (!strcmp(argp[i], "speed")) {
@@ -1898,6 +1931,10 @@ static int doit(void)
 		return do_flash(fd, &ifr);
 	} else if (mode == MODE_PERMADDR) {
 		return do_permaddr(fd, &ifr);
+	} else if (mode == MODE_GET_DUMP) {
+		return do_getfwdump(fd, &ifr);
+	} else if (mode == MODE_SET_DUMP) {
+		return do_setfwdump(fd, &ifr);
 	}
 
 	return 69;
@@ -3204,6 +3241,86 @@ static int do_grxclsrule(int fd, struct ifreq *ifr)
 	return err ? 1 : 0;
 }
 
+static int do_writefwdump(struct ethtool_dump *dump)
+{
+	int err = 0;
+	FILE *f;
+	size_t bytes;
+
+	f = fopen(dump_file, "wb+");
+
+	if (!f) {
+		fprintf(stderr, "Can't open file %s: %s\n",
+			dump_file, strerror(errno));
+		return 1;
+	}
+	bytes = fwrite(dump->data, 1, dump->len, f);
+	if (bytes != dump->len) {
+		fprintf(stderr, "Can not write all of dump data\n");
+		err = 1;
+	}
+	if (fclose(f)) {
+		fprintf(stderr, "Can't close file %s: %s\n",
+			dump_file, strerror(errno));
+		err = 1;
+	}
+	return err;
+}
+
+static int do_getfwdump(int fd, struct ifreq *ifr)
+{
+	int err;
+	struct ethtool_dump edata;
+	struct ethtool_dump *data;
+
+	edata.cmd = ETHTOOL_GET_DUMP_FLAG;
+
+	ifr->ifr_data = (caddr_t) &edata;
+	err = send_ioctl(fd, ifr);
+	if (err < 0) {
+		perror("Can not get dump level\n");
+		return 1;
+	}
+	if (dump_flag != ETHTOOL_GET_DUMP_DATA) {
+		fprintf(stdout, "flag: %u, version: %u, length: %u\n",
+			edata.flag, edata.version, edata.len);
+		return 0;
+	}
+	data = calloc(1, offsetof(struct ethtool_dump, data) + edata.len);
+	if (!data) {
+		perror("Can not allocate enough memory\n");
+		return 1;
+	}
+	data->cmd = ETHTOOL_GET_DUMP_DATA;
+	data->len = edata.len;
+	ifr->ifr_data = (caddr_t) data;
+	err = send_ioctl(fd, ifr);
+	if (err < 0) {
+		perror("Can not get dump data\n");
+		goto free;
+	}
+	err = do_writefwdump(data);
+free:
+	free(data);
+	return err;
+}
+
+static int do_setfwdump(int fd, struct ifreq *ifr)
+{
+	int err;
+	struct ethtool_dump dump;
+
+	dump.cmd = ETHTOOL_SET_DUMP;
+	dump.flag = dump_flag;
+	ifr->ifr_data = (caddr_t)&dump;
+	err = send_ioctl(fd, ifr);
+	if (err < 0) {
+		perror("Can not set dump level\n");
+		return 1;
+	}
+	return 0;
+}
+
 static int send_ioctl(int fd, struct ifreq *ifr)
 {
 	return ioctl(fd, SIOCETHTOOL, ifr);
-- 
1.7.4.1


^ permalink raw reply related

* Re: [PATCH] bonding: reset queue mapping prior to transmission to physical device
From: Ben Hutchings @ 2011-06-02 18:35 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, Jay Vosburgh, Andy Gospodarek, David S. Miller
In-Reply-To: <1307037799-32315-1-git-send-email-nhorman@tuxdriver.com>

On Thu, 2011-06-02 at 14:03 -0400, Neil Horman wrote:
> The bonding driver is multiqueue enabled, in which each queue represents a slave
> to enable optional steering of output frames to given slaves against the default
> output policy.  However, it needs to reset the skb->queue_mapping prior to
> queuing to the physical device or the physical slave (if it is multiqueue) could
> wind up transmitting on an unintended tx queue (one that was reserved for
> specific traffic classes for instance)
>
> Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> CC: Jay Vosburgh <fubar@us.ibm.com>
> CC: Andy Gospodarek <andy@greyhouse.net>
> CC: "David S. Miller" <davem@davemloft.net>
> ---
>  drivers/net/bonding/bond_main.c |    5 +++++
>  1 files changed, 5 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index 17b4dd9..812c70c 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -4200,6 +4200,11 @@ static inline int bond_slave_override(struct bonding *bond,
>  	/* If the slave isn't UP, use default transmit policy. */
>  	if (slave && slave->queue_id && IS_UP(slave->dev) &&
>  	    (slave->link == BOND_LINK_UP)) {
> +		/*
> + 		 * Reset the queue mapping to allow the underlying slave	
> + 		 * the chance to re-select an appropriate tx queue
> + 		 */
> +		skb_set_queue_mapping(skb, 0);
>  		res = bond_dev_queue_xmit(bond, skb, slave->dev);
>  	}
>  

So far as I can see, this has no effect, because dev_queue_xmit() always
sets queue_mapping (in dev_pick_tx()).

What is the problem you're seeing?

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* [ANNOUNCE] IPTV-Analyzer project released v0.9.2
From: Jesper Dangaard Brouer @ 2011-06-02 18:37 UTC (permalink / raw)
  To: hawk

The new release 0.9.2 of the IPTV-Analyzer project is coming to you
from Montreal, Canada.

 The new feature is SNMP traps, for no_signal detection.

The collector sends SNMP "no_signal" traps, when the state
transition from, signal to no-signal, and back from no-signal to
signal.  The SNMP traps are described via an compliant MIB.

 Homepage:
  http://www.iptv-analyzer.org

 Git tree:
  https://github.com/netoptimizer/IPTV-Analyzer

The IPTV-Analyzer is a continuous/real-time tool for analyzing the
contents of MPEG2 Transport Stream (TS) packets, which is commonly
used for IPTV multicast signals. The main purpose is continuous
quality measurement, with a focus on detecting MPEG2 TS/CC packet
drops.

 Bugreports and feature requests please use:
  https://github.com/netoptimizer/IPTV-Analyzer/issues

-- 
Best regards,
  Jesper Dangaard Brouer
  ComX Networks A/S
  Linux Network Kernel Developer
  Cand. Scient Datalog / MSc.CS
  Author of http://adsl-optimizer.dk
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply

* Re: [PATCHv5] ethtool: Added FW dump support
From: Ben Hutchings @ 2011-06-02 18:40 UTC (permalink / raw)
  To: Anirban Chakraborty; +Cc: netdev, davem
In-Reply-To: <1307039239-15196-1-git-send-email-anirban.chakraborty@qlogic.com>

On Thu, 2011-06-02 at 11:27 -0700, Anirban Chakraborty wrote:
> Ben,
> 
> I've updated the patch with your review comments. Please apply.

You haven't included the manual page changes.  Maybe you edited
ethtool.8, which is generated from ethtool.8.in (substituting the
version string)?  You need to edit and commit ethtool.8.in.

[...]
> +static int do_getfwdump(int fd, struct ifreq *ifr)
> +{
> +	int err;
> +	struct ethtool_dump edata;
> +	struct ethtool_dump *data;
> +
> +	edata.cmd = ETHTOOL_GET_DUMP_FLAG;
> +
> +	ifr->ifr_data = (caddr_t) &edata;
> +	err = send_ioctl(fd, ifr);
> +	if (err < 0) {
> +		perror("Can not get dump level\n");
> +		return 1;
> +	}
> +	if (dump_flag != ETHTOOL_GET_DUMP_DATA) {
> +		fprintf(stdout, "flag: %u, version: %u, length: %u\n",
> +			edata.flag, edata.version, edata.len);
> +		return 0;
> +	}
> +	data = calloc(1, offsetof(struct ethtool_dump, data) + edata.len);
> +	if (!data) {
> +		perror("Can not allocate enough memory\n");
> +		return 1;
> +	}
> +	data->cmd = ETHTOOL_GET_DUMP_DATA;
> +	data->len = edata.len;
> +	ifr->ifr_data = (caddr_t) data;
> +	err = send_ioctl(fd, ifr);
> +	if (err < 0) {
> +		perror("Can not get dump data\n");
> +		goto free;
> +	}
> +	err = do_writefwdump(data);
> +free:
> +	free(data);
> +	return err;
> +}
[...]

I think the last statement should be:
	return err ? 1 : 0;
Otherwise we may return -1.

Otherwise, this looks good.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCH] bonding: reset queue mapping prior to transmission to physical device
From: Neil Horman @ 2011-06-02 18:56 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, Jay Vosburgh, Andy Gospodarek, David S. Miller
In-Reply-To: <1307039753.2812.16.camel@bwh-desktop>

On Thu, Jun 02, 2011 at 07:35:53PM +0100, Ben Hutchings wrote:
> On Thu, 2011-06-02 at 14:03 -0400, Neil Horman wrote:
> > The bonding driver is multiqueue enabled, in which each queue represents a slave
> > to enable optional steering of output frames to given slaves against the default
> > output policy.  However, it needs to reset the skb->queue_mapping prior to
> > queuing to the physical device or the physical slave (if it is multiqueue) could
> > wind up transmitting on an unintended tx queue (one that was reserved for
> > specific traffic classes for instance)
> >
> > Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
> > CC: Jay Vosburgh <fubar@us.ibm.com>
> > CC: Andy Gospodarek <andy@greyhouse.net>
> > CC: "David S. Miller" <davem@davemloft.net>
> > ---
> >  drivers/net/bonding/bond_main.c |    5 +++++
> >  1 files changed, 5 insertions(+), 0 deletions(-)
> > 
> > diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> > index 17b4dd9..812c70c 100644
> > --- a/drivers/net/bonding/bond_main.c
> > +++ b/drivers/net/bonding/bond_main.c
> > @@ -4200,6 +4200,11 @@ static inline int bond_slave_override(struct bonding *bond,
> >  	/* If the slave isn't UP, use default transmit policy. */
> >  	if (slave && slave->queue_id && IS_UP(slave->dev) &&
> >  	    (slave->link == BOND_LINK_UP)) {
> > +		/*
> > + 		 * Reset the queue mapping to allow the underlying slave	
> > + 		 * the chance to re-select an appropriate tx queue
> > + 		 */
> > +		skb_set_queue_mapping(skb, 0);
> >  		res = bond_dev_queue_xmit(bond, skb, slave->dev);
> >  	}
> >  
> 
> So far as I can see, this has no effect, because dev_queue_xmit() always
> sets queue_mapping (in dev_pick_tx()).
> 
it resets the queue mapping exactly as you would expect it to.  bonding is a
multiqueue enabled device and selects a potentially non-zero queue based on the
output of bond_select_queue.

> What is the problem you're seeing?
> 
The problem is exctly that.  dev_pick_tx() on the bond device sets the
queue_mapping as per the result of bond_select_queue (the ndo_select_queue
method for the bonding driver).  The implementation there is based on the use of
tc with bonding, so that output slaves can be selected for certain types of
traffic.  But when that mechanism is used, skb->queue_mapping is preserved when
the bonding driver queues the frame to the underlying slave.  This denies the
slave (if its also a multiqueue device) the opportunity to reselect the queue
properly, because of this call path:

bond_queue_xmit
 dev_queue_xmit(slave_dev)
  dev_pick_tx()
   skb_tx_hash()
    __skb_tx_hash()

__skb_tx_hash sees that skb_queue_recorded returns true, and assigns a hardware queue mapping
based on what the bonding driver chose using its own internal logic.  Since
bonding uses the multiqueue infrastructure to do slave output selection without
any regard for slave output queue selection, it seems to me we should really
reset the queue mapping to zero so the slave device can pick its own tx queue.

Neil

> Ben.
> 
> -- 
> Ben Hutchings, Senior Software Engineer, Solarflare
> Not speaking for my employer; that's the marketing department's job.
> They asked us to note that Solarflare product names are trademarked.
> 
> 

^ permalink raw reply

* [PATCH v2 1/5] ep93xx: set DMA masks for the ep93xx_eth
From: Mika Westerberg @ 2011-06-02 18:59 UTC (permalink / raw)
  To: netdev; +Cc: linux-arm-kernel, kernel, hsweeten, ryan, Mika Westerberg

As the driver is now passing platform device to the DMA mapping functions,
we should give it valid DMA masks.

Signed-off-by: Mika Westerberg <mika.westerberg@iki.fi>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
Changes to previous version are:
	- moved this patch to be first
	- added Russell's acks

 arch/arm/mach-ep93xx/core.c |    6 +++++-
 1 files changed, 5 insertions(+), 1 deletions(-)

diff --git a/arch/arm/mach-ep93xx/core.c b/arch/arm/mach-ep93xx/core.c
index 8207954..1d4b65f 100644
--- a/arch/arm/mach-ep93xx/core.c
+++ b/arch/arm/mach-ep93xx/core.c
@@ -402,11 +402,15 @@ static struct resource ep93xx_eth_resource[] = {
 	}
 };
 
+static u64 ep93xx_eth_dma_mask = DMA_BIT_MASK(32);
+
 static struct platform_device ep93xx_eth_device = {
 	.name		= "ep93xx-eth",
 	.id		= -1,
 	.dev		= {
-		.platform_data	= &ep93xx_eth_data,
+		.platform_data		= &ep93xx_eth_data,
+		.coherent_dma_mask	= DMA_BIT_MASK(32),
+		.dma_mask		= &ep93xx_eth_dma_mask,
 	},
 	.num_resources	= ARRAY_SIZE(ep93xx_eth_resource),
 	.resource	= ep93xx_eth_resource,
-- 
1.7.4.4


^ permalink raw reply related

* [PATCH v2 2/5] net: ep93xx_eth: pass struct device to DMA API functions
From: Mika Westerberg @ 2011-06-02 18:59 UTC (permalink / raw)
  To: netdev; +Cc: linux-arm-kernel, kernel, hsweeten, ryan, Mika Westerberg
In-Reply-To: <f20d8fe52f823e7e65423720c53f2935d3e08967.1307040443.git.mika.westerberg@iki.fi>

We shouldn't use NULL for any DMA API functions, unless we are dealing with
ISA or EISA device. So pass correct struct dev pointer to these functions.

Signed-off-by: Mika Westerberg <mika.westerberg@iki.fi>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/net/arm/ep93xx_eth.c |   26 ++++++++++++++++----------
 1 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/net/arm/ep93xx_eth.c b/drivers/net/arm/ep93xx_eth.c
index 5a77001..8779d3b 100644
--- a/drivers/net/arm/ep93xx_eth.c
+++ b/drivers/net/arm/ep93xx_eth.c
@@ -159,6 +159,8 @@ struct ep93xx_priv
 	void __iomem		*base_addr;
 	int			irq;
 
+	struct device		*dma_dev;
+
 	struct ep93xx_descs	*descs;
 	dma_addr_t		descs_dma_addr;
 
@@ -284,7 +286,8 @@ static int ep93xx_rx(struct net_device *dev, int processed, int budget)
 		skb = dev_alloc_skb(length + 2);
 		if (likely(skb != NULL)) {
 			skb_reserve(skb, 2);
-			dma_sync_single_for_cpu(NULL, ep->descs->rdesc[entry].buf_addr,
+			dma_sync_single_for_cpu(ep->dma_dev,
+						ep->descs->rdesc[entry].buf_addr,
 						length, DMA_FROM_DEVICE);
 			skb_copy_to_linear_data(skb, ep->rx_buf[entry], length);
 			skb_put(skb, length);
@@ -362,7 +365,7 @@ static int ep93xx_xmit(struct sk_buff *skb, struct net_device *dev)
 	ep->descs->tdesc[entry].tdesc1 =
 		TDESC1_EOF | (entry << 16) | (skb->len & 0xfff);
 	skb_copy_and_csum_dev(skb, ep->tx_buf[entry]);
-	dma_sync_single_for_cpu(NULL, ep->descs->tdesc[entry].buf_addr,
+	dma_sync_single_for_cpu(ep->dma_dev, ep->descs->tdesc[entry].buf_addr,
 				skb->len, DMA_TO_DEVICE);
 	dev_kfree_skb(skb);
 
@@ -457,6 +460,7 @@ static irqreturn_t ep93xx_irq(int irq, void *dev_id)
 
 static void ep93xx_free_buffers(struct ep93xx_priv *ep)
 {
+	struct device *dev = ep->dma_dev;
 	int i;
 
 	for (i = 0; i < RX_QUEUE_ENTRIES; i += 2) {
@@ -464,7 +468,7 @@ static void ep93xx_free_buffers(struct ep93xx_priv *ep)
 
 		d = ep->descs->rdesc[i].buf_addr;
 		if (d)
-			dma_unmap_single(NULL, d, PAGE_SIZE, DMA_FROM_DEVICE);
+			dma_unmap_single(dev, d, PAGE_SIZE, DMA_FROM_DEVICE);
 
 		if (ep->rx_buf[i] != NULL)
 			free_page((unsigned long)ep->rx_buf[i]);
@@ -475,13 +479,13 @@ static void ep93xx_free_buffers(struct ep93xx_priv *ep)
 
 		d = ep->descs->tdesc[i].buf_addr;
 		if (d)
-			dma_unmap_single(NULL, d, PAGE_SIZE, DMA_TO_DEVICE);
+			dma_unmap_single(dev, d, PAGE_SIZE, DMA_TO_DEVICE);
 
 		if (ep->tx_buf[i] != NULL)
 			free_page((unsigned long)ep->tx_buf[i]);
 	}
 
-	dma_free_coherent(NULL, sizeof(struct ep93xx_descs), ep->descs,
+	dma_free_coherent(dev, sizeof(struct ep93xx_descs), ep->descs,
 							ep->descs_dma_addr);
 }
 
@@ -491,9 +495,10 @@ static void ep93xx_free_buffers(struct ep93xx_priv *ep)
  */
 static int ep93xx_alloc_buffers(struct ep93xx_priv *ep)
 {
+	struct device *dev = ep->dma_dev;
 	int i;
 
-	ep->descs = dma_alloc_coherent(NULL, sizeof(struct ep93xx_descs),
+	ep->descs = dma_alloc_coherent(dev, sizeof(struct ep93xx_descs),
 				&ep->descs_dma_addr, GFP_KERNEL | GFP_DMA);
 	if (ep->descs == NULL)
 		return 1;
@@ -506,8 +511,8 @@ static int ep93xx_alloc_buffers(struct ep93xx_priv *ep)
 		if (page == NULL)
 			goto err;
 
-		d = dma_map_single(NULL, page, PAGE_SIZE, DMA_FROM_DEVICE);
-		if (dma_mapping_error(NULL, d)) {
+		d = dma_map_single(dev, page, PAGE_SIZE, DMA_FROM_DEVICE);
+		if (dma_mapping_error(dev, d)) {
 			free_page((unsigned long)page);
 			goto err;
 		}
@@ -529,8 +534,8 @@ static int ep93xx_alloc_buffers(struct ep93xx_priv *ep)
 		if (page == NULL)
 			goto err;
 
-		d = dma_map_single(NULL, page, PAGE_SIZE, DMA_TO_DEVICE);
-		if (dma_mapping_error(NULL, d)) {
+		d = dma_map_single(dev, page, PAGE_SIZE, DMA_TO_DEVICE);
+		if (dma_mapping_error(dev, d)) {
 			free_page((unsigned long)page);
 			goto err;
 		}
@@ -829,6 +834,7 @@ static int ep93xx_eth_probe(struct platform_device *pdev)
 	}
 	ep = netdev_priv(dev);
 	ep->dev = dev;
+	ep->dma_dev = &pdev->dev;
 	netif_napi_add(dev, &ep->napi, ep93xx_poll, 64);
 
 	platform_set_drvdata(pdev, dev);
-- 
1.7.4.4


^ permalink raw reply related

* [PATCH v2 3/5] net: ep93xx_eth: allocate buffers using kmalloc()
From: Mika Westerberg @ 2011-06-02 18:59 UTC (permalink / raw)
  To: netdev; +Cc: linux-arm-kernel, kernel, hsweeten, ryan, Mika Westerberg
In-Reply-To: <f20d8fe52f823e7e65423720c53f2935d3e08967.1307040443.git.mika.westerberg@iki.fi>

We can use simply kmalloc() to allocate the buffers. This also simplifies the
code and allows us to perform DMA sync operations more easily.

Memory is allocated with only GFP_KERNEL since there are no DMA allocation
restrictions on this platform.

Signed-off-by: Mika Westerberg <mika.westerberg@iki.fi>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/net/arm/ep93xx_eth.c |   51 ++++++++++++++++-------------------------
 1 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/drivers/net/arm/ep93xx_eth.c b/drivers/net/arm/ep93xx_eth.c
index 8779d3b..0c9df11 100644
--- a/drivers/net/arm/ep93xx_eth.c
+++ b/drivers/net/arm/ep93xx_eth.c
@@ -463,36 +463,32 @@ static void ep93xx_free_buffers(struct ep93xx_priv *ep)
 	struct device *dev = ep->dma_dev;
 	int i;
 
-	for (i = 0; i < RX_QUEUE_ENTRIES; i += 2) {
+	for (i = 0; i < RX_QUEUE_ENTRIES; i++) {
 		dma_addr_t d;
 
 		d = ep->descs->rdesc[i].buf_addr;
 		if (d)
-			dma_unmap_single(dev, d, PAGE_SIZE, DMA_FROM_DEVICE);
+			dma_unmap_single(dev, d, PKT_BUF_SIZE, DMA_FROM_DEVICE);
 
 		if (ep->rx_buf[i] != NULL)
-			free_page((unsigned long)ep->rx_buf[i]);
+			kfree(ep->rx_buf[i]);
 	}
 
-	for (i = 0; i < TX_QUEUE_ENTRIES; i += 2) {
+	for (i = 0; i < TX_QUEUE_ENTRIES; i++) {
 		dma_addr_t d;
 
 		d = ep->descs->tdesc[i].buf_addr;
 		if (d)
-			dma_unmap_single(dev, d, PAGE_SIZE, DMA_TO_DEVICE);
+			dma_unmap_single(dev, d, PKT_BUF_SIZE, DMA_TO_DEVICE);
 
 		if (ep->tx_buf[i] != NULL)
-			free_page((unsigned long)ep->tx_buf[i]);
+			kfree(ep->tx_buf[i]);
 	}
 
 	dma_free_coherent(dev, sizeof(struct ep93xx_descs), ep->descs,
 							ep->descs_dma_addr);
 }
 
-/*
- * The hardware enforces a sub-2K maximum packet size, so we put
- * two buffers on every hardware page.
- */
 static int ep93xx_alloc_buffers(struct ep93xx_priv *ep)
 {
 	struct device *dev = ep->dma_dev;
@@ -503,48 +499,41 @@ static int ep93xx_alloc_buffers(struct ep93xx_priv *ep)
 	if (ep->descs == NULL)
 		return 1;
 
-	for (i = 0; i < RX_QUEUE_ENTRIES; i += 2) {
-		void *page;
+	for (i = 0; i < RX_QUEUE_ENTRIES; i++) {
+		void *buf;
 		dma_addr_t d;
 
-		page = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
-		if (page == NULL)
+		buf = kmalloc(PKT_BUF_SIZE, GFP_KERNEL);
+		if (buf == NULL)
 			goto err;
 
-		d = dma_map_single(dev, page, PAGE_SIZE, DMA_FROM_DEVICE);
+		d = dma_map_single(dev, buf, PKT_BUF_SIZE, DMA_FROM_DEVICE);
 		if (dma_mapping_error(dev, d)) {
-			free_page((unsigned long)page);
+			kfree(buf);
 			goto err;
 		}
 
-		ep->rx_buf[i] = page;
+		ep->rx_buf[i] = buf;
 		ep->descs->rdesc[i].buf_addr = d;
 		ep->descs->rdesc[i].rdesc1 = (i << 16) | PKT_BUF_SIZE;
-
-		ep->rx_buf[i + 1] = page + PKT_BUF_SIZE;
-		ep->descs->rdesc[i + 1].buf_addr = d + PKT_BUF_SIZE;
-		ep->descs->rdesc[i + 1].rdesc1 = ((i + 1) << 16) | PKT_BUF_SIZE;
 	}
 
-	for (i = 0; i < TX_QUEUE_ENTRIES; i += 2) {
-		void *page;
+	for (i = 0; i < TX_QUEUE_ENTRIES; i++) {
+		void *buf;
 		dma_addr_t d;
 
-		page = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);
-		if (page == NULL)
+		buf = kmalloc(PKT_BUF_SIZE, GFP_KERNEL);
+		if (buf == NULL)
 			goto err;
 
-		d = dma_map_single(dev, page, PAGE_SIZE, DMA_TO_DEVICE);
+		d = dma_map_single(dev, buf, PKT_BUF_SIZE, DMA_TO_DEVICE);
 		if (dma_mapping_error(dev, d)) {
-			free_page((unsigned long)page);
+			kfree(buf);
 			goto err;
 		}
 
-		ep->tx_buf[i] = page;
+		ep->tx_buf[i] = buf;
 		ep->descs->tdesc[i].buf_addr = d;
-
-		ep->tx_buf[i + 1] = page + PKT_BUF_SIZE;
-		ep->descs->tdesc[i + 1].buf_addr = d + PKT_BUF_SIZE;
 	}
 
 	return 0;
-- 
1.7.4.4


^ permalink raw reply related

* [PATCH v2 4/5] net: ep93xx_eth: drop GFP_DMA from call to dma_alloc_coherent()
From: Mika Westerberg @ 2011-06-02 18:59 UTC (permalink / raw)
  To: netdev; +Cc: linux-arm-kernel, kernel, hsweeten, ryan, Mika Westerberg
In-Reply-To: <f20d8fe52f823e7e65423720c53f2935d3e08967.1307040443.git.mika.westerberg@iki.fi>

Commit a197b59ae6e8 (mm: fail GFP_DMA allocations when ZONE_DMA is not
configured) made page allocator to return NULL if GFP_DMA is set but
CONFIG_ZONE_DMA is disabled.

This causes ep93xx_eth to fail:

 WARNING: at mm/page_alloc.c:2251 __alloc_pages_nodemask+0x11c/0x638()
 Modules linked in:
 [<c0035498>] (unwind_backtrace+0x0/0xf4) from [<c0043da4>] (warn_slowpath_common+0x48/0x60)
 [<c0043da4>] (warn_slowpath_common+0x48/0x60) from [<c0043dd8>] (warn_slowpath_null+0x1c/0x24)
 [<c0043dd8>] (warn_slowpath_null+0x1c/0x24) from [<c0083b6c>] (__alloc_pages_nodemask+0x11c/0x638)
 [<c0083b6c>] (__alloc_pages_nodemask+0x11c/0x638) from [<c00366fc>] (__dma_alloc+0x8c/0x3ec)
 [<c00366fc>] (__dma_alloc+0x8c/0x3ec) from [<c0036adc>] (dma_alloc_coherent+0x54/0x60)
 [<c0036adc>] (dma_alloc_coherent+0x54/0x60) from [<c0227808>] (ep93xx_open+0x20/0x864)
 [<c0227808>] (ep93xx_open+0x20/0x864) from [<c0283144>] (__dev_open+0xb8/0x108)
 [<c0283144>] (__dev_open+0xb8/0x108) from [<c0280528>] (__dev_change_flags+0x70/0x128)
 [<c0280528>] (__dev_change_flags+0x70/0x128) from [<c0283054>] (dev_change_flags+0x10/0x48)
 [<c0283054>] (dev_change_flags+0x10/0x48) from [<c001a720>] (ip_auto_config+0x190/0xf68)
 [<c001a720>] (ip_auto_config+0x190/0xf68) from [<c00233b0>] (do_one_initcall+0x34/0x18c)
 [<c00233b0>] (do_one_initcall+0x34/0x18c) from [<c0008400>] (kernel_init+0x94/0x134)
 [<c0008400>] (kernel_init+0x94/0x134) from [<c0030858>] (kernel_thread_exit+0x0/0x8)

Since there is no restrictions for DMA on ep93xx, we can fix this by just
removing the GFP_DMA flag from the call.

Signed-off-by: Mika Westerberg <mika.westerberg@iki.fi>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/net/arm/ep93xx_eth.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/arm/ep93xx_eth.c b/drivers/net/arm/ep93xx_eth.c
index 0c9df11..56b51a1 100644
--- a/drivers/net/arm/ep93xx_eth.c
+++ b/drivers/net/arm/ep93xx_eth.c
@@ -495,7 +495,7 @@ static int ep93xx_alloc_buffers(struct ep93xx_priv *ep)
 	int i;
 
 	ep->descs = dma_alloc_coherent(dev, sizeof(struct ep93xx_descs),
-				&ep->descs_dma_addr, GFP_KERNEL | GFP_DMA);
+				&ep->descs_dma_addr, GFP_KERNEL);
 	if (ep->descs == NULL)
 		return 1;
 
-- 
1.7.4.4


^ permalink raw reply related

* [PATCH v2 5/5] net: ep93xx_eth: fix DMA API violations
From: Mika Westerberg @ 2011-06-02 18:59 UTC (permalink / raw)
  To: netdev; +Cc: linux-arm-kernel, kernel, hsweeten, ryan, Mika Westerberg
In-Reply-To: <f20d8fe52f823e7e65423720c53f2935d3e08967.1307040443.git.mika.westerberg@iki.fi>

Russell King said:
>
> So, to summarize what its doing:
>
> 1. It allocates buffers for rx and tx.
> 2. It maps them with dma_map_single().
>       This transfers ownership of the buffer to the DMA device.
> 3. In ep93xx_xmit,
> 3a. It copies the data into the buffer with skb_copy_and_csum_dev()
>       This violates the DMA buffer ownership rules - the CPU should
>       not be writing to this buffer while it is (in principle) owned
>       by the DMA device.
> 3b. It then calls dma_sync_single_for_cpu() for the buffer.
>       This transfers ownership of the buffer to the CPU, which surely
>       is the wrong direction.
> 4. In ep93xx_rx,
> 4a. It calls dma_sync_single_for_cpu() for the buffer.
>       This at least transfers the DMA buffer ownership to the CPU
>       before the CPU reads the buffer
> 4b. It then uses skb_copy_to_linear_data() to copy the data out.
>       At no point does it transfer ownership back to the DMA device.
> 5. When the driver is removed, it dma_unmap_single()'s the buffer.
>       This transfers ownership of the buffer to the CPU.
> 6. It frees the buffer.
>
> While it may work on ep93xx, it's not respecting the DMA API rules,
> and with DMA debugging enabled it will probably encounter quite a few
> warnings.

This patch fixes these violations.

Signed-off-by: Mika Westerberg <mika.westerberg@iki.fi>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 drivers/net/arm/ep93xx_eth.c |   19 +++++++++++++------
 1 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/net/arm/ep93xx_eth.c b/drivers/net/arm/ep93xx_eth.c
index 56b51a1..c2a7847 100644
--- a/drivers/net/arm/ep93xx_eth.c
+++ b/drivers/net/arm/ep93xx_eth.c
@@ -285,11 +285,14 @@ static int ep93xx_rx(struct net_device *dev, int processed, int budget)
 
 		skb = dev_alloc_skb(length + 2);
 		if (likely(skb != NULL)) {
+			struct ep93xx_rdesc *rxd = &ep->descs->rdesc[entry];
 			skb_reserve(skb, 2);
-			dma_sync_single_for_cpu(ep->dma_dev,
-						ep->descs->rdesc[entry].buf_addr,
+
+			dma_sync_single_for_cpu(ep->dma_dev, rxd->buf_addr,
 						length, DMA_FROM_DEVICE);
 			skb_copy_to_linear_data(skb, ep->rx_buf[entry], length);
+			dma_sync_single_for_device(ep->dma_dev, rxd->buf_addr,
+						length, DMA_FROM_DEVICE);
 			skb_put(skb, length);
 			skb->protocol = eth_type_trans(skb, dev);
 
@@ -351,6 +354,7 @@ poll_some_more:
 static int ep93xx_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ep93xx_priv *ep = netdev_priv(dev);
+	struct ep93xx_tdesc *txd;
 	int entry;
 
 	if (unlikely(skb->len > MAX_PKT_SIZE)) {
@@ -362,11 +366,14 @@ static int ep93xx_xmit(struct sk_buff *skb, struct net_device *dev)
 	entry = ep->tx_pointer;
 	ep->tx_pointer = (ep->tx_pointer + 1) & (TX_QUEUE_ENTRIES - 1);
 
-	ep->descs->tdesc[entry].tdesc1 =
-		TDESC1_EOF | (entry << 16) | (skb->len & 0xfff);
+	txd = &ep->descs->tdesc[entry];
+
+	txd->tdesc1 = TDESC1_EOF | (entry << 16) | (skb->len & 0xfff);
+	dma_sync_single_for_cpu(ep->dma_dev, txd->buf_addr, skb->len,
+				DMA_TO_DEVICE);
 	skb_copy_and_csum_dev(skb, ep->tx_buf[entry]);
-	dma_sync_single_for_cpu(ep->dma_dev, ep->descs->tdesc[entry].buf_addr,
-				skb->len, DMA_TO_DEVICE);
+	dma_sync_single_for_device(ep->dma_dev, txd->buf_addr, skb->len,
+				   DMA_TO_DEVICE);
 	dev_kfree_skb(skb);
 
 	spin_lock_irq(&ep->tx_pending_lock);
-- 
1.7.4.4


^ permalink raw reply related

* [PATCHv6] ethtool: Added FW dump support
From: Anirban Chakraborty @ 2011-06-02 19:00 UTC (permalink / raw)
  To: bhutchings; +Cc: netdev, davem, Anirban Chakraborty

Ben,

Thanks. This time I actually committed ethtool.8.in.

-Anirban

-----

Added support to take FW dump via ethtool.

Changes from v5:
Actually added the ethtool.8.in diffs.
Took care of the return value in get dump data path.

Changes from v4:
Removed updates to ethtool-copy.h file
Added check to compare length of data to copy vs. actual data length copied
in dump path.
Fixed return values in error path.
Fixed documentation issues

Changes since v3:
Updated documentation for ethtool_dump data structure

Changes from v2:
Folded get dump flag and data into one option
Added man page support

Signed-off-by: Anirban Chakraborty <anirban.chakraborty@qlogic.com>
---
 ethtool.8.in |   20 ++++++++++
 ethtool.c    |  120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 139 insertions(+), 1 deletions(-)

diff --git a/ethtool.8.in b/ethtool.8.in
index 42d923b..7b1cdf5 100644
--- a/ethtool.8.in
+++ b/ethtool.8.in
@@ -254,6 +254,15 @@ ethtool \- query or control network driver and hardware settings
 .I ethX
 .RB [ rx\-flow\-hash \ \*(FL \  \*(HO]
 .HP
+.B ethtool \-w|\-\-get\-dump
+.I ethX
+.RB [ data
+.IR filename ]
+.HP
+.B ethtool\ \-W|\-\-set\-dump
+.I ethX
+.BI \ N
+.HP
 .B ethtool \-x|\-\-show\-rxfh\-indir
 .I ethX
 .HP
@@ -595,6 +604,17 @@ other options are ignored.
 T}
 .TE
 .TP
+.B \-w \-\-get\-dump
+Retrieves and prints firmware dump for the specified network device.
+By default, it prints out the dump flag, version and length of the dump data.
+When
+.I data
+is indicated, then ethtool fetches the dump data and directs it to a
+.I file.
+.TP
+.B \-W \-\-set\-dump
+Sets the dump flag for the device.
+.TP
 .B \-x \-\-show\-rxfh\-indir
 Retrieves the receive flow hash indirection table.
 .TP
diff --git a/ethtool.c b/ethtool.c
index b6fa6bd..49614e2 100644
--- a/ethtool.c
+++ b/ethtool.c
@@ -28,6 +28,7 @@
 #include <stdlib.h>
 #include <sys/stat.h>
 #include <stdio.h>
+#include <stddef.h>
 #include <errno.h>
 #include <sys/utsname.h>
 #include <limits.h>
@@ -96,6 +97,8 @@ static int do_srxclsrule(int fd, struct ifreq *ifr);
 static int do_grxclsrule(int fd, struct ifreq *ifr);
 static int do_flash(int fd, struct ifreq *ifr);
 static int do_permaddr(int fd, struct ifreq *ifr);
+static int do_getfwdump(int fd, struct ifreq *ifr);
+static int do_setfwdump(int fd, struct ifreq *ifr);
 
 static int send_ioctl(int fd, struct ifreq *ifr);
 
@@ -128,6 +131,8 @@ static enum {
 	MODE_GCLSRULE,
 	MODE_FLASHDEV,
 	MODE_PERMADDR,
+	MODE_SET_DUMP,
+	MODE_GET_DUMP,
 } mode = MODE_GSET;
 
 static struct option {
@@ -255,6 +260,12 @@ static struct option {
 		"		[ rule %d ]\n"},
     { "-P", "--show-permaddr", MODE_PERMADDR,
 		"Show permanent hardware address" },
+    { "-w", "--get-dump", MODE_GET_DUMP,
+		"Get dump flag, data",
+		"		[ data FILENAME ]\n" },
+    { "-W", "--set-dump", MODE_SET_DUMP,
+		"Set dump flag of the device",
+		"		N\n"},
     { "-h", "--help", MODE_HELP, "Show this help" },
     { NULL, "--version", MODE_VERSION, "Show version number" },
     {}
@@ -385,6 +396,8 @@ static int flash_region = -1;
 static int msglvl_changed;
 static u32 msglvl_wanted = 0;
 static u32 msglvl_mask = 0;
+static u32 dump_flag;
+static char *dump_file = NULL;
 
 static int rx_class_rule_get = -1;
 static int rx_class_rule_del = -1;
@@ -777,7 +790,9 @@ static void parse_cmdline(int argc, char **argp)
 			    (mode == MODE_GCLSRULE) ||
 			    (mode == MODE_PHYS_ID) ||
 			    (mode == MODE_FLASHDEV) ||
-			    (mode == MODE_PERMADDR)) {
+			    (mode == MODE_PERMADDR) ||
+			    (mode == MODE_SET_DUMP) ||
+			    (mode == MODE_GET_DUMP)) {
 				devname = argp[i];
 				break;
 			}
@@ -799,6 +814,9 @@ static void parse_cmdline(int argc, char **argp)
 				flash_file = argp[i];
 				flash = 1;
 				break;
+			} else if (mode == MODE_SET_DUMP) {
+				dump_flag = get_u32(argp[i], 0);
+				break;
 			}
 			/* fallthrough */
 		default:
@@ -974,6 +992,21 @@ static void parse_cmdline(int argc, char **argp)
 				}
 				break;
 			}
+			if (mode == MODE_GET_DUMP) {
+				if (argc != i + 2) {
+					exit_bad_args();
+					break;
+				}
+				if (!strcmp(argp[i++], "data"))
+					dump_flag = ETHTOOL_GET_DUMP_DATA;
+				else {
+					exit_bad_args();
+					break;
+				}
+				dump_file = argp[i];
+				i = argc;
+				break;
+			}
 			if (mode != MODE_SSET)
 				exit_bad_args();
 			if (!strcmp(argp[i], "speed")) {
@@ -1898,6 +1931,10 @@ static int doit(void)
 		return do_flash(fd, &ifr);
 	} else if (mode == MODE_PERMADDR) {
 		return do_permaddr(fd, &ifr);
+	} else if (mode == MODE_GET_DUMP) {
+		return do_getfwdump(fd, &ifr);
+	} else if (mode == MODE_SET_DUMP) {
+		return do_setfwdump(fd, &ifr);
 	}
 
 	return 69;
@@ -3204,6 +3241,87 @@ static int do_grxclsrule(int fd, struct ifreq *ifr)
 	return err ? 1 : 0;
 }
 
+static int do_writefwdump(struct ethtool_dump *dump)
+{
+	int err = 0;
+	FILE *f;
+	size_t bytes;
+
+	f = fopen(dump_file, "wb+");
+
+	if (!f) {
+		fprintf(stderr, "Can't open file %s: %s\n",
+			dump_file, strerror(errno));
+		return 1;
+	}
+	bytes = fwrite(dump->data, 1, dump->len, f);
+	if (bytes != dump->len) {
+		fprintf(stderr, "Can not write all of dump data\n");
+		err = 1;
+	}
+	if (fclose(f)) {
+		fprintf(stderr, "Can't close file %s: %s\n",
+			dump_file, strerror(errno));
+		err = 1;
+	}
+	return err;
+}
+
+static int do_getfwdump(int fd, struct ifreq *ifr)
+{
+	int err;
+	struct ethtool_dump edata;
+	struct ethtool_dump *data;
+
+	edata.cmd = ETHTOOL_GET_DUMP_FLAG;
+
+	ifr->ifr_data = (caddr_t) &edata;
+	err = send_ioctl(fd, ifr);
+	if (err < 0) {
+		perror("Can not get dump level\n");
+		return 1;
+	}
+	if (dump_flag != ETHTOOL_GET_DUMP_DATA) {
+		fprintf(stdout, "flag: %u, version: %u, length: %u\n",
+			edata.flag, edata.version, edata.len);
+		return 0;
+	}
+	data = calloc(1, offsetof(struct ethtool_dump, data) + edata.len);
+	if (!data) {
+		perror("Can not allocate enough memory\n");
+		return 1;
+	}
+	data->cmd = ETHTOOL_GET_DUMP_DATA;
+	data->len = edata.len;
+	ifr->ifr_data = (caddr_t) data;
+	err = send_ioctl(fd, ifr);
+	if (err < 0) {
+		perror("Can not get dump data\n");
+		err = 1;
+		goto free;
+	}
+	err = do_writefwdump(data);
+free:
+	free(data);
+	return err;
+}
+
+static int do_setfwdump(int fd, struct ifreq *ifr)
+{
+	int err;
+	struct ethtool_dump dump;
+
+	dump.cmd = ETHTOOL_SET_DUMP;
+	dump.flag = dump_flag;
+	ifr->ifr_data = (caddr_t)&dump;
+	err = send_ioctl(fd, ifr);
+	if (err < 0) {
+		perror("Can not set dump level\n");
+		return 1;
+	}
+	return 0;
+}
+
 static int send_ioctl(int fd, struct ifreq *ifr)
 {
 	return ioctl(fd, SIOCETHTOOL, ifr);
-- 
1.7.4.1


^ permalink raw reply related

* Re: [PATCH] bonding: reset queue mapping prior to transmission to physical device
From: Ben Hutchings @ 2011-06-02 19:09 UTC (permalink / raw)
  To: Neil Horman; +Cc: netdev, Jay Vosburgh, Andy Gospodarek, David S. Miller
In-Reply-To: <20110602185659.GA2749@hmsreliant.think-freely.org>

On Thu, 2011-06-02 at 14:56 -0400, Neil Horman wrote:
> On Thu, Jun 02, 2011 at 07:35:53PM +0100, Ben Hutchings wrote:
> > On Thu, 2011-06-02 at 14:03 -0400, Neil Horman wrote:
> > > The bonding driver is multiqueue enabled, in which each queue represents a slave
> > > to enable optional steering of output frames to given slaves against the default
> > > output policy.  However, it needs to reset the skb->queue_mapping prior to
> > > queuing to the physical device or the physical slave (if it is multiqueue) could
> > > wind up transmitting on an unintended tx queue (one that was reserved for
> > > specific traffic classes for instance)
[...]
> > So far as I can see, this has no effect, because dev_queue_xmit() always
> > sets queue_mapping (in dev_pick_tx()).
> > 
> it resets the queue mapping exactly as you would expect it to.  bonding is a
> multiqueue enabled device and selects a potentially non-zero queue based on the
> output of bond_select_queue.
> 
> > What is the problem you're seeing?
> > 
> The problem is exctly that.  dev_pick_tx() on the bond device sets the
> queue_mapping as per the result of bond_select_queue (the ndo_select_queue
> method for the bonding driver).  The implementation there is based on the use of
> tc with bonding, so that output slaves can be selected for certain types of
> traffic.  But when that mechanism is used, skb->queue_mapping is preserved when
> the bonding driver queues the frame to the underlying slave.  This denies the
> slave (if its also a multiqueue device) the opportunity to reselect the queue
> properly, because of this call path:
> 
> bond_queue_xmit
>  dev_queue_xmit(slave_dev)
>   dev_pick_tx()
>    skb_tx_hash()
>     __skb_tx_hash()
> 
> __skb_tx_hash sees that skb_queue_recorded returns true, and assigns a hardware queue mapping
> based on what the bonding driver chose using its own internal logic.  Since
> bonding uses the multiqueue infrastructure to do slave output selection without
> any regard for slave output queue selection, it seems to me we should really
> reset the queue mapping to zero so the slave device can pick its own tx queue.

So you're effectively clearing the *RX queue* number (as this is before
dev_pick_tx()) in order to influence TX queue selection.

Here, the bonding device seems to be behaving as a forwarding device.
If TX queue selection can go wrong for certain combinations of queue
configuration when forwarding, then this is a problem for IP forwarding
and bridging as well, isn't it?

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* Re: [PATCHv6] ethtool: Added FW dump support
From: Ben Hutchings @ 2011-06-02 19:19 UTC (permalink / raw)
  To: Anirban Chakraborty; +Cc: netdev, davem
In-Reply-To: <1307041226-11998-1-git-send-email-anirban.chakraborty@qlogic.com>

On Thu, 2011-06-02 at 12:00 -0700, Anirban Chakraborty wrote:
> Ben,
> 
> Thanks. This time I actually committed ethtool.8.in.
[...]

Applied, thanks.

Ben.

-- 
Ben Hutchings, Senior Software Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.


^ permalink raw reply

* forwarding pings out ingress interface
From: Jeff Haran @ 2011-06-02 19:10 UTC (permalink / raw)
  To: netdev

Running Redhat Enterprize 6.0, seeing the following behavior.

Two Ethernet network segments: 172.30/16 and 169.254.160/24.

Three "devices":

[linux] is the blade running RHEL6.0. It has multiple network interfaces
and has IP forwarding enabled.
[host] is an IP host of some sort.
[lb] is an L3 load balancer.

They are connected like so:

[host] 172.30.0.254 <-> 172.30.0.2 [lb] 169.254.160.20 <-> 169.254.160.1
[linux]

I have not shown the other network interfaces that [linux] is attached
to for brevity in the diagram.

One of the peculiarities of [lb] is if [host] pings it at 172.30.0.2, it
will not respond to the ping itself but instead forward the ping to
[linux]. I realize this is broken, but I have to deal with it (its
cooked into [lb]'s silicon). [lb]'s vendor assures me that if [linux]
would only forward the ping back to [lb] out the same interface it came
in on, then [lb] will generate a ping response back to [host].

My problem is [linux] won't forward the ping back to [lb]. [linux] has a
route to 172.30/16 out the interface that connects it to [lb] and I can
see from tcpdump that [linux] is getting the ICMP Echo requests with
source address 172.30.0.254 and destination address 172.30.0.2, but
nothing gets transmitted out that interface in response.

Now if I modify the route on [linux] to so that packets it receives that
are destined to 172.30/16 are forwarded out another of its network
interfaces, it forwards the pings out that interface just fine. It seems
like [linux] just doesn't want to forward pings it receives on a given
interface back out that same interface.

I am wondering if there is some switch under /proc or /sys that I need
to modify to get [linux] to forward the pings out the interface.
Is there such a thing that defaults to preventing this for security
reasons maybe?

In digging into this I discovered that /proc/1/net/stat/rt_cache was
reporting some non-zero statistics under the "in_martian_src" source
column. I figured maybe that [linux]'s stack was for some reason
determining that these pings were martians and dropping them so I
echo'ed 1 to log_martians. What I then see is a bunch of logs like this:

2011-06-01T17:55:51.103140-07:00 s01b01 kernel: martian source
169.254.160.7 from 169.254.160.6, on dev eth3
2011-06-01T17:55:51.103147-07:00 s01b01 kernel: ll header:
ff:ff:ff:ff:ff:ff:e4:1f:13:f6:60:d8:08:06

I don't know if this is related to the immediate ping problem, there are
other linux hosts on 169.254.160/24 that I've omitted from the diagram,
but I am truly perplexed at the lines beginning with "ll header".
Reading the code that generates these logs, ip_handle_martian_source()
in net/ipv4/route.c, it suggests that what is being shown in those "ll
header" lines is the MAC header of these packets it considers martians.

What is perplexing is that "08:06" at the end of the line. That's the
Ethertype for ARP, not IP. Why are ARP frames being treated as IP
packets? They shouldn't even be getting to ip_handle_martian_source() if
I am following the receive path correctly.

Just thought somebody on this list might have some insights into what I
am seeing here.

Thanks,

Jeff Haran
Bytemobile







^ permalink raw reply

* Re: [PATCHv2 RFC 3/4] virtio_net: limit xmit polling
From: Michael S. Tsirkin @ 2011-06-02 19:23 UTC (permalink / raw)
  To: Sridhar Samudrala
  Cc: Krishna Kumar, Carsten Otte, lguest-uLR06cmDAlY/bJ5BZ2RsiQ,
	Shirley Ma, kvm-u79uwXL29TY76Z2rM5mHXA,
	linux-s390-u79uwXL29TY76Z2rM5mHXA, netdev-u79uwXL29TY76Z2rM5mHXA,
	habanero-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8, Heiko Carstens,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	virtualization-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	steved-r/Jw6+rmf7HQT0dZR+AlfA, Christian Borntraeger,
	Tom Lendacky, Martin Schwidefsky, linux390-tA70FqPdS9bQT0dZR+AlfA
In-Reply-To: <1307038193.2321.38.camel-gzECXbv5Qf5llG67XfQwVQGz3MnDiSnoQQ4Iyu8u01E@public.gmane.org>

On Thu, Jun 02, 2011 at 11:09:53AM -0700, Sridhar Samudrala wrote:
> On Thu, 2011-06-02 at 18:43 +0300, Michael S. Tsirkin wrote:
> > Current code might introduce a lot of latency variation
> > if there are many pending bufs at the time we
> > attempt to transmit a new one. This is bad for
> > real-time applications and can't be good for TCP either.
> > 
> > Free up just enough to both clean up all buffers
> > eventually and to be able to xmit the next packet.
> > 
> > Signed-off-by: Michael S. Tsirkin <mst-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> > ---
> >  drivers/net/virtio_net.c |  106 +++++++++++++++++++++++++++++-----------------
> >  1 files changed, 67 insertions(+), 39 deletions(-)
> > 
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index a0ee78d..b25db1c 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -509,17 +509,33 @@ again:
> >  	return received;
> >  }
> > 
> > -static void free_old_xmit_skbs(struct virtnet_info *vi)
> > +static bool free_old_xmit_skb(struct virtnet_info *vi)
> >  {
> >  	struct sk_buff *skb;
> >  	unsigned int len;
> > 
> > -	while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
> > -		pr_debug("Sent skb %p\n", skb);
> > -		vi->dev->stats.tx_bytes += skb->len;
> > -		vi->dev->stats.tx_packets++;
> > -		dev_kfree_skb_any(skb);
> > -	}
> > +	skb = virtqueue_get_buf(vi->svq, &len);
> > +	if (unlikely(!skb))
> > +		return false;
> > +	pr_debug("Sent skb %p\n", skb);
> > +	vi->dev->stats.tx_bytes += skb->len;
> > +	vi->dev->stats.tx_packets++;
> > +	dev_kfree_skb_any(skb);
> > +	return true;
> > +}
> > +
> > +/* Check capacity and try to free enough pending old buffers to enable queueing
> > + * new ones. Return true if we can guarantee that a following
> > + * virtqueue_add_buf will succeed. */
> > +static bool free_xmit_capacity(struct virtnet_info *vi)
> > +{
> > +	struct sk_buff *skb;
> > +	unsigned int len;
> > +
> > +	while (virtqueue_min_capacity(vi->svq) < MAX_SKB_FRAGS + 2)
> > +		if (unlikely(!free_old_xmit_skb))
> > +			return false;
> If we are using INDIRECT descriptors, 1 descriptor entry is good enough
> to guarantee that an skb can be queued unless we run out of memory.
> Is it worth checking if 'indirect' is set on the svq and then only free
> 1 descriptor? Otherwise, we will be dropping the packet if there are
> less than 18 free descriptors although we ony need 1.

This is not introduced with this patch: the
issue is that we need to consider worst case
as indirect memory allocation can fail.


I don't think it matters much: 18 out of 256
descriptors is not too expensive.

> > +	return true;
> >  }
> > 
> >  static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
> > @@ -572,30 +588,34 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
> >  static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> >  {
> >  	struct virtnet_info *vi = netdev_priv(dev);
> > -	int capacity;
> > -
> > -	/* Free up any pending old buffers before queueing new ones. */
> > -	free_old_xmit_skbs(vi);
> > -
> > -	/* Try to transmit */
> > -	capacity = xmit_skb(vi, skb);
> > -
> > -	/* This can happen with OOM and indirect buffers. */
> > -	if (unlikely(capacity < 0)) {
> > -		if (net_ratelimit()) {
> > -			if (likely(capacity == -ENOMEM)) {
> > -				dev_warn(&dev->dev,
> > -					 "TX queue failure: out of memory\n");
> > -			} else {
> > -				dev->stats.tx_fifo_errors++;
> > +	int ret, i;
> > +
> > +	/* We normally do have space in the ring, so try to queue the skb as
> > +	 * fast as possible. */
> > +	ret = xmit_skb(vi, skb);
> > +	if (unlikely(ret < 0)) {
> > +		/* This triggers on the first xmit after ring full condition.
> > +		 * We need to free up some skbs first. */
> > +		if (likely(free_xmit_capacity(vi))) {
> > +			ret = xmit_skb(vi, skb);
> > +			/* This should never fail. Check, just in case. */
> > +			if (unlikely(ret < 0)) {
> >  				dev_warn(&dev->dev,
> >  					 "Unexpected TX queue failure: %d\n",
> > -					 capacity);
> > +					 ret);
> > +				dev->stats.tx_fifo_errors++;
> > +				dev->stats.tx_dropped++;
> > +				kfree_skb(skb);
> > +				return NETDEV_TX_OK;
> >  			}
> > +		} else {
> > +			/* Ring full: it might happen if we get a callback while
> > +			 * the queue is still mostly full. This should be
> > +			 * extremely rare. */
> > +			dev->stats.tx_dropped++;
> > +			kfree_skb(skb);
> > +			goto stop;
> >  		}
> > -		dev->stats.tx_dropped++;
> > -		kfree_skb(skb);
> > -		return NETDEV_TX_OK;
> >  	}
> >  	virtqueue_kick(vi->svq);
> > 
> > @@ -603,18 +623,26 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> >  	skb_orphan(skb);
> >  	nf_reset(skb);
> > 
> > -	/* Apparently nice girls don't return TX_BUSY; stop the queue
> > -	 * before it gets out of hand.  Naturally, this wastes entries. */
> > -	if (capacity < 2+MAX_SKB_FRAGS) {
> > -		netif_stop_queue(dev);
> > -		if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
> > -			/* More just got used, free them then recheck. */
> > -			free_old_xmit_skbs(vi);
> > -			capacity = virtqueue_min_capacity(vi->svq);
> > -			if (capacity >= 2+MAX_SKB_FRAGS) {
> > -				netif_start_queue(dev);
> > -				virtqueue_disable_cb(vi->svq);
> > -			}
> > +	/* We transmit one skb, so try to free at least two pending skbs.
> > +	 * This is so that we don't hog the skb memory unnecessarily. *
> > +	 * Doing this after kick means there's a chance we'll free
> > +	 * the skb we have just sent, which is hot in cache. */
> > +	for (i = 0; i < 2; i++)
> > +		free_old_xmit_skb(v);
> > +
> > +	if (likely(free_xmit_capacity(vi)))
> > +		return NETDEV_TX_OK;
> > +
> > +stop:
> > +	/* Apparently nice girls don't return TX_BUSY; check capacity and stop
> > +	 * the queue before it gets out of hand.
> > +	 * Naturally, this wastes entries. */
> > +	netif_stop_queue(dev);
> > +	if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
> > +		/* More just got used, free them and recheck. */
> > +		if (free_xmit_capacity(vi)) {
> > +			netif_start_queue(dev);
> > +			virtqueue_disable_cb(vi->svq);
> >  		}
> >  	}
> > 

^ permalink raw reply

* Re: [PATCH] Use unsigned variables for packet lengths in ip[6]_queue.
From: Dave Jones @ 2011-06-02 19:24 UTC (permalink / raw)
  To: netdev
In-Reply-To: <20110528003651.GA8380@redhat.com>

On Fri, May 27, 2011 at 08:36:51PM -0400, Dave Jones wrote:

 >  > > Not catastrophic, but ipqueue seems to be too trusting of what it gets
 >  > > passed from userspace, and passes it on down to the page allocator,
 >  > > where it will spew warnings if the page order is too high.
 >  > > 
 >  > > __ipq_rcv_skb has several checks for lengths too small, but doesn't
 >  > > seem to have any for oversized ones.   I'm not sure what the maximum
 >  > > we should check for is. I'll code up a diff if anyone has any ideas
 >  > > on a sane maximum.
 >  > 
 >  > Maybe the thing to do is to simply pass __GFP_NOWARN to nlmsg_new()
 >  > in netlink_ack()?
 >  > 
 >  > Anyone else have a better idea?
 > 
 > So I went back to this today, and found something that doesn't look right.
 > After adding some instrumentation, and re-running my tests, I found that
 > the reason we were blowing up with enormous allocations was that we
 > were passing down a nlmsglen's like -1061109568
 > 
 > Is there any reason for that to be signed ?
 > The nlmsg_len entry of nlmsghdr is a u32, so I'm assuming this is a bug.
 > 
 > With the patch below, I haven't been able to reproduce the problem, but
 > I don't know if I've inadvertantly broken some other behaviour somewhere
 > deeper in netlink where this is valid.

any feedback on this ? am I barking up the wrong tree ?

	Dave


^ permalink raw reply

* Re: [Bugme-new] [Bug 36562] New: e1000 network driver causes kernel oops
From: Andrew Morton @ 2011-06-02 19:32 UTC (permalink / raw)
  To: netdev; +Cc: bugzilla-daemon, bugme-daemon, bryan.christ
In-Reply-To: <bug-36562-10286@https.bugzilla.kernel.org/>


(switched to email.  Please respond via emailed reply-to-all, not via the
bugzilla web interface).

On Thu, 2 Jun 2011 19:20:08 GMT
bugzilla-daemon@bugzilla.kernel.org wrote:

> https://bugzilla.kernel.org/show_bug.cgi?id=36562
> 
>            Summary: e1000 network driver causes kernel oops
>            Product: Drivers
>            Version: 2.5
>     Kernel Version: 2.6.38.6-26.rc1.fc14.x86_64
>           Platform: All
>         OS/Version: Linux
>               Tree: Fedora
>             Status: NEW
>           Severity: high
>           Priority: P1
>          Component: Network
>         AssignedTo: drivers_network@kernel-bugs.osdl.org
>         ReportedBy: bryan.christ@gmail.com
>         Regression: No
> 
> 
> After running for about 12-48 hours we typically have several servers which
> kernel oops on the e1000
> 
> http://www.mediafire.com/imgbnc.php/e65c4a4b220096cabfa8746195898c1c5520b65cdde108921f027a309c179f6a6g.jpg
> 

It crashed deep down in TCP.  That may not be e1000-related.

I think we still have two drivers: e1000 and e1000e.  You're using
e1000e.


^ permalink raw reply

* Re: [PATCH] bonding: reset queue mapping prior to transmission to physical device
From: Neil Horman @ 2011-06-02 19:46 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, Jay Vosburgh, Andy Gospodarek, David S. Miller
In-Reply-To: <1307041789.2812.27.camel@bwh-desktop>

On Thu, Jun 02, 2011 at 08:09:49PM +0100, Ben Hutchings wrote:
> On Thu, 2011-06-02 at 14:56 -0400, Neil Horman wrote:
> > On Thu, Jun 02, 2011 at 07:35:53PM +0100, Ben Hutchings wrote:
> > > On Thu, 2011-06-02 at 14:03 -0400, Neil Horman wrote:
> > > > The bonding driver is multiqueue enabled, in which each queue represents a slave
> > > > to enable optional steering of output frames to given slaves against the default
> > > > output policy.  However, it needs to reset the skb->queue_mapping prior to
> > > > queuing to the physical device or the physical slave (if it is multiqueue) could
> > > > wind up transmitting on an unintended tx queue (one that was reserved for
> > > > specific traffic classes for instance)
> [...]
> > > So far as I can see, this has no effect, because dev_queue_xmit() always
> > > sets queue_mapping (in dev_pick_tx()).
> > > 
> > it resets the queue mapping exactly as you would expect it to.  bonding is a
> > multiqueue enabled device and selects a potentially non-zero queue based on the
> > output of bond_select_queue.
> > 
> > > What is the problem you're seeing?
> > > 
> > The problem is exctly that.  dev_pick_tx() on the bond device sets the
> > queue_mapping as per the result of bond_select_queue (the ndo_select_queue
> > method for the bonding driver).  The implementation there is based on the use of
> > tc with bonding, so that output slaves can be selected for certain types of
> > traffic.  But when that mechanism is used, skb->queue_mapping is preserved when
> > the bonding driver queues the frame to the underlying slave.  This denies the
> > slave (if its also a multiqueue device) the opportunity to reselect the queue
> > properly, because of this call path:
> > 
> > bond_queue_xmit
> >  dev_queue_xmit(slave_dev)
> >   dev_pick_tx()
> >    skb_tx_hash()
> >     __skb_tx_hash()
> > 
> > __skb_tx_hash sees that skb_queue_recorded returns true, and assigns a hardware queue mapping
> > based on what the bonding driver chose using its own internal logic.  Since
> > bonding uses the multiqueue infrastructure to do slave output selection without
> > any regard for slave output queue selection, it seems to me we should really
> > reset the queue mapping to zero so the slave device can pick its own tx queue.
> 
> So you're effectively clearing the *RX queue* number (as this is before
> dev_pick_tx()) in order to influence TX queue selection.
> 
No, you're not seeing it properly.  In bonding (as with all stacked devices) we
make two passes through dev_pick_tx.

1) The first time we call dev_pick_tx is when the network stack calls
dev_queue_xmit on the bond device.  here we call ndo_select_queue, which calls
bond_select_queue.  This method tells the stack which queue to enqueue the skb
on for the bond device.  We can use tc's skbedit action to select a particular
queue on the bond device for various classes of traffic, and those queues
correspond to individual slave devices.

2) the second time we call dev_pick_tx is when the bonding bond_queue_xmit
routine (which is the bonding drivers ndo_start_xmit method, called after the
frist dev_pick_tx), calls dev_queue_xmit, passing the slave net_device pointer).
In this case we do whatever the slave device has configured (either reset the
queue to zero, or call the slaves ndo_select queue method).

What I'm fixing is the fact that the bonding drivers queue_mappnig value is
'leaking' down to the slave.

Lets say, for example we're bonding two multiqueue devices, and have the bonding
driver configured to send all traffic to 192.168.1.1 over the first slave (which
we can accomplish using an appropriate tc rule on the bond device, setting
frames with that dest ip to have a skb->queue_mapping of 1).

In the above example, frames bound for 192.168.1.1 have skb->queue_mapping set
to 1 when they enter bond_queue_xmit.  bond_queue_xmit, calls
dev_queue_xmit(slave, skb), which calls dev_pick_tx(slave, skb).  If we (for
simplicity's sake) assume the slave has no ndo_select_queue method, dev_pick_tx
calls __skb_tx_hash to determine the output queue.  But the bonding driver
already set skb->queue_mapping to 1 (because it wanted this frame output on the
first slave, not because it wanted to transmit the frame on the slaves tx queue
1).  Nevertheless, __skb_tx_hash() sees that skb_rx_queue_recorded returns true
here and as a result, calls skb_rx_get_queue, which returns 0.  Because of that
we wind up transmitting on hardware queue 0 all the time, which is not at all
what we want.  What we want is for the bonding driver to be able to use
queue_mapping for its own purposes, and then allow the physical device to make
its own output queue selection independently.  To do this, queue_mapping needs
to be zero, prior to queuenig the skb to the slave, which is exactly what this
patch does.

> Here, the bonding device seems to be behaving as a forwarding device.
> If TX queue selection can go wrong for certain combinations of queue
> configuration when forwarding, then this is a problem for IP forwarding
> and bridging as well, isn't it?
> 
Potentially, yes.  I only fixed this because I was looking at bonding and its
queue_mapping behavior, and saw that this needed fixing.  Bridging and IP
forwarding should also likely clear the queue mapping in the forwarding path
somewhere to avoid selecting an output tx queue that is a function of whatever
queue and device it arrived on during ingress.  I've not yet looked to see if
thats already being done.

Neil

> Ben.
> 
> -- 
> Ben Hutchings, Senior Software Engineer, Solarflare
> Not speaking for my employer; that's the marketing department's job.
> They asked us to note that Solarflare product names are trademarked.
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox