Netdev List
 help / color / mirror / Atom feed
* [PATCH net-next v5 5/9] xen-netback: Add stat counters for zerocopy
From: Zoltan Kiss @ 2014-01-20 21:24 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss
In-Reply-To: <1390253069-25507-1-git-send-email-zoltan.kiss@citrix.com>

These counters help determine how often the buffers had to be copied. Also
they help find out if packets are leaked, as if "sent != success + fail",
there are probably packets never freed up properly.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
---
 drivers/net/xen-netback/common.h    |    3 +++
 drivers/net/xen-netback/interface.c |   15 +++++++++++++++
 drivers/net/xen-netback/netback.c   |    9 ++++++++-
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 419e63c..e3c28ff 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -155,6 +155,9 @@ struct xenvif {
 
 	/* Statistics */
 	unsigned long rx_gso_checksum_fixup;
+	unsigned long tx_zerocopy_sent;
+	unsigned long tx_zerocopy_success;
+	unsigned long tx_zerocopy_fail;
 
 	/* Miscellaneous private stuff. */
 	struct net_device *dev;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index af5216f..75fe683 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -239,6 +239,21 @@ static const struct xenvif_stat {
 		"rx_gso_checksum_fixup",
 		offsetof(struct xenvif, rx_gso_checksum_fixup)
 	},
+	/* If (sent != success + fail), there are probably packets never
+	 * freed up properly!
+	 */
+	{
+		"tx_zerocopy_sent",
+		offsetof(struct xenvif, tx_zerocopy_sent),
+	},
+	{
+		"tx_zerocopy_success",
+		offsetof(struct xenvif, tx_zerocopy_success),
+	},
+	{
+		"tx_zerocopy_fail",
+		offsetof(struct xenvif, tx_zerocopy_fail)
+	},
 };
 
 static int xenvif_get_sset_count(struct net_device *dev, int string_set)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index a1b03e4..e2dd565 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1611,8 +1611,10 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
 		 * skb_copy_ubufs while we are still in control of the skb. E.g.
 		 * the __pskb_pull_tail earlier can do such thing.
 		 */
-		if (skb_shinfo(skb)->destructor_arg)
+		if (skb_shinfo(skb)->destructor_arg) {
 			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+			vif->tx_zerocopy_sent++;
+		}
 
 		netif_receive_skb(skb);
 	}
@@ -1645,6 +1647,11 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
 		napi_schedule(&vif->napi);
 	} while (ubuf);
 	spin_unlock_irqrestore(&vif->dealloc_lock, flags);
+
+	if (likely(zerocopy_success))
+		vif->tx_zerocopy_success++;
+	else
+		vif->tx_zerocopy_fail++;
 }
 
 static inline void xenvif_tx_action_dealloc(struct xenvif *vif)

^ permalink raw reply related

* [PATCH net-next v5 6/9] xen-netback: Handle guests with too many frags
From: Zoltan Kiss @ 2014-01-20 21:24 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss
In-Reply-To: <1390253069-25507-1-git-send-email-zoltan.kiss@citrix.com>

Xen network protocol had implicit dependency on MAX_SKB_FRAGS. Netback has to
handle guests sending up to XEN_NETBK_LEGACY_SLOTS_MAX slots. To achieve that:
- create a new skb
- map the leftover slots to its frags (no linear buffer here!)
- chain it to the previous through skb_shinfo(skb)->frag_list
- map them
- copy the whole stuff into a brand new skb and send it to the stack
- unmap the 2 old skb's pages

v3:
- adding extra check for frag number
- consolidate alloc_skb's into xenvif_alloc_skb()
- BUG_ON(frag_overflow > MAX_SKB_FRAGS)

v4:
- handle error of skb_copy_expand()

v5:
- ratelimit error messages
- remove a tx_flags setting from xenvif_tx_submit 

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>

---
 drivers/net/xen-netback/netback.c |  124 ++++++++++++++++++++++++++++++++++---
 1 file changed, 114 insertions(+), 10 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 22d05de..031258c 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -803,6 +803,20 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif,
 	       sizeof(*txp));
 }
 
+static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
+{
+	struct sk_buff *skb =
+		alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN,
+			  GFP_ATOMIC | __GFP_NOWARN);
+	if (unlikely(skb == NULL))
+		return NULL;
+
+	/* Packets passed to netif_rx() must have some headroom. */
+	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
+	return skb;
+}
+
 static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 							struct sk_buff *skb,
 							struct xen_netif_tx_request *txp,
@@ -813,11 +827,16 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 	u16 pending_idx = *((u16 *)skb->data);
 	int start;
 	pending_ring_idx_t index;
-	unsigned int nr_slots;
+	unsigned int nr_slots, frag_overflow = 0;
 
 	/* At this point shinfo->nr_frags is in fact the number of
 	 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
 	 */
+	if (shinfo->nr_frags > MAX_SKB_FRAGS) {
+		frag_overflow = shinfo->nr_frags - MAX_SKB_FRAGS;
+		BUG_ON(frag_overflow > MAX_SKB_FRAGS);
+		shinfo->nr_frags = MAX_SKB_FRAGS;
+	}
 	nr_slots = shinfo->nr_frags;
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
@@ -833,6 +852,30 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 
 	BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
 
+	if (frag_overflow) {
+		struct sk_buff *nskb = xenvif_alloc_skb(0);
+		if (unlikely(nskb == NULL)) {
+			if (net_ratelimit())
+				netdev_err(vif->dev,
+					   "Can't allocate the frag_list skb.\n");
+			return NULL;
+		}
+
+		shinfo = skb_shinfo(nskb);
+		frags = shinfo->frags;
+
+		for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
+		     shinfo->nr_frags++, txp++, gop++) {
+			index = pending_index(vif->pending_cons++);
+			pending_idx = vif->pending_ring[index];
+			xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+			frag_set_pending_idx(&frags[shinfo->nr_frags],
+					     pending_idx);
+		}
+
+		skb_shinfo(skb)->frag_list = nskb;
+	}
+
 	return gop;
 }
 
@@ -846,6 +889,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 	struct pending_tx_info *tx_info;
 	int nr_frags = shinfo->nr_frags;
 	int i, err, start;
+	struct sk_buff *first_skb = NULL;
 
 	/* Check status of header. */
 	err = gop->status;
@@ -866,6 +910,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
+check_frags:
 	for (i = start; i < nr_frags; i++) {
 		int j, newerr;
 
@@ -900,11 +945,20 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 		/* Not the first error? Preceding frags already invalidated. */
 		if (err)
 			continue;
-
 		/* First error: invalidate header and preceding fragments. */
-		pending_idx = *((u16 *)skb->data);
-		xenvif_idx_unmap(vif, pending_idx);
-		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
+		if (!first_skb) {
+			pending_idx = *((u16 *)skb->data);
+			xenvif_idx_unmap(vif, pending_idx);
+			xenvif_idx_release(vif,
+					   pending_idx,
+					   XEN_NETIF_RSP_OKAY);
+		} else {
+			pending_idx = *((u16 *)first_skb->data);
+			xenvif_idx_unmap(vif, pending_idx);
+			xenvif_idx_release(vif,
+					   pending_idx,
+					   XEN_NETIF_RSP_OKAY);
+		}
 		for (j = start; j < i; j++) {
 			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
 			xenvif_idx_unmap(vif, pending_idx);
@@ -916,6 +970,32 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 		err = newerr;
 	}
 
+	if (shinfo->frag_list) {
+		first_skb = skb;
+		skb = shinfo->frag_list;
+		shinfo = skb_shinfo(skb);
+		nr_frags = shinfo->nr_frags;
+		start = 0;
+
+		goto check_frags;
+	}
+
+	/* There was a mapping error in the frag_list skb. We have to unmap
+	 * the first skb's frags
+	 */
+	if (first_skb && err) {
+		int j;
+		shinfo = skb_shinfo(first_skb);
+		pending_idx = *((u16 *)first_skb->data);
+		start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
+		for (j = start; j < shinfo->nr_frags; j++) {
+			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+			xenvif_idx_unmap(vif, pending_idx);
+			xenvif_idx_release(vif, pending_idx,
+					   XEN_NETIF_RSP_OKAY);
+		}
+	}
+
 	*gopp = gop + 1;
 	return err;
 }
@@ -1419,8 +1499,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 			    ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
 			PKT_PROT_LEN : txreq.size;
 
-		skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
-				GFP_ATOMIC | __GFP_NOWARN);
+		skb = xenvif_alloc_skb(data_len);
 		if (unlikely(skb == NULL)) {
 			netdev_dbg(vif->dev,
 				   "Can't allocate a skb in start_xmit.\n");
@@ -1428,9 +1507,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 			break;
 		}
 
-		/* Packets passed to netif_rx() must have some headroom. */
-		skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
-
 		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
 			struct xen_netif_extra_info *gso;
 			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
@@ -1492,6 +1568,7 @@ static int xenvif_tx_submit(struct xenvif *vif)
 		struct xen_netif_tx_request *txp;
 		u16 pending_idx;
 		unsigned data_len;
+		struct sk_buff *nskb = NULL;
 
 		pending_idx = *((u16 *)skb->data);
 		txp = &vif->pending_tx_info[pending_idx].req;
@@ -1534,6 +1611,30 @@ static int xenvif_tx_submit(struct xenvif *vif)
 				  pending_idx :
 				  INVALID_PENDING_IDX);
 
+		if (skb_shinfo(skb)->frag_list) {
+			nskb = skb_shinfo(skb)->frag_list;
+			xenvif_fill_frags(vif, nskb, INVALID_PENDING_IDX);
+			skb->len += nskb->len;
+			skb->data_len += nskb->len;
+			skb->truesize += nskb->truesize;
+			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+			skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+			vif->tx_zerocopy_sent += 2;
+			nskb = skb;
+
+			skb = skb_copy_expand(skb,
+					      0,
+					      0,
+					      GFP_ATOMIC | __GFP_NOWARN);
+			if (!skb) {
+				if (net_ratelimit())
+					netdev_dbg(vif->dev,
+						   "Can't consolidate skb with too many fragments\n");
+				kfree_skb(nskb);
+				continue;
+			}
+			skb_shinfo(skb)->destructor_arg = NULL;
+		}
 		if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
 			int target = min_t(int, skb->len, PKT_PROT_LEN);
 			__pskb_pull_tail(skb, target - skb_headlen(skb));
@@ -1587,6 +1688,9 @@ static int xenvif_tx_submit(struct xenvif *vif)
 		}
 
 		netif_receive_skb(skb);
+
+		if (nskb)
+			kfree_skb(nskb);
 	}
 
 	return work_done;

^ permalink raw reply related

* [PATCH net-next v5 7/9] xen-netback: Add stat counters for frag_list skbs
From: Zoltan Kiss @ 2014-01-20 21:24 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss
In-Reply-To: <1390253069-25507-1-git-send-email-zoltan.kiss@citrix.com>

These counters help determine how often the guest sends a packet with more
than MAX_SKB_FRAGS frags.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
---
 drivers/net/xen-netback/common.h    |    1 +
 drivers/net/xen-netback/interface.c |    7 +++++++
 drivers/net/xen-netback/netback.c   |    1 +
 3 files changed, 9 insertions(+)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index e3c28ff..c037efb 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -158,6 +158,7 @@ struct xenvif {
 	unsigned long tx_zerocopy_sent;
 	unsigned long tx_zerocopy_success;
 	unsigned long tx_zerocopy_fail;
+	unsigned long tx_frag_overflow;
 
 	/* Miscellaneous private stuff. */
 	struct net_device *dev;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index ac27af3..b7daf8d 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -254,6 +254,13 @@ static const struct xenvif_stat {
 		"tx_zerocopy_fail",
 		offsetof(struct xenvif, tx_zerocopy_fail)
 	},
+	/* Number of packets exceeding MAX_SKB_FRAG slots. You should use
+	 * a guest with the same MAX_SKB_FRAG
+	 */
+	{
+		"tx_frag_overflow",
+		offsetof(struct xenvif, tx_frag_overflow)
+	},
 };
 
 static int xenvif_get_sset_count(struct net_device *dev, int string_set)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 9841429..4305965 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1656,6 +1656,7 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
 			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
 			skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
 			vif->tx_zerocopy_sent += 2;
+			vif->tx_frag_overflow++;
 			nskb = skb;
 
 			skb = skb_copy_expand(skb, 0, 0, GFP_ATOMIC | __GFP_NOWARN);

^ permalink raw reply related

* [PATCH net-next v5 8/9] xen-netback: Timeout packets in RX path
From: Zoltan Kiss @ 2014-01-20 21:24 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss
In-Reply-To: <1390253069-25507-1-git-send-email-zoltan.kiss@citrix.com>

A malicious or buggy guest can leave its queue filled indefinitely, in which
case qdisc start to queue packets for that VIF. If those packets came from an
another guest, it can block its slots and prevent shutdown. To avoid that, we
make sure the queue is drained in every 10 seconds.
The QDisc queue in worst case takes 3 round to flush usually.

v3:
- remove stale debug log
- tie unmap timeout in xenvif_free to this timeout

v4:
- due to RX flow control changes now start_xmit doesn't drop the packets but
  place them on the internal queue. So the timer set rx_queue_purge and kick in
  the thread to drop the packets there
- we shoot down the timer if a previously filled internal queue drains
- adjust the teardown timeout as in worst case it can take more time now

v5:
- create separate variable worst_case_skb_lifetime and add an explanation about
  why is it so long

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
---
 drivers/net/xen-netback/common.h    |    6 ++++++
 drivers/net/xen-netback/interface.c |   37 +++++++++++++++++++++++++++++++++--
 drivers/net/xen-netback/netback.c   |   23 +++++++++++++++++++---
 3 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 109c29f..d1cd8ce 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -129,6 +129,9 @@ struct xenvif {
 	struct xen_netif_rx_back_ring rx;
 	struct sk_buff_head rx_queue;
 	RING_IDX rx_last_skb_slots;
+	bool rx_queue_purge;
+
+	struct timer_list wake_queue;
 
 	/* This array is allocated seperately as it is large */
 	struct gnttab_copy *grant_copy_op;
@@ -225,4 +228,7 @@ void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
 
 extern bool separate_tx_rx_irq;
 
+extern unsigned int rx_drain_timeout_msecs;
+extern unsigned int rx_drain_timeout_jiffies;
+
 #endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index af6b3e1..40aa500 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -114,6 +114,18 @@ static irqreturn_t xenvif_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static void xenvif_wake_queue(unsigned long data)
+{
+	struct xenvif *vif = (struct xenvif *)data;
+
+	if (netif_queue_stopped(vif->dev)) {
+		netdev_err(vif->dev, "draining TX queue\n");
+		vif->rx_queue_purge = true;
+		xenvif_kick_thread(vif);
+		netif_wake_queue(vif->dev);
+	}
+}
+
 static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct xenvif *vif = netdev_priv(dev);
@@ -143,8 +155,13 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	 * then turn off the queue to give the ring a chance to
 	 * drain.
 	 */
-	if (!xenvif_rx_ring_slots_available(vif, min_slots_needed))
+	if (!xenvif_rx_ring_slots_available(vif, min_slots_needed)) {
+		vif->wake_queue.function = xenvif_wake_queue;
+		vif->wake_queue.data = (unsigned long)vif;
 		xenvif_stop_queue(vif);
+		mod_timer(&vif->wake_queue,
+			jiffies + rx_drain_timeout_jiffies);
+	}
 
 	skb_queue_tail(&vif->rx_queue, skb);
 	xenvif_kick_thread(vif);
@@ -352,6 +369,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 	init_timer(&vif->credit_timeout);
 	vif->credit_window_start = get_jiffies_64();
 
+	init_timer(&vif->wake_queue);
+
 	dev->netdev_ops	= &xenvif_netdev_ops;
 	dev->hw_features = NETIF_F_SG |
 		NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
@@ -532,6 +551,7 @@ void xenvif_disconnect(struct xenvif *vif)
 		xenvif_carrier_off(vif);
 
 	if (vif->task) {
+		del_timer_sync(&vif->wake_queue);
 		kthread_stop(vif->task);
 		vif->task = NULL;
 	}
@@ -557,12 +577,25 @@ void xenvif_disconnect(struct xenvif *vif)
 void xenvif_free(struct xenvif *vif)
 {
 	int i, unmap_timeout = 0;
+	/* Here we want to avoid timeout messages if an skb can be legitimatly
+	 * stucked somewhere else. Realisticly this could be an another vif's
+	 * internal or QDisc queue. That another vif also has this
+	 * rx_drain_timeout_msecs timeout, but the timer only ditches the
+	 * internal queue. After that, the QDisc queue can put in worst case
+	 * XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS skbs into that another vif's
+	 * internal queue, so we need several rounds of such timeouts until we
+	 * can be sure that no another vif should have skb's from us. We are
+	 * not sending more skb's, so newly stucked packets are not interesting
+	 * for us here.
+	 */
+	unsigned int worst_case_skb_lifetime = (rx_drain_timeout_msecs/1000) *
+		DIV_ROUND_UP(XENVIF_QUEUE_LENGTH, (XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS));
 
 	for (i = 0; i < MAX_PENDING_REQS; ++i) {
 		if (vif->grant_tx_handle[i] != NETBACK_INVALID_HANDLE) {
 			unmap_timeout++;
 			schedule_timeout(msecs_to_jiffies(1000));
-			if (unmap_timeout > 9 &&
+			if (unmap_timeout > worst_case_skb_lifetime &&
 			    net_ratelimit())
 				netdev_err(vif->dev,
 					   "Page still granted! Index: %x\n",
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 560950e..bb65c7c 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -63,6 +63,13 @@ module_param(separate_tx_rx_irq, bool, 0644);
 static unsigned int fatal_skb_slots = FATAL_SKB_SLOTS_DEFAULT;
 module_param(fatal_skb_slots, uint, 0444);
 
+/* When guest ring is filled up, qdisc queues the packets for us, but we have
+ * to timeout them, otherwise other guests' packets can get stucked there
+ */
+unsigned int rx_drain_timeout_msecs = 10000;
+module_param(rx_drain_timeout_msecs, uint, 0444);
+unsigned int rx_drain_timeout_jiffies;
+
 /*
  * To avoid confusion, we define XEN_NETBK_LEGACY_SLOTS_MAX indicating
  * the maximum slots a valid packet can use. Now this value is defined
@@ -1909,8 +1916,9 @@ static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif,
 
 static inline int rx_work_todo(struct xenvif *vif)
 {
-	return !skb_queue_empty(&vif->rx_queue) &&
-	       xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots);
+	return (!skb_queue_empty(&vif->rx_queue) &&
+	       xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots)) ||
+	       vif->rx_queue_purge;
 }
 
 static inline int tx_work_todo(struct xenvif *vif)
@@ -1998,12 +2006,19 @@ int xenvif_kthread(void *data)
 		if (kthread_should_stop())
 			break;
 
+		if (vif->rx_queue_purge) {
+			skb_queue_purge(&vif->rx_queue);
+			vif->rx_queue_purge = false;
+		}
+
 		if (!skb_queue_empty(&vif->rx_queue))
 			xenvif_rx_action(vif);
 
 		if (skb_queue_empty(&vif->rx_queue) &&
-		    netif_queue_stopped(vif->dev))
+		    netif_queue_stopped(vif->dev)) {
+			del_timer_sync(&vif->wake_queue);
 			xenvif_start_queue(vif);
+		}
 
 		cond_resched();
 	}
@@ -2054,6 +2069,8 @@ static int __init netback_init(void)
 	if (rc)
 		goto failed_init;
 
+	rx_drain_timeout_jiffies = msecs_to_jiffies(rx_drain_timeout_msecs);
+
 	return 0;
 
 failed_init:

^ permalink raw reply related

* [PATCH net-next v5 9/9] xen-netback: Aggregate TX unmap operations
From: Zoltan Kiss @ 2014-01-20 21:24 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss
In-Reply-To: <1390253069-25507-1-git-send-email-zoltan.kiss@citrix.com>

Unmapping causes TLB flushing, therefore we should make it in the largest
possible batches. However we shouldn't starve the guest for too long. So if
the guest has space for at least two big packets and we don't have at least a
quarter ring to unmap, delay it for at most 1 milisec.

v4:
- use bool for tx_dealloc_work_todo

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
---
 drivers/net/xen-netback/common.h    |    2 ++
 drivers/net/xen-netback/interface.c |    2 ++
 drivers/net/xen-netback/netback.c   |   34 +++++++++++++++++++++++++++++++++-
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index d1cd8ce..95498c8 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -118,6 +118,8 @@ struct xenvif {
 	u16 dealloc_ring[MAX_PENDING_REQS];
 	struct task_struct *dealloc_task;
 	wait_queue_head_t dealloc_wq;
+	struct timer_list dealloc_delay;
+	bool dealloc_delay_timed_out;
 
 	/* Use kthread for guest RX */
 	struct task_struct *task;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index 40aa500..f925af5 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -407,6 +407,7 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
 			  .desc = i };
 		vif->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
 	}
+	init_timer(&vif->dealloc_delay);
 
 	/*
 	 * Initialise a dummy MAC address. We choose the numerically
@@ -557,6 +558,7 @@ void xenvif_disconnect(struct xenvif *vif)
 	}
 
 	if (vif->dealloc_task) {
+		del_timer_sync(&vif->dealloc_delay);
 		kthread_stop(vif->dealloc_task);
 		vif->dealloc_task = NULL;
 	}
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index bb65c7c..c098276 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -135,6 +135,11 @@ static inline pending_ring_idx_t nr_pending_reqs(struct xenvif *vif)
 		vif->pending_prod + vif->pending_cons;
 }
 
+static inline pending_ring_idx_t nr_free_slots(struct xen_netif_tx_back_ring *ring)
+{
+	return ring->nr_ents -	(ring->sring->req_prod - ring->rsp_prod_pvt);
+}
+
 bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed)
 {
 	RING_IDX prod, cons;
@@ -1932,9 +1937,36 @@ static inline int tx_work_todo(struct xenvif *vif)
 	return 0;
 }
 
+static void xenvif_dealloc_delay(unsigned long data)
+{
+	struct xenvif *vif = (struct xenvif *)data;
+
+	vif->dealloc_delay_timed_out = true;
+	wake_up(&vif->dealloc_wq);
+}
+
 static inline bool tx_dealloc_work_todo(struct xenvif *vif)
 {
-	return vif->dealloc_cons != vif->dealloc_prod
+	if (vif->dealloc_cons != vif->dealloc_prod) {
+		if ((nr_free_slots(&vif->tx) > 2 * XEN_NETBK_LEGACY_SLOTS_MAX) &&
+		    (vif->dealloc_prod - vif->dealloc_cons < MAX_PENDING_REQS / 4) &&
+		    !vif->dealloc_delay_timed_out) {
+			if (!timer_pending(&vif->dealloc_delay)) {
+				vif->dealloc_delay.function =
+					xenvif_dealloc_delay;
+				vif->dealloc_delay.data = (unsigned long)vif;
+				mod_timer(&vif->dealloc_delay,
+					  jiffies + msecs_to_jiffies(1));
+
+			}
+			return false;
+		}
+		del_timer_sync(&vif->dealloc_delay);
+		vif->dealloc_delay_timed_out = false;
+		return true;
+	}
+
+	return false;
 }
 
 void xenvif_unmap_frontend_rings(struct xenvif *vif)

^ permalink raw reply related

* [PATCH net-next v5 0/9] xen-netback: TX grant mapping with SKBTX_DEV_ZEROCOPY instead of copy
From: Zoltan Kiss @ 2014-01-20 21:24 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss

A long known problem of the upstream netback implementation that on the TX
path (from guest to Dom0) it copies the whole packet from guest memory into
Dom0. That simply became a bottleneck with 10Gb NICs, and generally it's a
huge perfomance penalty. The classic kernel version of netback used grant
mapping, and to get notified when the page can be unmapped, it used page
destructors. Unfortunately that destructor is not an upstreamable solution.
Ian Campbell's skb fragment destructor patch series [1] tried to solve this
problem, however it seems to be very invasive on the network stack's code,
and therefore haven't progressed very well.
This patch series use SKBTX_DEV_ZEROCOPY flags to tell the stack it needs to
know when the skb is freed up. That is the way KVM solved the same problem,
and based on my initial tests it can do the same for us. Avoiding the extra
copy boosted up TX throughput from 6.8 Gbps to 7.9 (I used a slower
Interlagos box, both Dom0 and guest on upstream kernel, on the same NUMA node,
running iperf 2.0.5, and the remote end was a bare metal box on the same 10Gb
switch)
Based on my investigations the packet get only copied if it is delivered to
Dom0 stack, which is due to this [2] patch. That's a bit unfortunate, but
luckily it doesn't cause a major regression for this usecase. In the future
we should try to eliminate that copy somehow.
There are a few spinoff tasks which will be addressed in separate patches:
- grant copy the header directly instead of map and memcpy. This should help
  us avoiding TLB flushing
- use something else than ballooned pages
- fix grant map to use page->index properly
I will run some more extensive tests, but some basic XenRT tests were already
passed with good results.
I've tried to broke it down to smaller patches, with mixed results, so I
welcome suggestions on that part as well:
1: Introduce TX grant map definitions
2: Change TX path from grant copy to mapping
3: Remove old TX grant copy definitons and fix indentations
4: Change RX path for mapped SKB fragments
5: Add stat counters for zerocopy
6: Handle guests with too many frags
7: Add stat counters for frag_list skbs
8: Timeout packets in RX path
9: Aggregate TX unmap operations

v2: I've fixed some smaller things, see the individual patches. I've added a
few new stat counters, and handling the important use case when an older guest
sends lots of slots. Instead of delayed copy now we timeout packets on the RX
path, based on the assumption that otherwise packets should get stucked
anywhere else. Finally some unmap batching to avoid too much TLB flush

v3: Apart from fixing a few things mentioned in responses the important change
is the use the hypercall directly for grant [un]mapping, therefore we can
avoid m2p override.

v4: Now we are using a new grant mapping API to avoid m2p_override. The RX queue
timeout logic changed also.

v5: Only minor fixes based on Wei's comments

[1] http://lwn.net/Articles/491522/
[2] https://lkml.org/lkml/2012/7/20/363

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>

^ permalink raw reply

* Re: [PATCH net-next] net: vxlan: do not use vxlan_net before checking event type
From: Eric W. Biederman @ 2014-01-20 21:51 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: davem, netdev, Jesse Brandeburg, Cong Wang
In-Reply-To: <1389959706-30976-1-git-send-email-dborkman@redhat.com>

Daniel Borkmann <dborkman@redhat.com> writes:

> Jesse Brandeburg reported that commit acaf4e70997f caused a panic
> when adding a network namespace while vxlan module was present in
> the system:
>
> [<ffffffff814d0865>] vxlan_lowerdev_event+0xf5/0x100
> [<ffffffff816e9e5d>] notifier_call_chain+0x4d/0x70
> [<ffffffff810912be>] __raw_notifier_call_chain+0xe/0x10
> [<ffffffff810912d6>] raw_notifier_call_chain+0x16/0x20
> [<ffffffff815d9610>] call_netdevice_notifiers_info+0x40/0x70
> [<ffffffff815d9656>] call_netdevice_notifiers+0x16/0x20
> [<ffffffff815e1bce>] register_netdevice+0x1be/0x3a0
> [<ffffffff815e1dce>] register_netdev+0x1e/0x30
> [<ffffffff814cb94a>] loopback_net_init+0x4a/0xb0
> [<ffffffffa016ed6e>] ? lockd_init_net+0x6e/0xb0 [lockd]
> [<ffffffff815d6bac>] ops_init+0x4c/0x150
> [<ffffffff815d6d23>] setup_net+0x73/0x110
> [<ffffffff815d725b>] copy_net_ns+0x7b/0x100
> [<ffffffff81090e11>] create_new_namespaces+0x101/0x1b0
> [<ffffffff81090f45>] copy_namespaces+0x85/0xb0
> [<ffffffff810693d5>] copy_process.part.26+0x935/0x1500
> [<ffffffff811d5186>] ? mntput+0x26/0x40
> [<ffffffff8106a15c>] do_fork+0xbc/0x2e0
> [<ffffffff811b7f2e>] ? ____fput+0xe/0x10
> [<ffffffff81089c5c>] ? task_work_run+0xac/0xe0
> [<ffffffff8106a406>] SyS_clone+0x16/0x20
> [<ffffffff816ee689>] stub_clone+0x69/0x90
> [<ffffffff816ee329>] ? system_call_fastpath+0x16/0x1b
>
> Apparently loopback device is being registered first and thus we
> receive an event notification when vxlan_net is not ready. Hence,
> when we call net_generic() and request vxlan_net_id, we seem to
> access garbage at that point in time. In setup_net() where we set
> up a newly allocated network namespace, we traverse the list of
> pernet ops ...
>
> list_for_each_entry(ops, &pernet_list, list) {
> 	error = ops_init(ops, net);
> 	if (error < 0)
> 		goto out_undo;
> }
>
> ... and loopback_net_init() is invoked first here, so in the middle
> of setup_net() we get this notification in vxlan. As currently we
> only care about devices that unregister, move access through
> net_generic() there. Fix is based on Cong Wang's proposal, but
> only changes what is needed here. It sucks a bit as we only work
> around the actual cure: right now it seems the only way to check if
> a netns actually finished traversing all init ops would be to check
> if it's part of net_namespace_list. But that I find quite expensive
> each time we go through a notifier callback. Anyway, did a couple
> of tests and it seems good for now.

I am not going to argue against this patch as an immediate bug fix but
something smells here, that bears deeper investigation.  It looks like
the symptom is being patched rather than the actual problem.

In particular net_generic(dev_net(dev), vxlan_net_id) is valid at the
point that it is being called.  As the pointers are allocated in
copy_net_ns in net_alloc prior to setup_net being called.

On the flip side it is the responsibility of code that uses both
register_netdev_notifier and register_pernet_xxx to be ready to handle
events from any namespace as soon as they happen.  vxlan should be using
register_pernet_subsys instead of register_pernet_device to ensure the
vxlan_net structure is initialized before and cleaned up after all
network devices in a given network namespace.  The vlan devices with a
similar problem already do this.

So in summary.  Something smells and I don't believe this patch fixes
the underlying issue.  Please take a deeper look into what vxlan is doing.

Eric


> Fixes: acaf4e70997f ("net: vxlan: when lower dev unregisters remove vxlan dev as well")
> Reported-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
> Cc: "Eric W. Biederman" <ebiederm@xmission.com>
> Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
> Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
> ---
>  drivers/net/vxlan.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
> index a2dee80..d6ec71f 100644
> --- a/drivers/net/vxlan.c
> +++ b/drivers/net/vxlan.c
> @@ -2681,10 +2681,12 @@ static int vxlan_lowerdev_event(struct notifier_block *unused,
>  				unsigned long event, void *ptr)
>  {
>  	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> -	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
> +	struct vxlan_net *vn;
>  
> -	if (event == NETDEV_UNREGISTER)
> +	if (event == NETDEV_UNREGISTER) {
> +		vn = net_generic(dev_net(dev), vxlan_net_id);
>  		vxlan_handle_lowerdev_unregister(vn, dev);
> +	}
>  
>  	return NOTIFY_DONE;
>  }

^ permalink raw reply

* [PATCH net v2] tuntap: Fix for a race in accessing numqueues
From: Dominic Curran @ 2014-01-20 21:59 UTC (permalink / raw)
  To: netdev; +Cc: Dominic Curran, Jason Wang, Maxim Krasnyansky

A patch for fixing a race between queue selection and changing queues
was introduced in commit 92bb73ea2("tuntap: fix a possible race between
queue selection and changing queues").

The fix was to prevent the driver from re-reading the tun->numqueues
more than once within tun_select_queue() using ACCESS_ONCE().

We have been experiancing 'Divide-by-zero' errors in tun_net_xmit() 
since we moved from 3.6 to 3.10, and believe that they come from a 
simular source where the value of tun->numqueues changes to zero 
between the first and a subsequent read of tun->numqueues.

The fix is a simular use of ACCESS_ONCE(), as well as a multiply
instead of a divide in the if statement.

Signed-off-by: Dominic Curran <dominic.curran@citrix.com>
Cc: Jason Wang <jasowang@redhat.com>
Cc: Maxim Krasnyansky <maxk@qti.qualcomm.com>
---
V2: Use multiply instead of divide. Suggested by Eric Dumazet.
    Fixed email address for maxk. Rebase against net tree.
---
 drivers/net/tun.c |    8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

Index: net/drivers/net/tun.c
===================================================================
--- net.orig/drivers/net/tun.c	2014-01-20 20:22:06.000000000 +0000
+++ net/drivers/net/tun.c	2014-01-20 20:54:54.000000000 +0000
@@ -721,12 +721,14 @@ static netdev_tx_t tun_net_xmit(struct s
 	struct tun_struct *tun = netdev_priv(dev);
 	int txq = skb->queue_mapping;
 	struct tun_file *tfile;
+	u32 numqueues = 0;
 
 	rcu_read_lock();
 	tfile = rcu_dereference(tun->tfiles[txq]);
+	numqueues = ACCESS_ONCE(tun->numqueues);
 
 	/* Drop packet if interface is not attached */
-	if (txq >= tun->numqueues)
+	if (txq >= numqueues)
 		goto drop;
 
 	tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
@@ -746,8 +748,8 @@ static netdev_tx_t tun_net_xmit(struct s
 	/* Limit the number of packets queued by dividing txq length with the
 	 * number of queues.
 	 */
-	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue)
-			  >= dev->tx_queue_len / tun->numqueues)
+	if (skb_queue_len(&tfile->socket.sk->sk_receive_queue) * numqueues
+			  >= dev->tx_queue_len)
 		goto drop;
 
 	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))

^ permalink raw reply

* Re: [PATCH net-next] net: vxlan: do not use vxlan_net before checking event type
From: Daniel Borkmann @ 2014-01-20 22:01 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: davem, netdev, Jesse Brandeburg, Cong Wang
In-Reply-To: <87eh428gpx.fsf@xmission.com>

On 01/20/2014 10:51 PM, Eric W. Biederman wrote:
...
> I am not going to argue against this patch as an immediate bug fix but
> something smells here, that bears deeper investigation.  It looks like
> the symptom is being patched rather than the actual problem.
>
> In particular net_generic(dev_net(dev), vxlan_net_id) is valid at the
> point that it is being called.  As the pointers are allocated in
> copy_net_ns in net_alloc prior to setup_net being called.
>
> On the flip side it is the responsibility of code that uses both
> register_netdev_notifier and register_pernet_xxx to be ready to handle
> events from any namespace as soon as they happen.  vxlan should be using
> register_pernet_subsys instead of register_pernet_device to ensure the
> vxlan_net structure is initialized before and cleaned up after all
> network devices in a given network namespace.  The vlan devices with a
> similar problem already do this.
>
> So in summary.  Something smells and I don't believe this patch fixes
> the underlying issue.  Please take a deeper look into what vxlan is doing.

Thanks for the input Eric!

If no-one is faster than me, I'll try to look into it soon.

^ permalink raw reply

* Re: [PATCH net-next v5 8/9] xen-netback: Timeout packets in RX path
From: Wei Liu @ 2014-01-20 22:03 UTC (permalink / raw)
  To: Zoltan Kiss
  Cc: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
In-Reply-To: <1390253069-25507-9-git-send-email-zoltan.kiss@citrix.com>

On Mon, Jan 20, 2014 at 09:24:28PM +0000, Zoltan Kiss wrote:
> A malicious or buggy guest can leave its queue filled indefinitely, in which
> case qdisc start to queue packets for that VIF. If those packets came from an
> another guest, it can block its slots and prevent shutdown. To avoid that, we
> make sure the queue is drained in every 10 seconds.
> The QDisc queue in worst case takes 3 round to flush usually.
> 
> v3:
> - remove stale debug log
> - tie unmap timeout in xenvif_free to this timeout
> 
> v4:
> - due to RX flow control changes now start_xmit doesn't drop the packets but
>   place them on the internal queue. So the timer set rx_queue_purge and kick in
>   the thread to drop the packets there
> - we shoot down the timer if a previously filled internal queue drains
> - adjust the teardown timeout as in worst case it can take more time now
> 
> v5:
> - create separate variable worst_case_skb_lifetime and add an explanation about
>   why is it so long
> 
> Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
> ---
>  drivers/net/xen-netback/common.h    |    6 ++++++
>  drivers/net/xen-netback/interface.c |   37 +++++++++++++++++++++++++++++++++--
>  drivers/net/xen-netback/netback.c   |   23 +++++++++++++++++++---
>  3 files changed, 61 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
> index 109c29f..d1cd8ce 100644
> --- a/drivers/net/xen-netback/common.h
> +++ b/drivers/net/xen-netback/common.h
> @@ -129,6 +129,9 @@ struct xenvif {
>  	struct xen_netif_rx_back_ring rx;
>  	struct sk_buff_head rx_queue;
>  	RING_IDX rx_last_skb_slots;
> +	bool rx_queue_purge;
> +
> +	struct timer_list wake_queue;
>  
>  	/* This array is allocated seperately as it is large */
>  	struct gnttab_copy *grant_copy_op;
> @@ -225,4 +228,7 @@ void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
>  
>  extern bool separate_tx_rx_irq;
>  
> +extern unsigned int rx_drain_timeout_msecs;
> +extern unsigned int rx_drain_timeout_jiffies;
> +
>  #endif /* __XEN_NETBACK__COMMON_H__ */
> diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
> index af6b3e1..40aa500 100644
> --- a/drivers/net/xen-netback/interface.c
> +++ b/drivers/net/xen-netback/interface.c
> @@ -114,6 +114,18 @@ static irqreturn_t xenvif_interrupt(int irq, void *dev_id)
>  	return IRQ_HANDLED;
>  }
>  
> +static void xenvif_wake_queue(unsigned long data)
> +{
> +	struct xenvif *vif = (struct xenvif *)data;
> +
> +	if (netif_queue_stopped(vif->dev)) {
> +		netdev_err(vif->dev, "draining TX queue\n");
> +		vif->rx_queue_purge = true;
> +		xenvif_kick_thread(vif);
> +		netif_wake_queue(vif->dev);
> +	}
> +}
> +
>  static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  {
>  	struct xenvif *vif = netdev_priv(dev);
> @@ -143,8 +155,13 @@ static int xenvif_start_xmit(struct sk_buff *skb, struct net_device *dev)
>  	 * then turn off the queue to give the ring a chance to
>  	 * drain.
>  	 */
> -	if (!xenvif_rx_ring_slots_available(vif, min_slots_needed))
> +	if (!xenvif_rx_ring_slots_available(vif, min_slots_needed)) {
> +		vif->wake_queue.function = xenvif_wake_queue;
> +		vif->wake_queue.data = (unsigned long)vif;
>  		xenvif_stop_queue(vif);
> +		mod_timer(&vif->wake_queue,
> +			jiffies + rx_drain_timeout_jiffies);
> +	}
>  
>  	skb_queue_tail(&vif->rx_queue, skb);
>  	xenvif_kick_thread(vif);
> @@ -352,6 +369,8 @@ struct xenvif *xenvif_alloc(struct device *parent, domid_t domid,
>  	init_timer(&vif->credit_timeout);
>  	vif->credit_window_start = get_jiffies_64();
>  
> +	init_timer(&vif->wake_queue);
> +
>  	dev->netdev_ops	= &xenvif_netdev_ops;
>  	dev->hw_features = NETIF_F_SG |
>  		NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
> @@ -532,6 +551,7 @@ void xenvif_disconnect(struct xenvif *vif)
>  		xenvif_carrier_off(vif);
>  
>  	if (vif->task) {
> +		del_timer_sync(&vif->wake_queue);
>  		kthread_stop(vif->task);
>  		vif->task = NULL;
>  	}
> @@ -557,12 +577,25 @@ void xenvif_disconnect(struct xenvif *vif)
>  void xenvif_free(struct xenvif *vif)
>  {
>  	int i, unmap_timeout = 0;
> +	/* Here we want to avoid timeout messages if an skb can be legitimatly
> +	 * stucked somewhere else. Realisticly this could be an another vif's
> +	 * internal or QDisc queue. That another vif also has this
> +	 * rx_drain_timeout_msecs timeout, but the timer only ditches the
> +	 * internal queue. After that, the QDisc queue can put in worst case
> +	 * XEN_NETIF_RX_RING_SIZE / MAX_SKB_FRAGS skbs into that another vif's
> +	 * internal queue, so we need several rounds of such timeouts until we
> +	 * can be sure that no another vif should have skb's from us. We are
> +	 * not sending more skb's, so newly stucked packets are not interesting
> +	 * for us here.
> +	 */

You beat me to this. Was about to reply to your other email. :-)

It's also worth mentioning that DIV_ROUND_UP part is merely estimation,
as you cannot possible know the maximum / miminum queue length of all
other vifs (as they can be changed during runtime). In practice most
users will stick with the default, but some advanced users might want to
tune this value for individual vif (whether that's a good idea or not is
another topic).

So, in order to convince myself this is safe. I also did some analysis
on the impact of having queue length other than default value.  If
queue_len < XENVIF_QUEUE_LENGTH, that means you can queue less packets
in qdisc than default and drain it faster than calculated, which is
safe. On the other hand if queue_len > XENVIF_QUEUE_LENGTH, it means
actually you need more time than calculated. I'm in two minded here. The
default value seems sensible to me but I'm still a bit worried about the
queue_len > XENVIF_QUEUE_LENGTH case.

An idea is to book-keep maximum tx queue len among all vifs and use that
to calculate worst scenario.

Wei.

^ permalink raw reply

* RE: [PATCH net-next] hyperv: Add support for physically discontinuous receive buffer
From: Haiyang Zhang @ 2014-01-20 22:06 UTC (permalink / raw)
  To: David Miller
  Cc: netdev@vger.kernel.org, KY Srinivasan, olaf@aepfle.de,
	jasowang@redhat.com, linux-kernel@vger.kernel.org,
	driverdev-devel@linuxdriverproject.org
In-Reply-To: <20140114.143133.1863157816134006560.davem@davemloft.net>



> -----Original Message-----
> From: David Miller [mailto:davem@davemloft.net]
> Sent: Tuesday, January 14, 2014 5:32 PM
> To: Haiyang Zhang
> Cc: netdev@vger.kernel.org; KY Srinivasan; olaf@aepfle.de;
> jasowang@redhat.com; linux-kernel@vger.kernel.org; driverdev-
> devel@linuxdriverproject.org
> Subject: Re: [PATCH net-next] hyperv: Add support for physically discontinuous
> receive buffer
> 
> From: Haiyang Zhang <haiyangz@microsoft.com>
> Date: Thu,  9 Jan 2014 14:24:47 -0800
> 
> > This will allow us to use bigger receive buffer, and prevent
> > allocation failure due to fragmented memory.
> >
> > Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> > Reviewed-by: K. Y. Srinivasan <kys@microsoft.com>
> 
> Not until you start using paged SKBs in netvsc_recv_callback.
> 
> Whatever fragmention you think you're avoiding in the hyperv layer, you're still
> going to get from the:
> 
> 	skb = netdev_alloc_skb_ip_align(net, packet->total_data_buflen);
> 
> call there.
> 
> This change makes no sense in isolation, therefore I'm not applying it until you
> also include the appropriate changes to avoid the same exact fragmentation
> issue in netvsc_drv.c as stated above.

The receive buffer currently requires multiple MB of physically continuous memory,
and may fail to be allocated when memory is fragmented. The patch is created for
this issue.

The SKB buffer is usually less than 1500 bytes or up to several KB with jumbo frame, 
so it's much less sensitive to fragmented memory. I will work on another patch to use 
SKB buffer with discontinuous pages. 

Could you accept this patch separately?

Thanks,
- Haiyang

^ permalink raw reply

* RE: [PATCH net-next] hyperv: Add support for physically discontinuous receive buffer
From: KY Srinivasan @ 2014-01-20 22:10 UTC (permalink / raw)
  To: Haiyang Zhang, David Miller
  Cc: netdev@vger.kernel.org, olaf@aepfle.de,
	driverdev-devel@linuxdriverproject.org, jasowang@redhat.com,
	linux-kernel@vger.kernel.org
In-Reply-To: <5f60b6bd1ea84eadbb730c7ea9d3d4d5@DFM-DB3MBX15-06.exchange.corp.microsoft.com>



> -----Original Message-----
> From: Haiyang Zhang
> Sent: Monday, January 20, 2014 2:06 PM
> To: David Miller
> Cc: netdev@vger.kernel.org; KY Srinivasan; olaf@aepfle.de;
> jasowang@redhat.com; linux-kernel@vger.kernel.org; driverdev-
> devel@linuxdriverproject.org
> Subject: RE: [PATCH net-next] hyperv: Add support for physically discontinuous
> receive buffer
> 
> 
> 
> > -----Original Message-----
> > From: David Miller [mailto:davem@davemloft.net]
> > Sent: Tuesday, January 14, 2014 5:32 PM
> > To: Haiyang Zhang
> > Cc: netdev@vger.kernel.org; KY Srinivasan; olaf@aepfle.de;
> > jasowang@redhat.com; linux-kernel@vger.kernel.org; driverdev-
> > devel@linuxdriverproject.org
> > Subject: Re: [PATCH net-next] hyperv: Add support for physically discontinuous
> > receive buffer
> >
> > From: Haiyang Zhang <haiyangz@microsoft.com>
> > Date: Thu,  9 Jan 2014 14:24:47 -0800
> >
> > > This will allow us to use bigger receive buffer, and prevent
> > > allocation failure due to fragmented memory.
> > >
> > > Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
> > > Reviewed-by: K. Y. Srinivasan <kys@microsoft.com>
> >
> > Not until you start using paged SKBs in netvsc_recv_callback.
> >
> > Whatever fragmention you think you're avoiding in the hyperv layer, you're still
> > going to get from the:
> >
> > skb = netdev_alloc_skb_ip_align(net, packet->total_data_buflen);
> >
> > call there.
> >
> > This change makes no sense in isolation, therefore I'm not applying it until you
> > also include the appropriate changes to avoid the same exact fragmentation
> > issue in netvsc_drv.c as stated above.
> 
> The receive buffer currently requires multiple MB of physically continuous
> memory,
> and may fail to be allocated when memory is fragmented. The patch is created
> for
> this issue.
> 
> The SKB buffer is usually less than 1500 bytes or up to several KB with jumbo
> frame,
> so it's much less sensitive to fragmented memory. I will work on another patch to
> use
> SKB buffer with discontinuous pages.
> 
> Could you accept this patch separately?

Today, if we try to unload and load the network driver, the load may fail because we may 
not be able to allocate the receive buffers if memory is fragmented. This patch specifically addresses
this problem.

Regards,

K. Y
> 
> Thanks,
> - Haiyang
> 

^ permalink raw reply

* Re: [PATCH net-next v5 8/9] xen-netback: Timeout packets in RX path
From: Wei Liu @ 2014-01-20 22:12 UTC (permalink / raw)
  To: Zoltan Kiss
  Cc: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
In-Reply-To: <20140120220348.GA5058@zion.uk.xensource.com>

On Mon, Jan 20, 2014 at 10:03:48PM +0000, Wei Liu wrote:
[...]
> 
> You beat me to this. Was about to reply to your other email. :-)
> 
> It's also worth mentioning that DIV_ROUND_UP part is merely estimation,
> as you cannot possible know the maximum / miminum queue length of all
> other vifs (as they can be changed during runtime). In practice most
> users will stick with the default, but some advanced users might want to
> tune this value for individual vif (whether that's a good idea or not is
> another topic).
> 
> So, in order to convince myself this is safe. I also did some analysis
> on the impact of having queue length other than default value.  If
> queue_len < XENVIF_QUEUE_LENGTH, that means you can queue less packets
> in qdisc than default and drain it faster than calculated, which is
> safe. On the other hand if queue_len > XENVIF_QUEUE_LENGTH, it means
> actually you need more time than calculated. I'm in two minded here. The
> default value seems sensible to me but I'm still a bit worried about the
> queue_len > XENVIF_QUEUE_LENGTH case.
> 
> An idea is to book-keep maximum tx queue len among all vifs and use that
> to calculate worst scenario.
> 

And unfortunately there doesn't seem to be a way to know when tx queue
length is changed! So this approach won't work. :-(

Wei.

^ permalink raw reply

* [PATCH iproute2] Add IFLA_SLAVE support.
From: Scott Feldman @ 2014-01-20 22:25 UTC (permalink / raw)
  To: stephen; +Cc: netdev, roopa, shm

Show slave details for link when slave has IFLA_SLAVE attributes, e.g.:

ip -d link show eth4
6: eth4: <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> mtu 1500 qdisc pfifo_fast master bond1 state UP mode DEFAULT group default qlen 1000
    link/ether 00:02:00:00:04:03 brd ff:ff:ff:ff:ff:ff promiscuity 1
    slave state ACTIVE mii_status UP link_failure_count 0 perm_hwaddr 00:02:00:00:04:03 queue_id 0 ad_aggregator_id 1
---
 include/linux/if_link.h |   13 +++++++++
 ip/ipaddress.c          |   71 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 098be3d..7f956f6 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -144,6 +144,7 @@ enum {
 	IFLA_NUM_RX_QUEUES,
 	IFLA_CARRIER,
 	IFLA_PHYS_PORT_ID,
+	IFLA_SLAVE,
 	__IFLA_MAX
 };
 
@@ -366,6 +367,18 @@ enum {
 
 #define IFLA_BOND_AD_INFO_MAX   (__IFLA_BOND_AD_INFO_MAX - 1)
 
+enum {
+	IFLA_SLAVE_STATE,
+	IFLA_SLAVE_MII_STATUS,
+	IFLA_SLAVE_LINK_FAILURE_COUNT,
+	IFLA_SLAVE_PERM_HWADDR,
+	IFLA_SLAVE_QUEUE_ID,
+	IFLA_SLAVE_AD_AGGREGATOR_ID,
+	__IFLA_SLAVE_MAX,
+};
+
+#define IFLA_SLAVE_MAX  (__IFLA_SLAVE_MAX - 1)
+
 /* SR-IOV virtual function management section */
 
 enum {
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index d02eaaf..a0d3ab8 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -27,6 +27,7 @@
 
 #include <linux/netdevice.h>
 #include <linux/if_arp.h>
+#include <linux/if_bonding.h>
 #include <linux/sockios.h>
 
 #include "rt_names.h"
@@ -223,6 +224,73 @@ static void print_linktype(FILE *fp, struct rtattr *tb)
 	}
 }
 
+static const char *slave_states[] = {
+	[BOND_STATE_ACTIVE] = "ACTIVE",
+	[BOND_STATE_BACKUP] = "BACKUP",
+};
+
+static void print_slave_state(FILE *f, struct rtattr *tb)
+{
+	unsigned int state = rta_getattr_u8(tb);
+
+	if (state >= sizeof(slave_states) / sizeof(slave_states[0]))
+		fprintf(f, "state %d ", state);
+	else
+		fprintf(f, "state %s ", slave_states[state]);
+}
+
+static const char *slave_mii_status[] = {
+	[BOND_LINK_UP] = "UP",
+	[BOND_LINK_FAIL] = "GOING_DOWN",
+	[BOND_LINK_DOWN] = "DOWN",
+	[BOND_LINK_BACK] = "GOING_BACK",
+};
+
+static void print_slave_mii_status(FILE *f, struct rtattr *tb)
+{
+	unsigned int status = rta_getattr_u8(tb);
+
+	if (status >= sizeof(slave_mii_status) / sizeof(slave_mii_status[0]))
+		fprintf(f, "mii_status %d ", status);
+	else
+		fprintf(f, "mii_status %s ", slave_mii_status[status]);
+}
+
+static void print_slave(FILE *fp, struct rtattr *tb)
+{
+	struct rtattr *slave[IFLA_SLAVE_MAX+1];
+	SPRINT_BUF(b1);
+
+	parse_rtattr_nested(slave, IFLA_SLAVE_MAX, tb);
+
+	if (!slave[IFLA_SLAVE_STATE])
+		return;
+
+	fprintf(fp, "%s    slave ", _SL_);
+	print_slave_state(fp, slave[IFLA_SLAVE_STATE]);
+
+	if (slave[IFLA_SLAVE_MII_STATUS])
+		print_slave_mii_status(fp, slave[IFLA_SLAVE_MII_STATUS]);
+
+	if (slave[IFLA_SLAVE_LINK_FAILURE_COUNT])
+		fprintf(fp, "link_failure_count %d ",
+			rta_getattr_u32(slave[IFLA_SLAVE_LINK_FAILURE_COUNT]));
+
+	if (slave[IFLA_SLAVE_PERM_HWADDR])
+		fprintf(fp, "perm_hwaddr %s ",
+			ll_addr_n2a(RTA_DATA(slave[IFLA_SLAVE_PERM_HWADDR]),
+				    RTA_PAYLOAD(slave[IFLA_SLAVE_PERM_HWADDR]),
+				    0, b1, sizeof(b1)));
+
+	if (slave[IFLA_SLAVE_QUEUE_ID])
+		fprintf(fp, "queue_id %d ",
+			rta_getattr_u16(slave[IFLA_SLAVE_QUEUE_ID]));
+
+	if (slave[IFLA_SLAVE_AD_AGGREGATOR_ID])
+		fprintf(fp, "ad_aggregator_id %d ",
+			rta_getattr_u16(slave[IFLA_SLAVE_AD_AGGREGATOR_ID]));
+}
+
 static void print_vfinfo(FILE *fp, struct rtattr *vfinfo)
 {
 	struct ifla_vf_mac *vf_mac;
@@ -516,6 +584,9 @@ int print_linkinfo(const struct sockaddr_nl *who,
 			print_vfinfo(fp, i);
 	}
 
+	if (do_link && tb[IFLA_SLAVE] && show_details)
+		print_slave(fp, tb[IFLA_SLAVE]);
+
 	fprintf(fp, "\n");
 	fflush(fp);
 	return 0;

^ permalink raw reply related

* Poor network performance x86_64.. also with 3.13
From: Daniel Exner @ 2014-01-20 22:27 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-kernel, netdev
In-Reply-To: <52DB0439.2060905@dragonslave.de>

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA512

Hi,

Am 18.01.2014 23:46, schrieb Daniel Exner:
> Hi again,
> 
> Am 18.01.2014 20:50, schrieb Borislav Petkov:
>> + netdev.
> Thx
> 
> Am 18.01.2014 20:49, schrieb Holger Hoffstätte:> [This mail was
> also posted to gmane.linux.kernel.]
> 
>> On Sat, 18 Jan 2014 20:30:55 +0100, Daniel Exner wrote:
> 
>>> I recently upgraded the Kernel from version 3.10 to latest 
>>> stable 3.12.8, did the usual "make oldconfig" (resulting
>>> config attached).
>>> 
>>> But now I noticed some _really_ low network performance.
> 
>> Try: sysctl net.ipv4.tcp_limit_output_bytes=262144
> 
> Tried that. Even 10 times the value. Same effect.


I just did the same procedure with Kernel Version 3.13: same poor rates.

I think I will try to see of 3.12.6 was still ok and bisect from there.

Greetings
Daniel
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.14 (GNU/Linux)
Comment: Using GnuPG with Thunderbird - http://www.enigmail.net/

iQIcBAEBCgAGBQJS3aLFAAoJEPI6v6bI/Qkf9+4QAJkfljUsRQn6DA6gWy3XYsn4
ZB3F2Mu8kLsCMVjpASsi7+km2qTiFv4qGOgezHCJmqMcdCFszBweGQrnYBLA5PCD
XSZ7G4S0U71aHWtY6iQd1q4ywnA21pfnGRqIpc5+OuIiIOm+YY+RXpJAHC5y1OVo
MxsPL1ZVp/enJoZuvblw6i+JT+soAbSypPWcNQ78qb+CYzLVMLZHcqQvMwpAsRvQ
LNKx8nyj8p32CdZo1GoT3f/nWvBeh/V/ViLrtt64u/oXMJyk5INVRFpSNUUviP8c
42y+r2K31+nY11K2dHsdJYbv5lZ8p8g0SNoLG1SrjgDspaptnT8jptxxn7GcQqdL
PZ3waUB7qYU15IxCA2iXwNPqjtsv8V5l55H/cunKQgxNbb318ui/a3cW7+R++CeL
onv9HFNUkHJiP/MvZJ1/FXE0AsjX70un9NuQ0+xFjCRwJ/YLZzCHkWMERcev500O
vS1yFTGiVY1HndoA4VFYzEkjOyHgDHHQA+0JkfBspVlhL7ow9hccmULZtEn9LzwU
9rooQHyXwdKr6KIbsjHECyjIsBhW4Jfj6195bZ9ddBDBXSqYyGqjiuy7l7TjlZVa
YmPNTlkEfMeXkO2h3km8TD2f+MPntYXkYjZVVUcK8NucgnIdLuWDk/GLrt73VTd8
Cww6B/u4YnGJSF5v/nit
=xCa3
-----END PGP SIGNATURE-----

^ permalink raw reply

* Re: Poor network performance x86_64.. also with 3.13
From: Borislav Petkov @ 2014-01-20 22:37 UTC (permalink / raw)
  To: Daniel Exner; +Cc: linux-kernel, netdev
In-Reply-To: <52DDA2CD.9040001@dragonslave.de>

On Mon, Jan 20, 2014 at 11:27:25PM +0100, Daniel Exner wrote:
> I just did the same procedure with Kernel Version 3.13: same poor rates.
> 
> I think I will try to see of 3.12.6 was still ok and bisect from there.

Or try something more coarse-grained like 3.11 first, then 3.12 and then
the -rcs in between.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply

* Re: [PATCH 2/2] net/cxgb4: Don't retrieve stats during recovery
From: Dimitris Michailidis @ 2014-01-20 22:05 UTC (permalink / raw)
  To: Gavin Shan, netdev
In-Reply-To: <1390187144-15495-2-git-send-email-shangw@linux.vnet.ibm.com>

On 01/19/2014 07:05 PM, Gavin Shan wrote:
> We possiblly retrieve the adapter's statistics during EEH recovery
> and that should be disallowed. Otherwise, it would possibly incur
> replicate EEH error and EEH recovery is going to fail eventually.
> The patch checks if the PCI device is off-line before statistic
> retrieval.

The net_devices are detached during EEH so I think netif_device_present 
is a better check than pci_channel_offline.  I am not sure such a test 
should be left to each driver though.  If you do end up putting it in 
the driver it needs better synchronization with the EEH handlers as Ben 
mentioned.

>
> Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> ---
>   drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |   11 +++++++++++
>   1 file changed, 11 insertions(+)
>
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> index c8eafbf..b0e72fb 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> @@ -4288,6 +4288,17 @@ static struct rtnl_link_stats64 *cxgb_get_stats(struct net_device *dev,
>   	struct port_info *p = netdev_priv(dev);
>   	struct adapter *adapter = p->adapter;
>
> +	/*
> +	 * We possibly retrieve the statistics while the PCI
> +	 * device is off-line. That would cause the recovery
> +	 * on off-lined PCI device going to fail. So it's
> +	 * reasonable to block it during the recovery period.
> +	 */
> +	if (pci_channel_offline(adapter->pdev)) {
> +		memset(ns, 0, sizeof(*ns));
> +		return ns;
> +	}
> +
>   	spin_lock(&adapter->stats_lock);
>   	t4_get_port_stats(adapter, p->tx_chan, &stats);
>   	spin_unlock(&adapter->stats_lock);
>

^ permalink raw reply

* Re: [PATCH] SUNRPC: Allow one callback request to be received from two sk_buff
From: Trond Myklebust @ 2014-01-20 23:17 UTC (permalink / raw)
  To: shaobingqing; +Cc: bfields, davem, linux-nfs, netdev, linux-kernel
In-Reply-To: <1390201154-20815-1-git-send-email-shaobingqing@bwstor.com.cn>

On Mon, 2014-01-20 at 14:59 +0800, shaobingqing wrote:
> In current code, there only one struct rpc_rqst is prealloced. If one 
> callback request is received from two sk_buff, the xprt_alloc_bc_request
> would be execute two times with the same transport->xid. The first time
> xprt_alloc_bc_request will alloc one struct rpc_rqst and the TCP_RCV_COPY_DATA
> bit of transport->tcp_flags will not be cleared. The second time 
> xprt_alloc_bc_request could not alloc struct rpc_rqst any more and NULL
> pointer will be returned, then xprt_force_disconnect occur. I think one 
> callback request can be allowed to be received from two sk_buff.
> 
> Signed-off-by: shaobingqing <shaobingqing@bwstor.com.cn>
> ---
>  net/sunrpc/xprtsock.c |   11 +++++++++--
>  1 files changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> index ee03d35..606950d 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -1271,8 +1271,13 @@ static inline int xs_tcp_read_callback(struct rpc_xprt *xprt,
>  	struct sock_xprt *transport =
>  				container_of(xprt, struct sock_xprt, xprt);
>  	struct rpc_rqst *req;
> +	static struct rpc_rqst *req_partial;
> +
> +	if (req_partial == NULL)
> +		req = xprt_alloc_bc_request(xprt);
> +	else if (req_partial->rq_xid == transport->tcp_xid)
> +		req = req_partial;

What happens here if req_partial->rq_xid != transport->tcp_xid? AFAICS,
req will be undefined. Either way, you cannot use a static variable for
storage here: that isn't re-entrant.

> -	req = xprt_alloc_bc_request(xprt);
>  	if (req == NULL) {
>  		printk(KERN_WARNING "Callback slot table overflowed\n");
>  		xprt_force_disconnect(xprt);
> @@ -1285,6 +1290,7 @@ static inline int xs_tcp_read_callback(struct rpc_xprt *xprt,
>  
>  	if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) {
>  		struct svc_serv *bc_serv = xprt->bc_serv;
> +		req_partial = NULL;
>  
>  		/*
>  		 * Add callback request to callback list.  The callback
> @@ -1297,7 +1303,8 @@ static inline int xs_tcp_read_callback(struct rpc_xprt *xprt,
>  		list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
>  		spin_unlock(&bc_serv->sv_cb_lock);
>  		wake_up(&bc_serv->sv_cb_waitq);
> -	}
> +	} else
> +		req_partial = req;
>  
>  	req->rq_private_buf.len = transport->tcp_copied;
>  


-- 
Trond Myklebust
Linux NFS client maintainer

^ permalink raw reply

* [PATCH net-next] net: filter: let bpf_tell_extensions return SKF_AD_MAX
From: Daniel Borkmann @ 2014-01-20 23:19 UTC (permalink / raw)
  To: davem; +Cc: netdev, Michal Sekletar, Eric Dumazet

Michal Sekletar added in commit ea02f9411d9f ("net: introduce
SO_BPF_EXTENSIONS") a facility where user space can enquire
the BPF ancillary instruction set, which is imho a step into
the right direction for letting user space high-level to BPF
optimizers make an informed decision for possibly using these
extensions.

The original rationale was to return through a getsockopt(2)
a bitfield of which instructions are supported and which
are not, as of right now, we just return 0 to indicate a
base support for SKF_AD_PROTOCOL up to SKF_AD_PAY_OFFSET.
Limitations of this approach are that this API which we need
to maintain for a long time can only support a maximum of 32
extensions, and needs to be additionally maintained/updated
when each new extension that comes in.

I thought about this a bit more and what we can do here to
overcome this is to just return SKF_AD_MAX. Since we never
remove any extension since we cannot break user space and
always linearly increase SKF_AD_MAX on each newly added
extension, user space can make a decision on what extensions
are supported in the whole set of extensions and which aren't,
by just checking which of them from the whole set have an
offset < SKF_AD_MAX of the underlying kernel.

Since SKF_AD_MAX must be updated each time we add new ones,
we don't need to introduce an additional enum and got
maintenance for free. At some point in time when
SO_BPF_EXTENSIONS becomes ubiquitous for most kernels, then
an application can simply make use of this and easily be run
on newer or older underlying kernels without needing to be
recompiled, of course. Since that is for 3.14, it's not too
late to do this change.

Cc: Michal Sekletar <msekleta@redhat.com>
Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Daniel Borkmann <dborkman@redhat.com>
---
 include/linux/filter.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 1a95a2d..e568c8e 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -85,13 +85,7 @@ static inline void bpf_jit_free(struct sk_filter *fp)
 
 static inline int bpf_tell_extensions(void)
 {
-	/* When adding new BPF extension it is necessary to enumerate
-	 * it here, so userspace software which wants to know what is
-	 * supported can do so by inspecting return value of this
-	 * function
-	 */
-
-	return 0;
+	return SKF_AD_MAX;
 }
 
 enum {
-- 
1.8.3.1

^ permalink raw reply related

* Re: Poor network performance x86_64.. also with 3.13
From: Branimir Maksimovic @ 2014-01-20 23:27 UTC (permalink / raw)
  To: linux-kernel; +Cc: Borislav Petkov, Daniel Exner, netdev
In-Reply-To: <20140120223752.GA3057@pd.tnic>

On 01/20/2014 11:37 PM, Borislav Petkov wrote:
> On Mon, Jan 20, 2014 at 11:27:25PM +0100, Daniel Exner wrote:
>> I just did the same procedure with Kernel Version 3.13: same poor rates.
>>
>> I think I will try to see of 3.12.6 was still ok and bisect from there.
> Or try something more coarse-grained like 3.11 first, then 3.12 and then
> the -rcs in between.
>
Hm, on my machine 3.13 (latest git) has double throughtput of 3.11 (distro
compiled) on loopback interface. 68Gb vs 33Gb (iperf).

^ permalink raw reply

* Re: [PATCH] DT: net: document Ethernet bindings in one place
From: Sergei Shtylyov @ 2014-01-20 23:33 UTC (permalink / raw)
  To: Rob Herring
  Cc: netdev, Rob Herring, Pawel Moll, Mark Rutland, Ian Campbell,
	Kumar Gala, devicetree@vger.kernel.org, Rob Landley,
	linux-doc@vger.kernel.org
In-Reply-To: <CAL_JsqJXbF1-PcPHR2VP+Vi9A1aWizdsG_r3kDvRt3itXDhCGQ@mail.gmail.com>

On 01/21/2014 12:20 AM, Rob Herring wrote:

>>>> This patch is an attempt to gather the Ethernet related bindings in one
>>>> file,
>>>> like it's done in the MMC and some other subsystems. It should save the
>>>> trouble
>>>> of documenting several properties over and over in each binding document.

>>>> I have used the Embedded Power Architecture(TM) Platform Requirements
>>>> (ePAPR)
>>>> standard as a base for the properties description, also documenting some
>>>> ad-hoc
>>>> properties that have been introduced over time despite having direct
>>>> analogs in
>>>> ePAPR.

>>>> Signed-off-by: Sergei Shtylyov <sergei.shtylyov@cogentembedded.com>

>>>> ---
>>>> The patch is against DaveM's 'net-next.git' repo and the DaVinci EMAC
>>>> bindings
>>>> fix I've posted yesterday:

>>>> http://patchwork.ozlabs.org/patch/311854/

>> [...]

>>>> Index: net-next/Documentation/devicetree/bindings/net/ethernet.txt
>>>> ===================================================================
>>>> --- /dev/null
>>>> +++ net-next/Documentation/devicetree/bindings/net/ethernet.txt
>>>> @@ -0,0 +1,20 @@
>>>> +The following properties are common to the Ethernet controllers:
>>>> +
>>>> +- local-mac-address: array of 6 bytes, specifies the MAC address that
>>>> was
>>>> +  assigned to the network device;
>>>> +- mac-address: array of 6 bytes, specifies the MAC address that was last
>>>> used by
>>>> +  the boot program; should be used in cases where the MAC address
>>>> assigned to
>>>> +  the device by the boot program is different from the
>>>> "local-mac-address"
>>>> +  property;
>>>> +- max-speed: number, specifies maximum speed in Mbit/s supported by the
>>>> device;
>>>> +- phy-mode: string, operation mode of the PHY interface; supported
>>>> values are
>>>> +  "mii", "gmii", "sgmii", "tbi", "rev-mii", "rmii", "rgmii", "rgmii-id",
>>>> +  "rgmii-rxid", "rgmii-txid", "rtbi", "smii", "xgmii";

>>> Mark this as deprecated

>>     That's kind of wishful thinking at this point, as that's what the
>> majority of drivers use. I'm unsure of the reasons why that was done,
>> probably people just didn't read the proper specs...

> Or the spec was defined after those bindings.

    No, that's not likely as "phy-connection-type" prop seems very old, most 
probably predating ePAPR. ePAPR exists since 2008, kernel support for 
"phy-mode" prop dates back only to 2011.

> Deprecating does not
> matter for existing bindings. It's only defining new ones that are
> affected. I was more concerned with giving people guidance on which
> one to use for new bindings.

    If "phy-connection-type" is to be used, it makes sense to modify 
of_get_phy_mode() to also look for that prop, right?

>>> in favor of phy-connection-type

>>     That one is only used by the oldish PowerPC 'gianfar' driver.

>>> so it's use does not spread.

>>     I'm afraid that's too late, it has spread very far, so that
>> of_get_phy_mode() handles that property, not "phy-connection-type".

> Uggg, I guess this is a case of a defacto standard then if the kernel
> doesn't even support it.

    What's your guess on what to do on these 2 props then? Still deprecate 
"phy-mode"?

>>>> +- phy-connection-type: the same as "phy-mode" property (but described in
>>>> ePAPR);
>>>> +- phy-handle: phandle, specifies a reference to a node representing a
>>>> PHY
>>>> +  device (this property is described in ePAPR);
>>>> +- phy: the same as "phy-handle" property (but actually ad-hoc one).

>>> Mark this as deprecated in favor of phy-handle.

>>     Here situation is more optimistic. Quite many drivers still use
>> "phy-handle", though some use even more exotic props I didn't document here.

> Perhaps flagging as "Not recommended for new bindings" would be nicer wording...

    Perhaps.

> Rob

WBR, Sergei


^ permalink raw reply

* Release of nftables-plus 0.099
From: Jan Engelhardt @ 2014-01-20 23:38 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Netfilter Developer Mailing List, Netfilter user mailing list,
	announce, Linux Networking Developer Mailing List, coreteam
In-Reply-To: <20140120131132.GA32427@macbook.localnet>


My lone self presents:

	nftables-plus 0.099


Overview
========

A package that is based upon nftables and adds usability patches.


Status
======

In this first addon release, the build system was converted to automake 
to give distributions a better build experience.


Naming
======

	What's in a name?


Resources
=========

nftables-plus is obtainable from

http://xtables.de/
git://git.xtables.de/nftables-plus

The same build requirements as plain nftables apply. Apparently 
minus lex and bison, which have become optional.

The code appears now.

^ permalink raw reply

* Re: [netfilter-core] Release of nftables-plus 0.099
From: Patrick McHardy @ 2014-01-20 23:41 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Linux Networking Developer Mailing List,
	Netfilter user mailing list, Netfilter Developer Mailing List,
	coreteam, announce
In-Reply-To: <alpine.LSU.2.11.1401202336140.22927@nerf08.vanv.qr>

On Tue, Jan 21, 2014 at 12:38:41AM +0100, Jan Engelhardt wrote:
> 
> My lone self presents:
> 
> 	nftables-plus 0.099

Jan, there's no need to CC the netfilter-core list, we're not interested
in your ego trips.

^ permalink raw reply

* Re: [netfilter-core] Release of nftables-plus 0.099
From: Jan Engelhardt @ 2014-01-21  0:00 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Linux Networking Developer Mailing List,
	Netfilter user mailing list, Netfilter Developer Mailing List,
	coreteam, announce
In-Reply-To: <20140120234150.GA17224@macbook.localnet>

On Tuesday 2014-01-21 00:41, Patrick McHardy wrote:

>On Tue, Jan 21, 2014 at 12:38:41AM +0100, Jan Engelhardt wrote:
>> 
>> My lone self presents:
>> 
>> 	nftables-plus 0.099
>
>Jan, there's no need to CC the netfilter-core list, we're not interested
>in your ego trips.

Well thanks for spelling out so clearly that I am not welcome anymore.

(That being said, someone will find itself to also complain to 
a stripped Cc. It happened regularly on lkml.)

^ permalink raw reply

* r8169 won't transmit with 3.12
From: Craig Small @ 2014-01-21  0:06 UTC (permalink / raw)
  To: Realtek linux nic maintainers, Francois Romieu; +Cc: netdev

Hi,
  I seem to be having lots of troubles with the RealTek chipsets. I have
one onboard and two PCI-e cards and they all have the same problem. They
will not transmit anything.  The r8169 driver does it, as does the 
r8168-dkms module.

It's a new setup so it might of never worked. It's not likely to be a
hardware problem as its three different devices.

I've sent what I think you might need for starters, but if there
is extra stuff you'd like to see, let me know.

I'm using a stock Debian kernel
3.12-1-amd64 #1 SMP Debian 3.12.6-2 (2013-12-29)

I've included the lspci output of one of the cards below. I'm not
really sure what else you'd need to check things.

The problem shows up the same, the TX dropped counter increments.
I'm not sure why 42 packets made it out (or even if they really did)
Receive works fine, I can even start up wireshark and see packets
pass by.

eth0      Link encap:Ethernet  HWaddr 00:e0:4c:80:66:57  
          inet addr:192.168.1.222  Bcast:192.168.1.255  Mask:255.255.255.0
          inet6 addr: fe80::2e0:4cff:fe80:6657/64 Scope:Link
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          RX packets:8252 errors:0 dropped:0 overruns:0 frame:0
          TX packets:42 errors:0 dropped:8029 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:584930 (571.2 KiB)  TX bytes:5083 (4.9 KiB)

ethtool -S shows a similar story, not sure if the rx_missed counter
is another problem:
NIC statistics:
     tx_packets: 47
     rx_packets: 31342
     tx_errors: 0
     rx_errors: 0
     rx_missed: 47438
     align_errors: 0
     tx_single_collisions: 0
     tx_multi_collisions: 0
     unicast: 1
     broadcast: 29638
     multicast: 1891
     tx_aborted: 0
     tx_underrun: 0

I've disabled the on-board device now, but the simple lspci output was:
03:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8111/8168B PCI Express Gigabit Ethernet controller (rev 06)

The PCI-e cards have the following:

03:00.0 Ethernet controller: Realtek Semiconductor Co., Ltd. RTL8111/8168/8411 PCI Express Gigabit Ethernet Controller (rev 02)
	Subsystem: Realtek Semiconductor Co., Ltd. RTL8111/8168 PCI Express Gigabit Ethernet controller
	Control: I/O+ Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr- Stepping- SERR- FastB2B- DisINTx+
	Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort+ >SERR- <PERR- INTx-
	Latency: 0, Cache Line Size: 64 bytes
	Interrupt: pin A routed to IRQ 73
	Region 0: I/O ports at d000 [size=256]
	Region 2: Memory at fde00000 (64-bit, non-prefetchable) [size=4K]
	Region 4: Memory at d2200000 (64-bit, prefetchable) [size=64K]
	Expansion ROM at d2210000 [disabled] [size=64K]
	Capabilities: [40] Power Management version 2
		Flags: PMEClk- DSI- D1+ D2+ AuxCurrent=375mA PME(D0-,D1+,D2+,D3hot+,D3cold+)
		Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=0 PME-
	Capabilities: [50] MSI: Enable+ Count=1/1 Maskable- 64bit-
		Address: fee3f00c  Data: 4123
	Capabilities: [70] Express (v1) Endpoint, MSI 08
		DevCap:	MaxPayload 1024 bytes, PhantFunc 0, Latency L0s <128ns, L1 unlimited
			ExtTag+ AttnBtn- AttnInd- PwrInd- RBE- FLReset-
		DevCtl:	Report errors: Correctable- Non-Fatal- Fatal- Unsupported-
			RlxdOrd- ExtTag- PhantFunc- AuxPwr- NoSnoop-
			MaxPayload 128 bytes, MaxReadReq 4096 bytes
		DevSta:	CorrErr+ UncorrErr- FatalErr- UnsuppReq- AuxPwr+ TransPend+
		LnkCap:	Port #16, Speed unknown, Width x22, ASPM not supported, Exit Latency L0s <64ns, L1 <2us
			ClockPM+ Surprise- LLActRep- BwNot-
		LnkCtl:	ASPM Disabled; RCB 64 bytes Disabled- CommClk-
			ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-
		LnkSta:	Speed 2.5GT/s, Width x1, TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
	Capabilities: [b0] MSI-X: Enable- Count=2 Masked-
		Vector table: BAR=4 offset=00000000
		PBA: BAR=4 offset=00000800
	Capabilities: [d0] Vital Product Data
		Unknown small resource type 05, will not decode more.
	Capabilities: [100 v1] Advanced Error Reporting
		UESta:	DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
		UEMsk:	DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
		UESvrt:	DLP+ SDES+ TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt- RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol-
		CESta:	RxErr+ BadTLP+ BadDLLP+ Rollover- Timeout- NonFatalErr-
		CEMsk:	RxErr- BadTLP- BadDLLP- Rollover- Timeout- NonFatalErr+
		AERCap:	First Error Pointer: 00, GenCap+ CGenEn- ChkCap+ ChkEn-
	Capabilities: [140 v1] Virtual Channel
		Caps:	LPEVC=0 RefClk=100ns PATEntryBits=1
		Arb:	Fixed- WRR32- WRR64- WRR128-
		Ctrl:	ArbSelect=Fixed
		Status:	InProgress-
		VC0:	Caps:	PATOffset=00 MaxTimeSlots=1 RejSnoopTrans-
			Arb:	Fixed- WRR32- WRR64- WRR128- TWRR128- WRR256-
			Ctrl:	Enable+ ID=0 ArbSelect=Fixed TC/VC=01
			Status:	NegoPending- InProgress-
	Capabilities: [160 v1] Device Serial Number 00-00-00-00-00-00-00-00
	Kernel driver in use: r8169

-- 
Craig Small (@smallsees)   http://enc.com.au/       csmall at : enc.com.au
Debian GNU/Linux           http://www.debian.org/   csmall at : debian.org
GPG fingerprint:        5D2F B320 B825 D939 04D2  0519 3938 F96B DF50 FEA5

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox