Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v4 6/9] xen-netback: Handle guests with too many frags
From: Zoltan Kiss @ 2014-01-14 20:39 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss
In-Reply-To: <1389731995-9887-1-git-send-email-zoltan.kiss@citrix.com>

Xen network protocol had implicit dependency on MAX_SKB_FRAGS. Netback has to
handle guests sending up to XEN_NETBK_LEGACY_SLOTS_MAX slots. To achieve that:
- create a new skb
- map the leftover slots to its frags (no linear buffer here!)
- chain it to the previous through skb_shinfo(skb)->frag_list
- map them
- copy the whole stuff into a brand new skb and send it to the stack
- unmap the 2 old skb's pages

v3:
- adding extra check for frag number
- consolidate alloc_skb's into xenvif_alloc_skb()
- BUG_ON(frag_overflow > MAX_SKB_FRAGS)

v4:
- handle error of skb_copy_expand()

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>

---
 drivers/net/xen-netback/netback.c |  125 ++++++++++++++++++++++++++++++++++---
 1 file changed, 115 insertions(+), 10 deletions(-)

diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index c2b2597..345c6a2 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -802,6 +802,20 @@ static inline void xenvif_tx_create_gop(struct xenvif *vif,
 
 }
 
+static inline struct sk_buff *xenvif_alloc_skb(unsigned int size)
+{
+	struct sk_buff *skb =
+		alloc_skb(size + NET_SKB_PAD + NET_IP_ALIGN,
+			  GFP_ATOMIC | __GFP_NOWARN);
+	if (unlikely(skb == NULL))
+		return NULL;
+
+	/* Packets passed to netif_rx() must have some headroom. */
+	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+
+	return skb;
+}
+
 static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 							struct sk_buff *skb,
 							struct xen_netif_tx_request *txp,
@@ -812,11 +826,16 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 	u16 pending_idx = *((u16 *)skb->data);
 	int start;
 	pending_ring_idx_t index;
-	unsigned int nr_slots;
+	unsigned int nr_slots, frag_overflow = 0;
 
 	/* At this point shinfo->nr_frags is in fact the number of
 	 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
 	 */
+	if (shinfo->nr_frags > MAX_SKB_FRAGS) {
+		frag_overflow = shinfo->nr_frags - MAX_SKB_FRAGS;
+		BUG_ON(frag_overflow > MAX_SKB_FRAGS);
+		shinfo->nr_frags = MAX_SKB_FRAGS;
+	}
 	nr_slots = shinfo->nr_frags;
 
 	/* Skip first skb fragment if it is on same page as header fragment. */
@@ -832,6 +851,29 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 
 	BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
 
+	if (frag_overflow) {
+		struct sk_buff *nskb = xenvif_alloc_skb(0);
+		if (unlikely(nskb == NULL)) {
+			netdev_err(vif->dev,
+				   "Can't allocate the frag_list skb.\n");
+			return NULL;
+		}
+
+		shinfo = skb_shinfo(nskb);
+		frags = shinfo->frags;
+
+		for (shinfo->nr_frags = 0; shinfo->nr_frags < frag_overflow;
+		     shinfo->nr_frags++, txp++, gop++) {
+			index = pending_index(vif->pending_cons++);
+			pending_idx = vif->pending_ring[index];
+			xenvif_tx_create_gop(vif, pending_idx, txp, gop);
+			frag_set_pending_idx(&frags[shinfo->nr_frags],
+					     pending_idx);
+		}
+
+		skb_shinfo(skb)->frag_list = nskb;
+	}
+
 	return gop;
 }
 
@@ -845,6 +887,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 	struct pending_tx_info *tx_info;
 	int nr_frags = shinfo->nr_frags;
 	int i, err, start;
+	struct sk_buff *first_skb = NULL;
 
 	/* Check status of header. */
 	err = gop->status;
@@ -867,6 +910,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 	/* Skip first skb fragment if it is on same page as header fragment. */
 	start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
 
+check_frags:
 	for (i = start; i < nr_frags; i++) {
 		int j, newerr;
 
@@ -903,11 +947,20 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 		/* Not the first error? Preceding frags already invalidated. */
 		if (err)
 			continue;
-
 		/* First error: invalidate header and preceding fragments. */
-		pending_idx = *((u16 *)skb->data);
-		xenvif_idx_unmap(vif, pending_idx);
-		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_OKAY);
+		if (!first_skb) {
+			pending_idx = *((u16 *)skb->data);
+			xenvif_idx_unmap(vif, pending_idx);
+			xenvif_idx_release(vif,
+					   pending_idx,
+					   XEN_NETIF_RSP_OKAY);
+		} else {
+			pending_idx = *((u16 *)first_skb->data);
+			xenvif_idx_unmap(vif, pending_idx);
+			xenvif_idx_release(vif,
+					   pending_idx,
+					   XEN_NETIF_RSP_OKAY);
+		}
 		for (j = start; j < i; j++) {
 			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
 			xenvif_idx_unmap(vif, pending_idx);
@@ -919,6 +972,32 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 		err = newerr;
 	}
 
+	if (shinfo->frag_list) {
+		first_skb = skb;
+		skb = shinfo->frag_list;
+		shinfo = skb_shinfo(skb);
+		nr_frags = shinfo->nr_frags;
+		start = 0;
+
+		goto check_frags;
+	}
+
+	/* There was a mapping error in the frag_list skb. We have to unmap
+	 * the first skb's frags
+	 */
+	if (first_skb && err) {
+		int j;
+		shinfo = skb_shinfo(first_skb);
+		pending_idx = *((u16 *)first_skb->data);
+		start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx);
+		for (j = start; j < shinfo->nr_frags; j++) {
+			pending_idx = frag_get_pending_idx(&shinfo->frags[j]);
+			xenvif_idx_unmap(vif, pending_idx);
+			xenvif_idx_release(vif, pending_idx,
+					   XEN_NETIF_RSP_OKAY);
+		}
+	}
+
 	*gopp = gop + 1;
 	return err;
 }
@@ -1422,8 +1501,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 			    ret < XEN_NETBK_LEGACY_SLOTS_MAX) ?
 			PKT_PROT_LEN : txreq.size;
 
-		skb = alloc_skb(data_len + NET_SKB_PAD + NET_IP_ALIGN,
-				GFP_ATOMIC | __GFP_NOWARN);
+		skb = xenvif_alloc_skb(data_len);
 		if (unlikely(skb == NULL)) {
 			netdev_dbg(vif->dev,
 				   "Can't allocate a skb in start_xmit.\n");
@@ -1431,9 +1509,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 			break;
 		}
 
-		/* Packets passed to netif_rx() must have some headroom. */
-		skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
-
 		if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
 			struct xen_netif_extra_info *gso;
 			gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
@@ -1495,6 +1570,7 @@ static int xenvif_tx_submit(struct xenvif *vif)
 		struct xen_netif_tx_request *txp;
 		u16 pending_idx;
 		unsigned data_len;
+		struct sk_buff *nskb = NULL;
 
 		pending_idx = *((u16 *)skb->data);
 		txp = &vif->pending_tx_info[pending_idx].req;
@@ -1537,6 +1613,32 @@ static int xenvif_tx_submit(struct xenvif *vif)
 				  pending_idx :
 				  INVALID_PENDING_IDX);
 
+		if (skb_shinfo(skb)->frag_list) {
+			nskb = skb_shinfo(skb)->frag_list;
+			xenvif_fill_frags(vif, nskb, INVALID_PENDING_IDX);
+			skb->len += nskb->len;
+			skb->data_len += nskb->len;
+			skb->truesize += nskb->truesize;
+			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+			skb_shinfo(nskb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+			vif->tx_zerocopy_sent += 2;
+			nskb = skb;
+
+			skb = skb_copy_expand(skb,
+					      0,
+					      0,
+					      GFP_ATOMIC | __GFP_NOWARN);
+			if (!skb) {
+				netdev_dbg(vif->dev,
+					   "Can't consolidate skb with too many fragments\n");
+				if (skb_shinfo(nskb)->destructor_arg)
+					skb_shinfo(nskb)->tx_flags |=
+						SKBTX_DEV_ZEROCOPY;
+				kfree_skb(nskb);
+				continue;
+			}
+			skb_shinfo(skb)->destructor_arg = NULL;
+		}
 		if (skb_is_nonlinear(skb) && skb_headlen(skb) < PKT_PROT_LEN) {
 			int target = min_t(int, skb->len, PKT_PROT_LEN);
 			__pskb_pull_tail(skb, target - skb_headlen(skb));
@@ -1590,6 +1692,9 @@ static int xenvif_tx_submit(struct xenvif *vif)
 		}
 
 		netif_receive_skb(skb);
+
+		if (nskb)
+			kfree_skb(nskb);
 	}
 
 	return work_done;

^ permalink raw reply related

* [PATCH net-next v4 5/9] xen-netback: Add stat counters for zerocopy
From: Zoltan Kiss @ 2014-01-14 20:39 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss
In-Reply-To: <1389731995-9887-1-git-send-email-zoltan.kiss@citrix.com>

These counters help determine how often the buffers had to be copied. Also
they help find out if packets are leaked, as if "sent != success + fail",
there are probably packets never freed up properly.

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
---
 drivers/net/xen-netback/common.h    |    3 +++
 drivers/net/xen-netback/interface.c |   15 +++++++++++++++
 drivers/net/xen-netback/netback.c   |    9 ++++++++-
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 419e63c..e3c28ff 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -155,6 +155,9 @@ struct xenvif {
 
 	/* Statistics */
 	unsigned long rx_gso_checksum_fixup;
+	unsigned long tx_zerocopy_sent;
+	unsigned long tx_zerocopy_success;
+	unsigned long tx_zerocopy_fail;
 
 	/* Miscellaneous private stuff. */
 	struct net_device *dev;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index af5216f..75fe683 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -239,6 +239,21 @@ static const struct xenvif_stat {
 		"rx_gso_checksum_fixup",
 		offsetof(struct xenvif, rx_gso_checksum_fixup)
 	},
+	/* If (sent != success + fail), there are probably packets never
+	 * freed up properly!
+	 */
+	{
+		"tx_zerocopy_sent",
+		offsetof(struct xenvif, tx_zerocopy_sent),
+	},
+	{
+		"tx_zerocopy_success",
+		offsetof(struct xenvif, tx_zerocopy_success),
+	},
+	{
+		"tx_zerocopy_fail",
+		offsetof(struct xenvif, tx_zerocopy_fail)
+	},
 };
 
 static int xenvif_get_sset_count(struct net_device *dev, int string_set)
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index a1b03e4..e2dd565 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1611,8 +1611,10 @@ static int xenvif_tx_submit(struct xenvif *vif, int budget)
 		 * skb_copy_ubufs while we are still in control of the skb. E.g.
 		 * the __pskb_pull_tail earlier can do such thing.
 		 */
-		if (skb_shinfo(skb)->destructor_arg)
+		if (skb_shinfo(skb)->destructor_arg) {
 			skb_shinfo(skb)->tx_flags |= SKBTX_DEV_ZEROCOPY;
+			vif->tx_zerocopy_sent++;
+		}
 
 		netif_receive_skb(skb);
 	}
@@ -1645,6 +1647,11 @@ void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
 		napi_schedule(&vif->napi);
 	} while (ubuf);
 	spin_unlock_irqrestore(&vif->dealloc_lock, flags);
+
+	if (likely(zerocopy_success))
+		vif->tx_zerocopy_success++;
+	else
+		vif->tx_zerocopy_fail++;
 }
 
 static inline void xenvif_tx_action_dealloc(struct xenvif *vif)

^ permalink raw reply related

* [PATCH net-next v4 3/9] xen-netback: Remove old TX grant copy definitons and fix indentations
From: Zoltan Kiss @ 2014-01-14 20:39 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss
In-Reply-To: <1389731995-9887-1-git-send-email-zoltan.kiss@citrix.com>

These became obsolate with grant mapping. I've left intentionally the
indentations in this way, to improve readability of previous patches.

v2:
- move the indentation fixup patch here

v4:
- indentation fixes

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
---
 drivers/net/xen-netback/common.h  |   37 +------------------
 drivers/net/xen-netback/netback.c |   72 ++++++++-----------------------------
 2 files changed, 15 insertions(+), 94 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index f35a3ce..2b1cd83 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -46,39 +46,9 @@
 #include <xen/xenbus.h>
 
 typedef unsigned int pending_ring_idx_t;
-#define INVALID_PENDING_RING_IDX (~0U)
 
-/* For the head field in pending_tx_info: it is used to indicate
- * whether this tx info is the head of one or more coalesced requests.
- *
- * When head != INVALID_PENDING_RING_IDX, it means the start of a new
- * tx requests queue and the end of previous queue.
- *
- * An example sequence of head fields (I = INVALID_PENDING_RING_IDX):
- *
- * ...|0 I I I|5 I|9 I I I|...
- * -->|<-INUSE----------------
- *
- * After consuming the first slot(s) we have:
- *
- * ...|V V V V|5 I|9 I I I|...
- * -----FREE->|<-INUSE--------
- *
- * where V stands for "valid pending ring index". Any number other
- * than INVALID_PENDING_RING_IDX is OK. These entries are considered
- * free and can contain any number other than
- * INVALID_PENDING_RING_IDX. In practice we use 0.
- *
- * The in use non-INVALID_PENDING_RING_IDX (say 0, 5 and 9 in the
- * above example) number is the index into pending_tx_info and
- * mmap_pages arrays.
- */
 struct pending_tx_info {
-	struct xen_netif_tx_request req; /* coalesced tx request */
-	pending_ring_idx_t head; /* head != INVALID_PENDING_RING_IDX
-				  * if it is head of one or more tx
-				  * reqs
-				  */
+	struct xen_netif_tx_request req; /* tx request */
 	/* callback data for released SKBs. The	callback is always
 	 * xenvif_zerocopy_callback, ctx points to the next fragment, desc
 	 * contains the pending_idx
@@ -135,11 +105,6 @@ struct xenvif {
 	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
 	grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
 
-	/* Coalescing tx requests before copying makes number of grant
-	 * copy ops greater or equal to number of slots required. In
-	 * worst case a tx request consumes 2 gnttab_copy.
-	 */
-	struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
 	struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
 	struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
 
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 5724468..f74fa92 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -71,16 +71,6 @@ module_param(fatal_skb_slots, uint, 0444);
  */
 #define XEN_NETBK_LEGACY_SLOTS_MAX XEN_NETIF_NR_SLOTS_MIN
 
-/*
- * If head != INVALID_PENDING_RING_IDX, it means this tx request is head of
- * one or more merged tx requests, otherwise it is the continuation of
- * previous tx request.
- */
-static inline int pending_tx_is_head(struct xenvif *vif, RING_IDX idx)
-{
-	return vif->pending_tx_info[idx].head != INVALID_PENDING_RING_IDX;
-}
-
 static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
 			       u8 status);
 
@@ -762,19 +752,6 @@ static int xenvif_count_requests(struct xenvif *vif,
 	return slots;
 }
 
-static struct page *xenvif_alloc_page(struct xenvif *vif,
-				      u16 pending_idx)
-{
-	struct page *page;
-
-	page = alloc_page(GFP_ATOMIC|__GFP_COLD);
-	if (!page)
-		return NULL;
-	vif->mmap_pages[pending_idx] = page;
-
-	return page;
-}
-
 static inline void xenvif_tx_create_gop(struct xenvif *vif,
 					u16 pending_idx,
 					struct xen_netif_tx_request *txp,
@@ -797,13 +774,9 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 	struct skb_shared_info *shinfo = skb_shinfo(skb);
 	skb_frag_t *frags = shinfo->frags;
 	u16 pending_idx = *((u16 *)skb->data);
-	u16 head_idx = 0;
-	int slot, start;
-	struct page *page;
-	pending_ring_idx_t index, start_idx = 0;
-	uint16_t dst_offset;
+	int start;
+	pending_ring_idx_t index;
 	unsigned int nr_slots;
-	struct pending_tx_info *first = NULL;
 
 	/* At this point shinfo->nr_frags is in fact the number of
 	 * slots, which can be as large as XEN_NETBK_LEGACY_SLOTS_MAX.
@@ -815,8 +788,8 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 
 	for (shinfo->nr_frags = start; shinfo->nr_frags < nr_slots;
 	     shinfo->nr_frags++, txp++, gop++) {
-				index = pending_index(vif->pending_cons++);
-				pending_idx = vif->pending_ring[index];
+		index = pending_index(vif->pending_cons++);
+		pending_idx = vif->pending_ring[index];
 		xenvif_tx_create_gop(vif, pending_idx, txp, gop);
 		frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx);
 	}
@@ -824,18 +797,6 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif,
 	BUG_ON(shinfo->nr_frags > MAX_SKB_FRAGS);
 
 	return gop;
-err:
-	/* Unwind, freeing all pages and sending error responses. */
-	while (shinfo->nr_frags-- > start) {
-		xenvif_idx_release(vif,
-				frag_get_pending_idx(&frags[shinfo->nr_frags]),
-				XEN_NETIF_RSP_ERROR);
-	}
-	/* The head too, if necessary. */
-	if (start)
-		xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR);
-
-	return NULL;
 }
 
 static int xenvif_tx_check_gop(struct xenvif *vif,
@@ -848,7 +809,6 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 	struct pending_tx_info *tx_info;
 	int nr_frags = shinfo->nr_frags;
 	int i, err, start;
-	u16 peek; /* peek into next tx request */
 
 	/* Check status of header. */
 	err = gop->status;
@@ -873,14 +833,12 @@ static int xenvif_tx_check_gop(struct xenvif *vif,
 
 	for (i = start; i < nr_frags; i++) {
 		int j, newerr;
-		pending_ring_idx_t head;
 
 		pending_idx = frag_get_pending_idx(&shinfo->frags[i]);
 		tx_info = &vif->pending_tx_info[pending_idx];
-		head = tx_info->head;
 
 		/* Check error status: if okay then remember grant handle. */
-			newerr = (++gop)->status;
+		newerr = (++gop)->status;
 
 		if (likely(!newerr)) {
 			if (vif->grant_tx_handle[pending_idx] !=
@@ -1353,7 +1311,6 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget)
 	       (skb_queue_len(&vif->tx_queue) < budget)) {
 		struct xen_netif_tx_request txreq;
 		struct xen_netif_tx_request txfrags[XEN_NETBK_LEGACY_SLOTS_MAX];
-		struct page *page;
 		struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX-1];
 		u16 pending_idx;
 		RING_IDX idx;
@@ -1728,18 +1685,17 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
 {
 	struct pending_tx_info *pending_tx_info;
 	pending_ring_idx_t index;
-	u16 peek; /* peek into next tx request */
 	unsigned long flags;
 
-		pending_tx_info = &vif->pending_tx_info[pending_idx];
-		spin_lock_irqsave(&vif->response_lock, flags);
-		make_tx_response(vif, &pending_tx_info->req, status);
-		index = pending_index(vif->pending_prod);
-		vif->pending_ring[index] = pending_idx;
-		/* TX shouldn't use the index before we give it back here */
-		mb();
-		vif->pending_prod++;
-		spin_unlock_irqrestore(&vif->response_lock, flags);
+	pending_tx_info = &vif->pending_tx_info[pending_idx];
+	spin_lock_irqsave(&vif->response_lock, flags);
+	make_tx_response(vif, &pending_tx_info->req, status);
+	index = pending_index(vif->pending_prod);
+	vif->pending_ring[index] = pending_idx;
+	/* TX shouldn't use the index before we give it back here */
+	mb();
+	vif->pending_prod++;
+	spin_unlock_irqrestore(&vif->response_lock, flags);
 }
 
 void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)

^ permalink raw reply related

* [PATCH net-next v4 1/9] xen-netback: Introduce TX grant map definitions
From: Zoltan Kiss @ 2014-01-14 20:39 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss
In-Reply-To: <1389731995-9887-1-git-send-email-zoltan.kiss@citrix.com>

This patch contains the new definitions necessary for grant mapping.

v2:
- move unmapping to separate thread. The NAPI instance has to be scheduled
  even from thread context, which can cause huge delays
- that causes unfortunately bigger struct xenvif
- store grant handle after checking validity

v3:
- fix comment in xenvif_tx_dealloc_action()
- call unmap hypercall directly instead of gnttab_unmap_refs(), which does
  unnecessary m2p_override. Also remove pages_to_[un]map members
- BUG() if grant_tx_handle corrupted

v4:
- fix indentations and comments
- use bool for tx_dealloc_work_todo
- BUG() if grant_tx_handle corrupted - now really :)
- go back to gnttab_unmap_refs, now we rely on API changes

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>

---
 drivers/net/xen-netback/common.h    |   30 +++++-
 drivers/net/xen-netback/interface.c |    1 +
 drivers/net/xen-netback/netback.c   |  171 +++++++++++++++++++++++++++++++++++
 3 files changed, 201 insertions(+), 1 deletion(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index c955fc3..3e5ca11 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -79,6 +79,11 @@ struct pending_tx_info {
 				  * if it is head of one or more tx
 				  * reqs
 				  */
+	/* callback data for released SKBs. The	callback is always
+	 * xenvif_zerocopy_callback, ctx points to the next fragment, desc
+	 * contains the pending_idx
+	 */
+	struct ubuf_info callback_struct;
 };
 
 #define XEN_NETIF_TX_RING_SIZE __CONST_RING_SIZE(xen_netif_tx, PAGE_SIZE)
@@ -108,6 +113,8 @@ struct xenvif_rx_meta {
  */
 #define MAX_GRANT_COPY_OPS (MAX_SKB_FRAGS * XEN_NETIF_RX_RING_SIZE)
 
+#define NETBACK_INVALID_HANDLE -1
+
 struct xenvif {
 	/* Unique identifier for this interface. */
 	domid_t          domid;
@@ -126,13 +133,26 @@ struct xenvif {
 	pending_ring_idx_t pending_cons;
 	u16 pending_ring[MAX_PENDING_REQS];
 	struct pending_tx_info pending_tx_info[MAX_PENDING_REQS];
+	grant_handle_t grant_tx_handle[MAX_PENDING_REQS];
 
 	/* Coalescing tx requests before copying makes number of grant
 	 * copy ops greater or equal to number of slots required. In
 	 * worst case a tx request consumes 2 gnttab_copy.
 	 */
 	struct gnttab_copy tx_copy_ops[2*MAX_PENDING_REQS];
-
+	struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS];
+	struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS];
+	/* passed to gnttab_[un]map_refs with pages under (un)mapping */
+	struct page *pages_to_map[MAX_PENDING_REQS];
+	struct page *pages_to_unmap[MAX_PENDING_REQS];
+
+	spinlock_t dealloc_lock;
+	spinlock_t response_lock;
+	pending_ring_idx_t dealloc_prod;
+	pending_ring_idx_t dealloc_cons;
+	u16 dealloc_ring[MAX_PENDING_REQS];
+	struct task_struct *dealloc_task;
+	wait_queue_head_t dealloc_wq;
 
 	/* Use kthread for guest RX */
 	struct task_struct *task;
@@ -222,6 +242,8 @@ int xenvif_tx_action(struct xenvif *vif, int budget);
 int xenvif_kthread(void *data);
 void xenvif_kick_thread(struct xenvif *vif);
 
+int xenvif_dealloc_kthread(void *data);
+
 /* Determine whether the needed number of slots (req) are available,
  * and set req_event if not.
  */
@@ -229,6 +251,12 @@ bool xenvif_rx_ring_slots_available(struct xenvif *vif, int needed);
 
 void xenvif_stop_queue(struct xenvif *vif);
 
+/* Callback from stack when TX packet can be released */
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success);
+
+/* Unmap a pending page, usually has to be called before xenvif_idx_release */
+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx);
+
 extern bool separate_tx_rx_irq;
 
 #endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index b9de31e..a7855b3 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -38,6 +38,7 @@
 
 #include <xen/events.h>
 #include <asm/xen/hypercall.h>
+#include <xen/balloon.h>
 
 #define XENVIF_QUEUE_LENGTH 32
 #define XENVIF_NAPI_WEIGHT  64
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 4f81ac0..b84d2b8 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -772,6 +772,21 @@ static struct page *xenvif_alloc_page(struct xenvif *vif,
 	return page;
 }
 
+static inline void xenvif_tx_create_gop(struct xenvif *vif,
+					u16 pending_idx,
+					struct xen_netif_tx_request *txp,
+					struct gnttab_map_grant_ref *gop)
+{
+	vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx];
+	gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx),
+			  GNTMAP_host_map | GNTMAP_readonly,
+			  txp->gref, vif->domid);
+
+	memcpy(&vif->pending_tx_info[pending_idx].req, txp,
+	       sizeof(*txp));
+
+}
+
 static struct gnttab_copy *xenvif_get_requests(struct xenvif *vif,
 					       struct sk_buff *skb,
 					       struct xen_netif_tx_request *txp,
@@ -1611,6 +1626,107 @@ static int xenvif_tx_submit(struct xenvif *vif)
 	return work_done;
 }
 
+void xenvif_zerocopy_callback(struct ubuf_info *ubuf, bool zerocopy_success)
+{
+	unsigned long flags;
+	pending_ring_idx_t index;
+	u16 pending_idx = ubuf->desc;
+	struct pending_tx_info *temp =
+		container_of(ubuf, struct pending_tx_info, callback_struct);
+	struct xenvif *vif = container_of(temp - pending_idx,
+					  struct xenvif,
+					  pending_tx_info[0]);
+
+	spin_lock_irqsave(&vif->dealloc_lock, flags);
+	do {
+		pending_idx = ubuf->desc;
+		ubuf = (struct ubuf_info *) ubuf->ctx;
+		index = pending_index(vif->dealloc_prod);
+		vif->dealloc_ring[index] = pending_idx;
+		/* Sync with xenvif_tx_dealloc_action:
+		 * insert idx then incr producer.
+		 */
+		smp_wmb();
+		vif->dealloc_prod++;
+	} while (ubuf);
+	wake_up(&vif->dealloc_wq);
+	spin_unlock_irqrestore(&vif->dealloc_lock, flags);
+}
+
+static inline void xenvif_tx_dealloc_action(struct xenvif *vif)
+{
+	struct gnttab_unmap_grant_ref *gop;
+	pending_ring_idx_t dc, dp;
+	u16 pending_idx, pending_idx_release[MAX_PENDING_REQS];
+	unsigned int i = 0;
+
+	dc = vif->dealloc_cons;
+	gop = vif->tx_unmap_ops;
+
+	/* Free up any grants we have finished using */
+	do {
+		dp = vif->dealloc_prod;
+
+		/* Ensure we see all indices enqueued by all
+		 * xenvif_zerocopy_callback().
+		 */
+		smp_rmb();
+
+		while (dc != dp) {
+			pending_idx =
+				vif->dealloc_ring[pending_index(dc++)];
+
+			/* Already unmapped? */
+			if (vif->grant_tx_handle[pending_idx] ==
+				NETBACK_INVALID_HANDLE) {
+				netdev_err(vif->dev,
+					   "Trying to unmap invalid handle! "
+					   "pending_idx: %x\n", pending_idx);
+				BUG();
+			}
+
+			pending_idx_release[gop-vif->tx_unmap_ops] =
+				pending_idx;
+			vif->pages_to_unmap[gop-vif->tx_unmap_ops] =
+				vif->mmap_pages[pending_idx];
+			gnttab_set_unmap_op(gop,
+					    idx_to_kaddr(vif, pending_idx),
+					    GNTMAP_host_map,
+					    vif->grant_tx_handle[pending_idx]);
+			vif->grant_tx_handle[pending_idx] =
+				NETBACK_INVALID_HANDLE;
+			++gop;
+		}
+
+	} while (dp != vif->dealloc_prod);
+
+	vif->dealloc_cons = dc;
+
+	if (gop - vif->tx_unmap_ops > 0) {
+		int ret;
+		ret = gnttab_unmap_refs(vif->tx_unmap_ops,
+					vif->pages_to_unmap,
+					gop - vif->tx_unmap_ops);
+		if (ret) {
+			netdev_err(vif->dev, "Unmap fail: nr_ops %x ret %d\n",
+				   gop - vif->tx_unmap_ops, ret);
+			for (i = 0; i < gop - vif->tx_unmap_ops; ++i) {
+				netdev_err(vif->dev,
+					   " host_addr: %llx handle: %x status: %d\n",
+					   gop[i].host_addr,
+					   gop[i].handle,
+					   gop[i].status);
+			}
+			BUG();
+		}
+	}
+
+	for (i = 0; i < gop - vif->tx_unmap_ops; ++i)
+		xenvif_idx_release(vif, pending_idx_release[i],
+				   XEN_NETIF_RSP_OKAY);
+}
+
+
 /* Called after netfront has transmitted */
 int xenvif_tx_action(struct xenvif *vif, int budget)
 {
@@ -1677,6 +1793,31 @@ static void xenvif_idx_release(struct xenvif *vif, u16 pending_idx,
 	vif->mmap_pages[pending_idx] = NULL;
 }
 
+void xenvif_idx_unmap(struct xenvif *vif, u16 pending_idx)
+{
+	int ret;
+	struct gnttab_unmap_grant_ref tx_unmap_op;
+
+	if (vif->grant_tx_handle[pending_idx] == NETBACK_INVALID_HANDLE) {
+		netdev_err(vif->dev,
+			   "Trying to unmap invalid handle! pending_idx: %x\n",
+			   pending_idx);
+		return;
+	}
+	gnttab_set_unmap_op(&tx_unmap_op,
+			    idx_to_kaddr(vif, pending_idx),
+			    GNTMAP_host_map,
+			    vif->grant_tx_handle[pending_idx]);
+	ret = gnttab_unmap_refs(&tx_unmap_op,
+				&vif->mmap_pages[pending_idx],
+				1);
+
+	ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+					&tx_unmap_op,
+					1);
+	BUG_ON(ret);
+	vif->grant_tx_handle[pending_idx] = NETBACK_INVALID_HANDLE;
+}
 
 static void make_tx_response(struct xenvif *vif,
 			     struct xen_netif_tx_request *txp,
@@ -1738,6 +1879,14 @@ static inline int tx_work_todo(struct xenvif *vif)
 	return 0;
 }
 
+static inline bool tx_dealloc_work_todo(struct xenvif *vif)
+{
+	if (vif->dealloc_cons != vif->dealloc_prod)
+		return true;
+
+	return false;
+}
+
 void xenvif_unmap_frontend_rings(struct xenvif *vif)
 {
 	if (vif->tx.sring)
@@ -1826,6 +1975,28 @@ int xenvif_kthread(void *data)
 	return 0;
 }
 
+int xenvif_dealloc_kthread(void *data)
+{
+	struct xenvif *vif = data;
+
+	while (!kthread_should_stop()) {
+		wait_event_interruptible(vif->dealloc_wq,
+					 tx_dealloc_work_todo(vif) ||
+					 kthread_should_stop());
+		if (kthread_should_stop())
+			break;
+
+		xenvif_tx_dealloc_action(vif);
+		cond_resched();
+	}
+
+	/* Unmap anything remaining*/
+	if (tx_dealloc_work_todo(vif))
+		xenvif_tx_dealloc_action(vif);
+
+	return 0;
+}
+
 static int __init netback_init(void)
 {
 	int rc = 0;

^ permalink raw reply related

* [PATCH net-next v4 0/9] xen-netback: TX grant mapping with SKBTX_DEV_ZEROCOPY instead of copy
From: Zoltan Kiss @ 2014-01-14 20:39 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss

A long known problem of the upstream netback implementation that on the TX
path (from guest to Dom0) it copies the whole packet from guest memory into
Dom0. That simply became a bottleneck with 10Gb NICs, and generally it's a
huge perfomance penalty. The classic kernel version of netback used grant
mapping, and to get notified when the page can be unmapped, it used page
destructors. Unfortunately that destructor is not an upstreamable solution.
Ian Campbell's skb fragment destructor patch series [1] tried to solve this
problem, however it seems to be very invasive on the network stack's code,
and therefore haven't progressed very well.
This patch series use SKBTX_DEV_ZEROCOPY flags to tell the stack it needs to
know when the skb is freed up. That is the way KVM solved the same problem,
and based on my initial tests it can do the same for us. Avoiding the extra
copy boosted up TX throughput from 6.8 Gbps to 7.9 (I used a slower
Interlagos box, both Dom0 and guest on upstream kernel, on the same NUMA node,
running iperf 2.0.5, and the remote end was a bare metal box on the same 10Gb
switch)
Based on my investigations the packet get only copied if it is delivered to
Dom0 stack, which is due to this [2] patch. That's a bit unfortunate, but
luckily it doesn't cause a major regression for this usecase. In the future
we should try to eliminate that copy somehow.
There are a few spinoff tasks which will be addressed in separate patches:
- grant copy the header directly instead of map and memcpy. This should help
  us avoiding TLB flushing
- use something else than ballooned pages
- fix grant map to use page->index properly
I will run some more extensive tests, but some basic XenRT tests were already
passed with good results.
I've tried to broke it down to smaller patches, with mixed results, so I
welcome suggestions on that part as well:
1: Introduce TX grant map definitions
2: Change TX path from grant copy to mapping
3: Remove old TX grant copy definitons and fix indentations
4: Change RX path for mapped SKB fragments
5: Add stat counters for zerocopy
6: Handle guests with too many frags
7: Add stat counters for frag_list skbs
8: Timeout packets in RX path
9: Aggregate TX unmap operations

v2: I've fixed some smaller things, see the individual patches. I've added a
few new stat counters, and handling the important use case when an older guest
sends lots of slots. Instead of delayed copy now we timeout packets on the RX
path, based on the assumption that otherwise packets should get stucked
anywhere else. Finally some unmap batching to avoid too much TLB flush

v3: Apart from fixing a few things mentioned in responses the important change
is the use the hypercall directly for grant [un]mapping, therefore we can
avoid m2p override.

v4: Now we are using a new grant mapping API to avoid m2p_override. The RX queue
timeout logic changed also.

[1] http://lwn.net/Articles/491522/
[2] https://lkml.org/lkml/2012/7/20/363

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>

^ permalink raw reply

* Re: pull-request: can-next 2014-01-11
From: Marc Kleine-Budde @ 2014-01-14 20:32 UTC (permalink / raw)
  To: Sergei Shtylyov, netdev
  Cc: David Miller, linux-can@vger.kernel.org, kernel@pengutronix.de
In-Reply-To: <52D59E97.9090109@pengutronix.de>

[-- Attachment #1: Type: text/plain, Size: 716 bytes --]

On 01/14/2014 09:31 PM, Marc Kleine-Budde wrote:
>>    What about the rewritten Renesas R-Car CAN driver? Is there a chance
>> to get it into 3.14-rc1 now that we've missed 3.13-rc1? I've sent it at
>> the very end of December, and have got no feedback still...
> 
> I was ill last week and as a customer today and tomorrow, I'll review
                          ^^
                          at
> your driver this week.


Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 242 bytes --]

^ permalink raw reply

* Re: pull-request: can-next 2014-01-11
From: Marc Kleine-Budde @ 2014-01-14 20:31 UTC (permalink / raw)
  To: Sergei Shtylyov, netdev
  Cc: David Miller, linux-can@vger.kernel.org, kernel@pengutronix.de
In-Reply-To: <52D597B4.7020607@cogentembedded.com>

[-- Attachment #1: Type: text/plain, Size: 1073 bytes --]

On 01/14/2014 09:01 PM, Sergei Shtylyov wrote:
> Hello.
> 
> On 01/11/2014 09:48 PM, Marc Kleine-Budde wrote:
> 
>> Hello David,
> 
>> this is a pull request of three patches for net-next/master.
> 
>> Oleg Moroz added support for a new PCI card to the generic SJA1000 PCI
>> driver, Guenter Roeck's patch limits the flexcan driver to little
>> endian arm (and powerpc) and I fixed a sparse warning found by the
>> kbuild robot in the ti_hecc driver.
> 
>    What about the rewritten Renesas R-Car CAN driver? Is there a chance
> to get it into 3.14-rc1 now that we've missed 3.13-rc1? I've sent it at
> the very end of December, and have got no feedback still...

I was ill last week and as a customer today and tomorrow, I'll review
your driver this week.

regards,
Marc

-- 
Pengutronix e.K.                  | Marc Kleine-Budde           |
Industrial Linux Solutions        | Phone: +49-231-2826-924     |
Vertretung West/Dortmund          | Fax:   +49-5121-206917-5555 |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 242 bytes --]

^ permalink raw reply

* Re: [PATCH RFC] reciprocal_divide: correction/update of the algorithm
From: Hannes Frederic Sowa @ 2014-01-14 20:10 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Austin S Hemmelgarn, netdev, dborkman, linux-kernel, darkjames-ws
In-Reply-To: <1389729032.31367.262.camel@edumazet-glaptop2.roam.corp.google.com>

On Tue, Jan 14, 2014 at 11:50:32AM -0800, Eric Dumazet wrote:
> On Tue, 2014-01-14 at 14:22 -0500, Austin S Hemmelgarn wrote:
> 
> > I disagree with the statement that current CPU's have reasonably fast
> > dividers.  A lot of embedded processors and many low-end x86 CPU's do
> > not in-fact have any hardware divider, and usually provide it using
> > microcode based emulation if they provide it at all.  The AMD Jaguar
> > micro-architecture in particular comes to mind, it uses an iterative
> > division algorithm provided by the microcode that only produces 2 bits
> > of quotient per cycle, even in the best case (2 8-bit integers and an
> > integral 8-bit quotient) this still takes 4 cycles, which is twice as
> > slow as any other math operation on the same processor.
> 
> I doubt you run any BPF filter with a divide instruction in it on these
> platform.
> 
> Get real, do not over optimize things where it does not matter.

If I read the instruction tables correctly, we could half the latency with
reciprocal divide even on haswell.

What a pitty that the Intel Architecture Code Analyzer does not support imul
nor div instruction. :(

^ permalink raw reply

* Re: pull-request: can-next 2014-01-11
From: Sergei Shtylyov @ 2014-01-14 20:01 UTC (permalink / raw)
  To: Marc Kleine-Budde, netdev
  Cc: David Miller, linux-can@vger.kernel.org, kernel@pengutronix.de
In-Reply-To: <52D191E2.8040703@pengutronix.de>

Hello.

On 01/11/2014 09:48 PM, Marc Kleine-Budde wrote:

> Hello David,

> this is a pull request of three patches for net-next/master.

> Oleg Moroz added support for a new PCI card to the generic SJA1000 PCI
> driver, Guenter Roeck's patch limits the flexcan driver to little
> endian arm (and powerpc) and I fixed a sparse warning found by the
> kbuild robot in the ti_hecc driver.

    What about the rewritten Renesas R-Car CAN driver? Is there a chance to 
get it into 3.14-rc1 now that we've missed 3.13-rc1? I've sent it at the very 
end of December, and have got no feedback still...

> regards,
> Marc

WBR, Sergei


^ permalink raw reply

* Re: [PATCH net-next v3 2/2] stmmac: Fix kernel crashes for jumbo frames
From: Ben Hutchings @ 2014-01-14 19:53 UTC (permalink / raw)
  To: Vince Bridgers
  Cc: devicetree, netdev, peppe.cavallaro, robh+dt, pawel.moll,
	mark.rutland, ijc+devicetree, galak, dinguyen, rayagond
In-Reply-To: <1389719859-29071-3-git-send-email-vbridgers2013@gmail.com>

On Tue, 2014-01-14 at 11:17 -0600, Vince Bridgers wrote:
> These changes correct the following issues with jumbo frames on the
> stmmac driver:
> 
> 1) The Synopsys EMAC can be configured to support different FIFO
> sizes at core configuration time. There's no way to query the
> controller and know the FIFO size, so the driver needs to get this
> information from the device tree in order to know how to correctly
> handle MTU changes and setting up dma buffers. The default
> max-frame-size is as currently used, which is the size of a jumbo
> frame.
> 
> 2) The driver was enabling Jumbo frames by default, but was not allocating
> dma buffers of sufficient size to handle the maximum possible packet
> size that could be received. This led to memory corruption since DMAs were
> occurring beyond the extent of the allocated receive buffers for certain types
> of network traffic.
[...]
> --- a/drivers/net/ethernet/stmicro/stmmac/common.h
> +++ b/drivers/net/ethernet/stmicro/stmmac/common.h
> @@ -293,6 +293,8 @@ struct dma_features {
>  #define STMMAC_CHAIN_MODE	0x1
>  #define STMMAC_RING_MODE	0x2
>  
> +#define JUMBO_LEN		9000
> +
>  struct stmmac_desc_ops {
>  	/* DMA RX descriptor ring initialization */
>  	void (*init_rx_desc) (struct dma_desc *p, int disable_rx_ic, int mode,
[...]
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
> @@ -51,6 +51,10 @@ static int stmmac_probe_config_dt(struct platform_device *pdev,
>  	plat->mdio_bus_data = devm_kzalloc(&pdev->dev,
>  					   sizeof(struct stmmac_mdio_bus_data),
>  					   GFP_KERNEL);
> +	/* Set the maxmtu to a default of 1500 in case the
> +	 * parameter is not present in the device tree
> +	 */
> +	plat->maxmtu = JUMBO_LEN;

The comment disagrees with the definition of JUMBO_LEN.

>  
>  	/*
>  	 * Currently only the properties needed on SPEAr600
> @@ -60,6 +64,7 @@ static int stmmac_probe_config_dt(struct platform_device *pdev,
>  	if (of_device_is_compatible(np, "st,spear600-gmac") ||
>  		of_device_is_compatible(np, "snps,dwmac-3.70a") ||
>  		of_device_is_compatible(np, "snps,dwmac")) {
> +		of_property_read_u32(np, "max-frame-size", &plat->maxmtu);
[...]

Is it the maximum frame size, including Ethernet header?  (And then, is
the CRC or any VLAN header included in the frame size?)
Or is it the MTU, excluding all of those?
You really need to be very clear about what this number represents.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [PATCH RFC] reciprocal_divide: correction/update of the algorithm
From: Eric Dumazet @ 2014-01-14 19:50 UTC (permalink / raw)
  To: Austin S Hemmelgarn
  Cc: Hannes Frederic Sowa, netdev, dborkman, linux-kernel,
	darkjames-ws
In-Reply-To: <52D58E6F.4050000@gmail.com>

On Tue, 2014-01-14 at 14:22 -0500, Austin S Hemmelgarn wrote:

> I disagree with the statement that current CPU's have reasonably fast
> dividers.  A lot of embedded processors and many low-end x86 CPU's do
> not in-fact have any hardware divider, and usually provide it using
> microcode based emulation if they provide it at all.  The AMD Jaguar
> micro-architecture in particular comes to mind, it uses an iterative
> division algorithm provided by the microcode that only produces 2 bits
> of quotient per cycle, even in the best case (2 8-bit integers and an
> integral 8-bit quotient) this still takes 4 cycles, which is twice as
> slow as any other math operation on the same processor.

I doubt you run any BPF filter with a divide instruction in it on these
platform.

Get real, do not over optimize things where it does not matter.

^ permalink raw reply

* [PATCH v2 net-next] stmmac: Add vlan rx for better GRO performance.
From: Vince Bridgers @ 2014-01-14 19:42 UTC (permalink / raw)
  To: netdev; +Cc: peppe.cavallaro, rayagond, vbridgers2013

GRO requires VLANs to be removed before aggregation can occur.
The Synopsys EMAC does not strip VLAN tags so this must be
done by the driver.

Signed-off-by: Vince Bridgers <vbridgers2013@gmail.com>
---
V2: remove inline on added function, line up broken lines per review comments
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |   19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index b8e3a4c..ecdc8ab 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -1951,6 +1951,23 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
+static void stmmac_rx_vlan(struct net_device *dev, struct sk_buff *skb)
+{
+	struct ethhdr *ehdr;
+	u16 vlanid;
+
+	if ((dev->features & NETIF_F_HW_VLAN_CTAG_RX) ==
+	    NETIF_F_HW_VLAN_CTAG_RX &&
+	    !__vlan_get_tag(skb, &vlanid)) {
+		/* pop the vlan tag */
+		ehdr = (struct ethhdr *)skb->data;
+		memmove(skb->data + VLAN_HLEN, ehdr, ETH_ALEN * 2);
+		skb_pull(skb, VLAN_HLEN);
+		__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlanid);
+	}
+}
+
+
 /**
  * stmmac_rx_refill: refill used skb preallocated buffers
  * @priv: driver private structure
@@ -2102,6 +2119,8 @@ static int stmmac_rx(struct stmmac_priv *priv, int limit)
 				print_pkt(skb->data, frame_len);
 			}
 
+			stmmac_rx_vlan(priv->dev, skb);
+
 			skb->protocol = eth_type_trans(skb, priv->dev);
 
 			if (unlikely(!coe))
-- 
1.7.9.5

^ permalink raw reply related

* [PATCH v2 2/2] net/mlx4_core: clean up srq_res_start_move_to()
From: Paul Bolle @ 2014-01-14 19:46 UTC (permalink / raw)
  To: Or Gerlitz, Jack Morgenstein, Rony Efraim, Hadar Hen Zion,
	David S. Miller
  Cc: netdev, linux-kernel
In-Reply-To: <20140114084050.70a612af@jpm-OptiPlex-GX620>

Building resource_tracker.o triggers a GCC warning:
    drivers/net/ethernet/mellanox/mlx4/resource_tracker.c: In function 'mlx4_HW2SW_SRQ_wrapper':
    drivers/net/ethernet/mellanox/mlx4/resource_tracker.c:3202:17: warning: 'srq' may be used uninitialized in this function [-Wmaybe-uninitialized]
      atomic_dec(&srq->mtt->ref_count);
                     ^

This is a false positive. But a cleanup of srq_res_start_move_to() can
help GCC here. The code currently uses a switch statement where a plain
if/else would do, since only two of the switch's four cases can ever
occur. Dropping that switch makes the warning go away.

While we're at it, add some missing braces, and convert state to the
correct type.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
---
v2: adjust to Jack's review.

 .../net/ethernet/mellanox/mlx4/resource_tracker.c  | 46 ++++++++--------------
 1 file changed, 16 insertions(+), 30 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 15cd659..4acd84c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -1371,7 +1371,7 @@ static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn,
 }
 
 static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
-				 enum res_cq_states state, struct res_srq **srq)
+				 enum res_srq_states state, struct res_srq **srq)
 {
 	struct mlx4_priv *priv = mlx4_priv(dev);
 	struct mlx4_resource_tracker *tracker = &priv->mfunc.master.res_tracker;
@@ -1380,39 +1380,25 @@ static int srq_res_start_move_to(struct mlx4_dev *dev, int slave, int index,
 
 	spin_lock_irq(mlx4_tlock(dev));
 	r = res_tracker_lookup(&tracker->res_tree[RES_SRQ], index);
-	if (!r)
+	if (!r) {
 		err = -ENOENT;
-	else if (r->com.owner != slave)
+	} else if (r->com.owner != slave) {
 		err = -EPERM;
-	else {
-		switch (state) {
-		case RES_SRQ_BUSY:
-			err = -EINVAL;
-			break;
-
-		case RES_SRQ_ALLOCATED:
-			if (r->com.state != RES_SRQ_HW)
-				err = -EINVAL;
-			else if (atomic_read(&r->ref_count))
-				err = -EBUSY;
-			break;
-
-		case RES_SRQ_HW:
-			if (r->com.state != RES_SRQ_ALLOCATED)
-				err = -EINVAL;
-			break;
-
-		default:
+	} else if (state == RES_SRQ_ALLOCATED) {
+		if (r->com.state != RES_SRQ_HW)
 			err = -EINVAL;
-		}
+		else if (atomic_read(&r->ref_count))
+			err = -EBUSY;
+	} else if (state != RES_SRQ_HW || r->com.state != RES_SRQ_ALLOCATED) {
+		err = -EINVAL;
+	}
 
-		if (!err) {
-			r->com.from_state = r->com.state;
-			r->com.to_state = state;
-			r->com.state = RES_SRQ_BUSY;
-			if (srq)
-				*srq = r;
-		}
+	if (!err) {
+		r->com.from_state = r->com.state;
+		r->com.to_state = state;
+		r->com.state = RES_SRQ_BUSY;
+		if (srq)
+			*srq = r;
 	}
 
 	spin_unlock_irq(mlx4_tlock(dev));
-- 
1.8.4.2

^ permalink raw reply related

* [PATCH v2 1/2] net/mlx4_core: clean up cq_res_start_move_to()
From: Paul Bolle @ 2014-01-14 19:45 UTC (permalink / raw)
  To: Or Gerlitz, Jack Morgenstein, Rony Efraim, Hadar Hen Zion,
	David S. Miller
  Cc: netdev, linux-kernel
In-Reply-To: <20140114084752.1db64b21@jpm-OptiPlex-GX620>

Building resource_tracker.o triggers a GCC warning:
    drivers/net/ethernet/mellanox/mlx4/resource_tracker.c: In function 'mlx4_HW2SW_CQ_wrapper':
    drivers/net/ethernet/mellanox/mlx4/resource_tracker.c:3019:16: warning: 'cq' may be used uninitialized in this function [-Wmaybe-uninitialized]
      atomic_dec(&cq->mtt->ref_count);
                    ^

This is a false positive. But a cleanup of cq_res_start_move_to() can
help GCC here. The code currently uses a switch statement where an
if/else construct would do too, since only two of the switch's four
cases can ever occur. Dropping that switch makes the warning go away.

While we're at it, add some missing braces.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
---
v2: adjust to Jack's review.

 .../net/ethernet/mellanox/mlx4/resource_tracker.c  | 52 ++++++++--------------
 1 file changed, 19 insertions(+), 33 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 2f3f2bc..15cd659 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -1340,43 +1340,29 @@ static int cq_res_start_move_to(struct mlx4_dev *dev, int slave, int cqn,
 
 	spin_lock_irq(mlx4_tlock(dev));
 	r = res_tracker_lookup(&tracker->res_tree[RES_CQ], cqn);
-	if (!r)
+	if (!r) {
 		err = -ENOENT;
-	else if (r->com.owner != slave)
+	} else if (r->com.owner != slave) {
 		err = -EPERM;
-	else {
-		switch (state) {
-		case RES_CQ_BUSY:
-			err = -EBUSY;
-			break;
-
-		case RES_CQ_ALLOCATED:
-			if (r->com.state != RES_CQ_HW)
-				err = -EINVAL;
-			else if (atomic_read(&r->ref_count))
-				err = -EBUSY;
-			else
-				err = 0;
-			break;
-
-		case RES_CQ_HW:
-			if (r->com.state != RES_CQ_ALLOCATED)
-				err = -EINVAL;
-			else
-				err = 0;
-			break;
-
-		default:
+	} else if (state == RES_CQ_ALLOCATED) {
+		if (r->com.state != RES_CQ_HW)
 			err = -EINVAL;
-		}
+		else if (atomic_read(&r->ref_count))
+			err = -EBUSY;
+		else
+			err = 0;
+	} else if (state != RES_CQ_HW || r->com.state != RES_CQ_ALLOCATED) {
+		err = -EINVAL;
+	} else {
+		err = 0;
+	}
 
-		if (!err) {
-			r->com.from_state = r->com.state;
-			r->com.to_state = state;
-			r->com.state = RES_CQ_BUSY;
-			if (cq)
-				*cq = r;
-		}
+	if (!err) {
+		r->com.from_state = r->com.state;
+		r->com.to_state = state;
+		r->com.state = RES_CQ_BUSY;
+		if (cq)
+			*cq = r;
 	}
 
 	spin_unlock_irq(mlx4_tlock(dev));
-- 
1.8.4.2

^ permalink raw reply related

* Re: [RFC] sysfs_rename_link() and its usage
From: Greg KH @ 2014-01-14 19:31 UTC (permalink / raw)
  To: Veaceslav Falico; +Cc: linux-kernel, netdev, ebiederm
In-Reply-To: <20140114191208.GA9942@redhat.com>

On Tue, Jan 14, 2014 at 08:12:08PM +0100, Veaceslav Falico wrote:
> On Tue, Jan 14, 2014 at 10:21:35AM -0800, Greg KH wrote:
> >On Tue, Jan 14, 2014 at 06:17:40PM +0100, Veaceslav Falico wrote:
> >>Hi,
> >>
> >>I'm hitting a strange issue and/or I'm completely lost in sysfs internals.
> >>
> >>Consider having two net_device *a, *b; which are registered normally.
> >>Now, to create a link from /sys/class/net/a->name/linkname to b, one should
> >>use:
> >>
> >>sysfs_create_link(&(a->dev.kobj), &(b->dev.kobj), linkname);
> >>
> >>To remove it, even simpler:
> >>
> >>sysfs_remove_link(&(a->dev.kobj), linkname);
> >>
> >>This works like a charm. However, if I want to use (obviously, with the
> >>symlink present):
> >>
> >>sysfs_rename_link(&(a->dev.kobj), &(b->dev.kobj), oldname, newname);
> >
> >You forgot the namespace option to this call, what kernel version are
> >you using here?
> 
> It's git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next ,
> 3.13-rc6 with some networking patches on top of it.
> 
> And wrt namespace - there are two functions, one is sysfs_rename_link(),
> which calls the second one - sysfs_rename_link_ns() with NULL namespace.
> 
> >
> >>this fails with:
> >>
> >>"sysfs: ns invalid in 'a->name' for 'oldname'"
> >
> >Looks like the namespace for this link isn't valid.
> 
> Yep, though dunno why.

Are you testing this with network namespaces enabled?  Perhaps that is
why, you need to specify the namespace of the link that you are
changing.

The fact that the bridge link works is odd to me, I would think that it
too needs to specify the network namespace involved, but perhaps bridge
objects aren't part of any specific network namespace?  I don't know the
bridging code at all, sorry.

So try calling sysfs_rename_link_ns() and specify the namespace of the
kobject you are changing, and see if that works or not.

thanks,

greg k-h

^ permalink raw reply

* [net PATCH 1/1] qlge: Fix vlan netdev features.
From: Jitendra Kalsaria @ 2014-01-14 18:57 UTC (permalink / raw)
  To: davem
  Cc: netdev, ron.mercer, shahed.shaikh, Dept-HSGLinuxNICDev,
	Jitendra Kalsaria

From: Jitendra Kalsaria <jitendra.kalsaria@qlogic.com>

vlan gets the same netdev features except vlan filter.

Signed-off-by: Jitendra Kalsaria <jitendra.kalsaria@qlogic.com>
---
 drivers/net/ethernet/qlogic/qlge/qlge_main.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
index 449f506..f705aee 100644
--- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c
+++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c
@@ -4765,6 +4765,8 @@ static int qlge_probe(struct pci_dev *pdev,
 			    NETIF_F_RXCSUM;
 	ndev->features = ndev->hw_features;
 	ndev->vlan_features = ndev->hw_features;
+	/* vlan gets same features (except vlan filter) */
+	ndev->vlan_features &= ~NETIF_F_HW_VLAN_CTAG_FILTER;
 
 	if (test_bit(QL_DMA64, &qdev->flags))
 		ndev->features |= NETIF_F_HIGHDMA;
-- 
1.7.6.rc1.1.g2c162b

^ permalink raw reply related

* [PATCH net-next] xen-netback: Rework rx_work_todo
From: Zoltan Kiss @ 2014-01-14 19:28 UTC (permalink / raw)
  To: ian.campbell, wei.liu2, xen-devel, netdev, linux-kernel,
	jonathan.davies
  Cc: Zoltan Kiss

The recent patch to fix receive side flow control (11b57f) solved the spinning
thread problem, however caused an another one. The receive side can stall, if:
- xenvif_rx_action sets rx_queue_stopped to false
- interrupt happens, and sets rx_event to true
- then xenvif_kthread sets rx_event to false

Also, through rx_event a malicious guest can force the RX thread to spin. This
patch ditch that two variable, and rework rx_work_todo. If the thread finds it
can't fit more skb's into the ring, it saves the last slot estimation into
rx_last_skb_slots, otherwise it's kept as 0. Then rx_work_todo will check if:
- there is something to send to the ring
- there is space for the topmost packet in the queue

Signed-off-by: Zoltan Kiss <zoltan.kiss@citrix.com>
---
 drivers/net/xen-netback/common.h    |    6 +-----
 drivers/net/xen-netback/interface.c |    1 -
 drivers/net/xen-netback/netback.c   |   16 ++++++----------
 3 files changed, 7 insertions(+), 16 deletions(-)

diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 4c76bcb..ae413a2 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -143,11 +143,7 @@ struct xenvif {
 	char rx_irq_name[IFNAMSIZ+4]; /* DEVNAME-rx */
 	struct xen_netif_rx_back_ring rx;
 	struct sk_buff_head rx_queue;
-	bool rx_queue_stopped;
-	/* Set when the RX interrupt is triggered by the frontend.
-	 * The worker thread may need to wake the queue.
-	 */
-	bool rx_event;
+	RING_IDX rx_last_skb_slots;
 
 	/* This array is allocated seperately as it is large */
 	struct gnttab_copy *grant_copy_op;
diff --git a/drivers/net/xen-netback/interface.c b/drivers/net/xen-netback/interface.c
index b9de31e..7669d49 100644
--- a/drivers/net/xen-netback/interface.c
+++ b/drivers/net/xen-netback/interface.c
@@ -100,7 +100,6 @@ static irqreturn_t xenvif_rx_interrupt(int irq, void *dev_id)
 {
 	struct xenvif *vif = dev_id;
 
-	vif->rx_event = true;
 	xenvif_kick_thread(vif);
 
 	return IRQ_HANDLED;
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 2738563..bb241d0 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -477,7 +477,6 @@ static void xenvif_rx_action(struct xenvif *vif)
 	unsigned long offset;
 	struct skb_cb_overlay *sco;
 	bool need_to_notify = false;
-	bool ring_full = false;
 
 	struct netrx_pending_operations npo = {
 		.copy  = vif->grant_copy_op,
@@ -487,7 +486,7 @@ static void xenvif_rx_action(struct xenvif *vif)
 	skb_queue_head_init(&rxq);
 
 	while ((skb = skb_dequeue(&vif->rx_queue)) != NULL) {
-		int max_slots_needed;
+		RING_IDX max_slots_needed;
 		int i;
 
 		/* We need a cheap worse case estimate for the number of
@@ -510,9 +509,10 @@ static void xenvif_rx_action(struct xenvif *vif)
 		if (!xenvif_rx_ring_slots_available(vif, max_slots_needed)) {
 			skb_queue_head(&vif->rx_queue, skb);
 			need_to_notify = true;
-			ring_full = true;
+			vif->rx_last_skb_slots = max_slots_needed;
 			break;
-		}
+		} else
+			vif->rx_last_skb_slots = 0;
 
 		sco = (struct skb_cb_overlay *)skb->cb;
 		sco->meta_slots_used = xenvif_gop_skb(skb, &npo);
@@ -523,8 +523,6 @@ static void xenvif_rx_action(struct xenvif *vif)
 
 	BUG_ON(npo.meta_prod > ARRAY_SIZE(vif->meta));
 
-	vif->rx_queue_stopped = !npo.copy_prod && ring_full;
-
 	if (!npo.copy_prod)
 		goto done;
 
@@ -1727,8 +1725,8 @@ static struct xen_netif_rx_response *make_rx_response(struct xenvif *vif,
 
 static inline int rx_work_todo(struct xenvif *vif)
 {
-	return (!skb_queue_empty(&vif->rx_queue) && !vif->rx_queue_stopped) ||
-		vif->rx_event;
+	return !skb_queue_empty(&vif->rx_queue) &&
+	       xenvif_rx_ring_slots_available(vif, vif->rx_last_skb_slots);
 }
 
 static inline int tx_work_todo(struct xenvif *vif)
@@ -1814,8 +1812,6 @@ int xenvif_kthread(void *data)
 		if (!skb_queue_empty(&vif->rx_queue))
 			xenvif_rx_action(vif);
 
-		vif->rx_event = false;
-
 		if (skb_queue_empty(&vif->rx_queue) &&
 		    netif_queue_stopped(vif->dev))
 			xenvif_start_queue(vif);

^ permalink raw reply related

* Re: [PATCH RFC] reciprocal_divide: correction/update of the algorithm
From: Austin S Hemmelgarn @ 2014-01-14 19:22 UTC (permalink / raw)
  To: Eric Dumazet, Hannes Frederic Sowa
  Cc: netdev, dborkman, linux-kernel, darkjames-ws
In-Reply-To: <1389722825.31367.260.camel@edumazet-glaptop2.roam.corp.google.com>

On 2014-01-14 13:07, Eric Dumazet wrote:
> On Mon, 2014-01-13 at 22:42 +0100, Hannes Frederic Sowa wrote:
>> This patch is a RFC and part of a series Daniel Borkmann and me want to
>> do when introducing prandom_u32_range{,_ro} and prandom_u32_max{,_ro}
>> helpers later this week.
> 
>> -static inline u32 reciprocal_divide(u32 A, u32 R)
>> +struct reciprocal_value reciprocal_value(u32 d);
>> +
>> +static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
>>  {
>> -	return (u32)(((u64)A * R) >> 32);
>> +	u32 t = (u32)(((u64)a * R.m) >> 32);
>> +	return (t + ((a - t) >> R.sh1)) >> R.sh2;
>>  }
> 
> I would rather introduce new helpers and convert users that really need
> them.
> 
> For instance, just use a divide in BPF, because doing this on JIT might
> be too complex for the gains. Strangely, libpcap doesn't seem to
> optimize any divide, like divides by a power of two...
> 
> Reciprocal were added 7 years ago, for very specific uses, but current
> cpus have reasonably fast dividers.

I disagree with the statement that current CPU's have reasonably fast
dividers.  A lot of embedded processors and many low-end x86 CPU's do
not in-fact have any hardware divider, and usually provide it using
microcode based emulation if they provide it at all.  The AMD Jaguar
micro-architecture in particular comes to mind, it uses an iterative
division algorithm provided by the microcode that only produces 2 bits
of quotient per cycle, even in the best case (2 8-bit integers and an
integral 8-bit quotient) this still takes 4 cycles, which is twice as
slow as any other math operation on the same processor.

^ permalink raw reply

* Re: [PATCH net-next 2/2] net: rename sysfs symlinks on device name change
From: Veaceslav Falico @ 2014-01-14 19:14 UTC (permalink / raw)
  To: Cong Wang
  Cc: netdev, Ding Tianhong, David S. Miller, Eric Dumazet,
	Nicolas Dichtel
In-Reply-To: <CAHA+R7N=iFAGqoRuL8HD3SnkmH-dMPpUCkrJkLEGd1SPTbe4Kw@mail.gmail.com>

On Tue, Jan 14, 2014 at 10:30:56AM -0800, Cong Wang wrote:
>On Tue, Jan 14, 2014 at 8:36 AM, Veaceslav Falico <vfalico@redhat.com> wrote:
>> +void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
>> +{
>> +       struct netdev_adjacent *iter;
>> +
>> +       list_for_each_entry(iter, &dev->adj_list.upper, list) {
>> +               netdev_adjacent_sysfs_del(iter->dev, oldname,
>> +                                         &iter->dev->adj_list.lower);
>> +               netdev_adjacent_sysfs_add(iter->dev, dev,
>> +                                         &iter->dev->adj_list.lower);
>> +       }
>> +
>> +       list_for_each_entry(iter, &dev->adj_list.lower, list) {
>> +               netdev_adjacent_sysfs_del(iter->dev, oldname,
>> +                                         &iter->dev->adj_list.upper);
>> +               netdev_adjacent_sysfs_add(iter->dev, dev,
>> +                                         &iter->dev->adj_list.upper);
>> +       }
>> +}
>> +EXPORT_SYMBOL(netdev_adjacent_rename_links);
>
>Since it is only used within net/core/dev.c, why do you make it extern
>and export it? It can become static.

Hm, good point, exported it automatically, didn't realise that dev_rename()
is also in dev.c.

Will send v2.

^ permalink raw reply

* Re: [RFC] sysfs_rename_link() and its usage
From: Veaceslav Falico @ 2014-01-14 19:12 UTC (permalink / raw)
  To: Greg KH; +Cc: linux-kernel, netdev, ebiederm
In-Reply-To: <20140114182135.GA29296@kroah.com>

On Tue, Jan 14, 2014 at 10:21:35AM -0800, Greg KH wrote:
>On Tue, Jan 14, 2014 at 06:17:40PM +0100, Veaceslav Falico wrote:
>> Hi,
>>
>> I'm hitting a strange issue and/or I'm completely lost in sysfs internals.
>>
>> Consider having two net_device *a, *b; which are registered normally.
>> Now, to create a link from /sys/class/net/a->name/linkname to b, one should
>> use:
>>
>> sysfs_create_link(&(a->dev.kobj), &(b->dev.kobj), linkname);
>>
>> To remove it, even simpler:
>>
>> sysfs_remove_link(&(a->dev.kobj), linkname);
>>
>> This works like a charm. However, if I want to use (obviously, with the
>> symlink present):
>>
>> sysfs_rename_link(&(a->dev.kobj), &(b->dev.kobj), oldname, newname);
>
>You forgot the namespace option to this call, what kernel version are
>you using here?

It's git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next ,
3.13-rc6 with some networking patches on top of it.

And wrt namespace - there are two functions, one is sysfs_rename_link(),
which calls the second one - sysfs_rename_link_ns() with NULL namespace.

>
>> this fails with:
>>
>> "sysfs: ns invalid in 'a->name' for 'oldname'"
>
>Looks like the namespace for this link isn't valid.

Yep, though dunno why.

>
>> in
>>
>>  608 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
>> ...
>>  615         if (!!sysfs_ns_type(parent_sd) != !!ns) {
>>  616                 WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
>>  617                         sysfs_ns_type(parent_sd) ? "required" : "invalid",
>>  618                         parent_sd->s_name, name);
>>  619                 return NULL;
>>  620         }
>>
>> Code path:
>> warn_slowpath_fmt+0x46/0x50
>> sysfs_get_dirent_ns+0x30/0x80
>> sysfs_find_dirent+0x84/0x110
>> sysfs_get_dirent_ns+0x3e/0x80
>> sysfs_rename_link_ns+0x54/0xd0
>>
>> I have no idea what this code means. Is there any reason for it to
>> fail (i.e. am I doing something wrong?) or I've hit a bug?
>
>What exactly are you trying to do here?  Care to provide a pointer to
>your code somewhere?

I've tried to handle the network device renames, ended up doing it with
sysfs_remove/add_link() calls. Patches:

http://patchwork.ozlabs.org/patch/310798/
http://patchwork.ozlabs.org/patch/310796/

If an upper device (say, vlan, bonding, bridge etc.) enslaves another
device, symlinks are created (given bridge0 and eth0):

/sys/class/net/eth0/upper_bridge0 -> ../bridge0
/sys/class/net/bridge0/lower_eth0 -> ../eth0

However, in case someone renames eth0/bridge0, these symlinks should also
be renamed, and sysfs_rename_link() should fit great here. But it fails.

>
>> I've tested the only user of it (bridge) - and it works fine, however it's
>> not using its own net_device's kobject but rather its own dir.
>
>The driver core also uses this function, and it works there, so I'd
>blame your code :)

Yeah, I'd also like to blame it, I'm afraid of touching anything
sysfs-related :).

>
>thanks,
>
>greg k-h

^ permalink raw reply

* Re: [PATCHv2 net-next 3/4] flowcache: Fixup flow cache part in xfrm policy
From: Sabrina Dubroca @ 2014-01-14 18:59 UTC (permalink / raw)
  To: Fan Du; +Cc: steffen.klassert, davem, netdev
In-Reply-To: <1389663588-29678-4-git-send-email-fan.du@windriver.com>

2014-01-14, 09:39:46 +0800, Fan Du wrote:
> Bump flow cache genid, and flush flow cache should also be made
> in per net style.
> 
> Signed-off-by: Fan Du <fan.du@windriver.com>
> ---
>  net/xfrm/xfrm_policy.c |    7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
> index e205c4b..d39c90f 100644
> --- a/net/xfrm/xfrm_policy.c
> +++ b/net/xfrm/xfrm_policy.c
> @@ -661,7 +661,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
>  		hlist_add_head(&policy->bydst, chain);
>  	xfrm_pol_hold(policy);
>  	net->xfrm.policy_count[dir]++;
> -	atomic_inc(&flow_cache_genid);
> +	atomic_inc(&net->xfrm.flow_cache_genid);
>  
>  	/* After previous checking, family can either be AF_INET or AF_INET6 */
>  	if (policy->family == AF_INET)
> @@ -2567,14 +2567,14 @@ static void __xfrm_garbage_collect(struct net *net)
>  
>  void xfrm_garbage_collect(struct net *net)
>  {
> -	flow_cache_flush();
> +	flow_cache_flush(net);
>  	__xfrm_garbage_collect(net);
>  }
>  EXPORT_SYMBOL(xfrm_garbage_collect);
>  
>  static void xfrm_garbage_collect_deferred(struct net *net)
>  {
> -	flow_cache_flush_deferred();
> +	flow_cache_flush_deferred(net);
>  	__xfrm_garbage_collect(net);
>  }
>  
> @@ -2947,6 +2947,7 @@ static int __net_init xfrm_net_init(struct net *net)
>  	spin_lock_init(&net->xfrm.xfrm_policy_sk_bundle_lock);
>  	mutex_init(&net->xfrm.xfrm_cfg_mutex);
>  
> +	flow_cache_init(net);
>  	return 0;
>  
>  out_sysctl:


You didn't address Cong Wang's comments for v1:

2014-01-13, 11:42:47 -0800, Cong Wang wrote:
> On Sun, Jan 12, 2014 at 11:49 PM, Fan Du <fan.du@windriver.com> wrote:
> >  void xfrm_garbage_collect(struct net *net)
> >  {
> > -       flow_cache_flush();
> > +       flow_cache_flush(net);
> >         __xfrm_garbage_collect(net);
> >  }
> >  EXPORT_SYMBOL(xfrm_garbage_collect);
> >
> >  static void xfrm_garbage_collect_deferred(struct net *net)
> >  {
> > -       flow_cache_flush_deferred();
> > +       flow_cache_flush_deferred(net);
> >         __xfrm_garbage_collect(net);
> >  }
> >
> 
> You changed the prototypes of flow_cache_flush*() in the previous
> patch, so, here you break bisect. They have to be in one commit.


-- 
Sabrina

^ permalink raw reply

* Re: [Patch net-next 6/7] net_sched: cls: move allocation in ->init to generic layer
From: Cong Wang @ 2014-01-14 18:57 UTC (permalink / raw)
  To: David Miller
  Cc: Linux Kernel Network Developers, Thomas Graf, Jamal Hadi Salim
In-Reply-To: <20140113.114923.2164522715863402981.davem@davemloft.net>

On Mon, Jan 13, 2014 at 11:49 AM, David Miller <davem@davemloft.net> wrote:
>
> What I'm going to do for now is apply patches 1-5 and 7.
>

Sounds good to me. I will re-work on it.
Thanks!

^ permalink raw reply

* Re: [PATCH net-next v2] openvswitch: Pad OVS_PACKET_ATTR_PACKET if linear copy was performed
From: Zoltan Kiss @ 2014-01-14 18:56 UTC (permalink / raw)
  To: Thomas Graf
  Cc: Thomas Graf, Jesse Gross, David Miller, dev@openvswitch.org,
	netdev
In-Reply-To: <20140114162749.GC24121@casper.infradead.org>

It works for me, thanks!

Acked-by: Zoltan Kiss <zoltan.kiss@citrix.com>

On 14/01/14 16:27, Thomas Graf wrote:
> While the zerocopy method is correctly omitted if user space
> does not support unaligned Netlink messages. The attribute is
> still not padded correctly as skb_zerocopy() will not ensure
> padding and the attribute size is no longer pre calculated
> though nla_reserve() which ensured padding previously.
>
> This patch applies appropriate padding if a linear data copy
> was performed in skb_zerocopy().
>
> Signed-off-by: Thomas Graf <tgraf@suug.ch>
> ---
> v2: initialize padding to 0's
>
>   net/openvswitch/datapath.c | 7 ++++++-
>   1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index df46928..3ca9121 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -396,7 +396,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
>   		.dst_sk = ovs_dp_get_net(dp)->genl_sock,
>   		.snd_portid = upcall_info->portid,
>   	};
> -	size_t len;
> +	size_t len, plen;
>   	unsigned int hlen;
>   	int err, dp_ifindex;
>
> @@ -466,6 +466,11 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
>
>   	skb_zerocopy(user_skb, skb, skb->len, hlen);
>
> +	/* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */
> +	if (!(dp->user_features & OVS_DP_F_UNALIGNED) &&
> +	    (plen = (ALIGN(user_skb->len, NLA_ALIGNTO) - user_skb->len)) > 0)
> +		memset(skb_put(user_skb, plen), 0, plen);
> +
>   	((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len;
>
>   	err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid);
>

^ permalink raw reply

* Re: [Patch net-next 6/7] net_sched: cls: move allocation in ->init to generic layer
From: Cong Wang @ 2014-01-14 18:56 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Linux Kernel Network Developers, Thomas Graf, David S. Miller
In-Reply-To: <52D293A2.2070703@mojatatu.com>

On Sun, Jan 12, 2014 at 5:07 AM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> On 01/09/14 19:14, Cong Wang wrote:
>>
>> Most of the filters need allocation of tp->root in ->init()
>> and free it in ->destroy(), make this generic.
>>
>> Also we could reduce the use of tcf_tree_lock a bit.
>>
>> Cc: Thomas Graf <tgraf@suug.ch>
>> Cc: David S. Miller <davem@davemloft.net>
>> Cc: Jamal Hadi Salim <jhs@mojatatu.com>
>> Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
>
>
>
> Hrm. This one worries me a little.
> I dont see how just pre-allocing the private head of the classifier
> magically allows you to get rid of locks. Have you tested against those
> classifiers you changed?
> If those locks are useless - then that is a separate patch to kill
> them (sorry, dont have time to test myself right now).


Yeah, I am not sure either. I will re-think about this locking stuff.

^ permalink raw reply

* Re: [Patch net-next 0/7] net_sched: some more cleanup and improvements
From: Cong Wang @ 2014-01-14 18:54 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: Linux Kernel Network Developers, David S. Miller
In-Reply-To: <52D28B77.7030000@mojatatu.com>

On Sun, Jan 12, 2014 at 4:32 AM, Jamal Hadi Salim <jhs@mojatatu.com> wrote:
> On 01/09/14 19:13, Cong Wang wrote:
>>
>> This patchset collects the previous patches I sent which Jamal doesn't
>> object.
>> They are still some cleanup and improvements for tc actions and filters.
>>
>>
>
> Cong - I dont have time to test at the moment and like you said
> they dont look controversial; i will review them, please try to test
> them to the best you can.
>

I did and I will continue to test them in net-next.

Thanks for reviews!

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox