Netdev List
 help / color / mirror / Atom feed
* [PATCH v8 1/5] skbuff: return -EMSGSIZE in skb_to_sgvec to prevent overflow
From: Jason A. Donenfeld @ 2017-05-11 19:41 UTC (permalink / raw)
  To: netdev, linux-kernel, davem, kernel-hardening
  Cc: Jason A. Donenfeld, Steffen Klassert, Herbert Xu, David Howells,
	Sabrina Dubroca, Michael S. Tsirkin, Jason Wang
In-Reply-To: <20170511194134.31183-1-Jason@zx2c4.com>

This is a defense-in-depth measure in response to bugs like
4d6fa57b4dab ("macsec: avoid heap overflow in skb_to_sgvec"). There's
not only a potential overflow of sglist items, but also a stack overflow
potential, so we fix this by limiting the amount of recursion this function
is allowed to do. Not actually providing a bounded base case is a future
disaster that we can easily avoid here.

As a small matter of house keeping, we take this opportunity to move the
documentation comment over the actual function the documentation is for.

While this could be implemented by using an explicit stack of skbuffs,
when implementing this, the function complexity increased considerably,
and I don't think such complexity and bloat is actually worth it. So,
instead I built this and tested it on x86, x86_64, ARM, ARM64, and MIPS,
and measured the stack usage there. I also reverted the recent MIPS
changes that give it a separate IRQ stack, so that I could experience
some worst-case situations. I found that limiting it to 24 layers deep
yielded a good stack usage with room for safety, as well as being much
deeper than any driver actually ever creates.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: David Howells <dhowells@redhat.com>
Cc: Sabrina Dubroca <sd@queasysnail.net>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
---
 include/linux/skbuff.h |  8 +++----
 net/core/skbuff.c      | 65 ++++++++++++++++++++++++++++++++------------------
 2 files changed, 46 insertions(+), 27 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a098d95b3d84..f351df28942d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1001,10 +1001,10 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
 				     unsigned int headroom);
 struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom,
 				int newtailroom, gfp_t priority);
-int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
-			int offset, int len);
-int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset,
-		 int len);
+int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
+				     int offset, int len);
+int __must_check skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg,
+			      int offset, int len);
 int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer);
 int skb_pad(struct sk_buff *skb, int pad);
 #define dev_kfree_skb(a)	consume_skb(a)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 346d3e85dfbc..ec33811559e3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3482,24 +3482,18 @@ void __init skb_init(void)
 						NULL);
 }
 
-/**
- *	skb_to_sgvec - Fill a scatter-gather list from a socket buffer
- *	@skb: Socket buffer containing the buffers to be mapped
- *	@sg: The scatter-gather list to map into
- *	@offset: The offset into the buffer's contents to start mapping
- *	@len: Length of buffer space to be mapped
- *
- *	Fill the specified scatter-gather list with mappings/pointers into a
- *	region of the buffer space attached to a socket buffer.
- */
 static int
-__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+__skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len,
+	       unsigned int recursion_level)
 {
 	int start = skb_headlen(skb);
 	int i, copy = start - offset;
 	struct sk_buff *frag_iter;
 	int elt = 0;
 
+	if (unlikely(recursion_level >= 24))
+		return -EMSGSIZE;
+
 	if (copy > 0) {
 		if (copy > len)
 			copy = len;
@@ -3518,6 +3512,8 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
 		end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
 		if ((copy = end - offset) > 0) {
 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+			if (unlikely(elt && sg_is_last(&sg[elt - 1])))
+				return -EMSGSIZE;
 
 			if (copy > len)
 				copy = len;
@@ -3532,16 +3528,22 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
 	}
 
 	skb_walk_frags(skb, frag_iter) {
-		int end;
+		int end, ret;
 
 		WARN_ON(start > offset + len);
 
 		end = start + frag_iter->len;
 		if ((copy = end - offset) > 0) {
+			if (unlikely(elt && sg_is_last(&sg[elt - 1])))
+				return -EMSGSIZE;
+
 			if (copy > len)
 				copy = len;
-			elt += __skb_to_sgvec(frag_iter, sg+elt, offset - start,
-					      copy);
+			ret = __skb_to_sgvec(frag_iter, sg+elt, offset - start,
+					      copy, recursion_level + 1);
+			if (unlikely(ret < 0))
+				return ret;
+			elt += ret;
 			if ((len -= copy) == 0)
 				return elt;
 			offset += copy;
@@ -3552,6 +3554,31 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
 	return elt;
 }
 
+/**
+ *	skb_to_sgvec - Fill a scatter-gather list from a socket buffer
+ *	@skb: Socket buffer containing the buffers to be mapped
+ *	@sg: The scatter-gather list to map into
+ *	@offset: The offset into the buffer's contents to start mapping
+ *	@len: Length of buffer space to be mapped
+ *
+ *	Fill the specified scatter-gather list with mappings/pointers into a
+ *	region of the buffer space attached to a socket buffer. Returns either
+ *	the number of scatterlist items used, or -EMSGSIZE if the contents
+ *	could not fit.
+ */
+int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
+{
+	int nsg = __skb_to_sgvec(skb, sg, offset, len, 0);
+
+	if (nsg <= 0)
+		return nsg;
+
+	sg_mark_end(&sg[nsg - 1]);
+
+	return nsg;
+}
+EXPORT_SYMBOL_GPL(skb_to_sgvec);
+
 /* As compared with skb_to_sgvec, skb_to_sgvec_nomark only map skb to given
  * sglist without mark the sg which contain last skb data as the end.
  * So the caller can mannipulate sg list as will when padding new data after
@@ -3574,19 +3601,11 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
 int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
 			int offset, int len)
 {
-	return __skb_to_sgvec(skb, sg, offset, len);
+	return __skb_to_sgvec(skb, sg, offset, len, 0);
 }
 EXPORT_SYMBOL_GPL(skb_to_sgvec_nomark);
 
-int skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
-{
-	int nsg = __skb_to_sgvec(skb, sg, offset, len);
 
-	sg_mark_end(&sg[nsg - 1]);
-
-	return nsg;
-}
-EXPORT_SYMBOL_GPL(skb_to_sgvec);
 
 /**
  *	skb_cow_data - Check that a socket buffer's data buffers are writable
-- 
2.13.0

^ permalink raw reply related

* [PATCH v8 2/5] ipsec: check return value of skb_to_sgvec always
From: Jason A. Donenfeld @ 2017-05-11 19:41 UTC (permalink / raw)
  To: netdev, linux-kernel, davem, kernel-hardening
  Cc: Jason A. Donenfeld, Steffen Klassert, Herbert Xu
In-Reply-To: <20170511194134.31183-1-Jason@zx2c4.com>

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
---
 net/ipv4/ah4.c  |  8 ++++++--
 net/ipv4/esp4.c | 20 +++++++++++++-------
 net/ipv6/ah6.c  |  8 ++++++--
 net/ipv6/esp6.c | 20 +++++++++++++-------
 4 files changed, 38 insertions(+), 18 deletions(-)

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 22377c8ff14b..e8f862358518 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -220,7 +220,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags + sglists);
-	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out_free;
 
 	if (x->props.flags & XFRM_STATE_ESN) {
 		/* Attach seqhi sg right after packet payload */
@@ -393,7 +395,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	skb_push(skb, ihl);
 
 	sg_init_table(sg, nfrags + sglists);
-	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out_free;
 
 	if (x->props.flags & XFRM_STATE_ESN) {
 		/* Attach seqhi sg right after packet payload */
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 65cc02bd82bc..392432860bb9 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -374,9 +374,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
 	esp->esph = esph;
 
 	sg_init_table(sg, esp->nfrags);
-	skb_to_sgvec(skb, sg,
-		     (unsigned char *)esph - skb->data,
-		     assoclen + ivlen + esp->clen + alen);
+	err = skb_to_sgvec(skb, sg,
+		           (unsigned char *)esph - skb->data,
+		           assoclen + ivlen + esp->clen + alen);
+	if (unlikely(err < 0))
+		goto error;
 
 	if (!esp->inplace) {
 		int allocsize;
@@ -400,9 +402,11 @@ int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *
 		spin_unlock_bh(&x->lock);
 
 		sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
-		skb_to_sgvec(skb, dsg,
-			     (unsigned char *)esph - skb->data,
-			     assoclen + ivlen + esp->clen + alen);
+		err = skb_to_sgvec(skb, dsg,
+			           (unsigned char *)esph - skb->data,
+			           assoclen + ivlen + esp->clen + alen);
+		if (unlikely(err < 0))
+			goto error;
 	}
 
 	if ((x->props.flags & XFRM_STATE_ESN))
@@ -687,7 +691,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	esp_input_set_header(skb, seqhi);
 
 	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	err = skb_to_sgvec(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out;
 
 	skb->ip_summed = CHECKSUM_NONE;
 
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index dda6035e3b84..755f38271dd5 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -423,7 +423,9 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
 	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags + sglists);
-	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out_free;
 
 	if (x->props.flags & XFRM_STATE_ESN) {
 		/* Attach seqhi sg right after packet payload */
@@ -606,7 +608,9 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
 	ip6h->hop_limit   = 0;
 
 	sg_init_table(sg, nfrags + sglists);
-	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out_free;
 
 	if (x->props.flags & XFRM_STATE_ESN) {
 		/* Attach seqhi sg right after packet payload */
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 1fe99ba8066c..2ede4e459c4e 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -346,9 +346,11 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
 	esph = esp_output_set_esn(skb, x, ip_esp_hdr(skb), seqhi);
 
 	sg_init_table(sg, esp->nfrags);
-	skb_to_sgvec(skb, sg,
-		     (unsigned char *)esph - skb->data,
-		     assoclen + ivlen + esp->clen + alen);
+	err = skb_to_sgvec(skb, sg,
+		           (unsigned char *)esph - skb->data,
+		           assoclen + ivlen + esp->clen + alen);
+	if (unlikely(err < 0))
+		goto error;
 
 	if (!esp->inplace) {
 		int allocsize;
@@ -372,9 +374,11 @@ int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info
 		spin_unlock_bh(&x->lock);
 
 		sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
-		skb_to_sgvec(skb, dsg,
-			     (unsigned char *)esph - skb->data,
-			     assoclen + ivlen + esp->clen + alen);
+		err = skb_to_sgvec(skb, dsg,
+			           (unsigned char *)esph - skb->data,
+			           assoclen + ivlen + esp->clen + alen);
+		if (unlikely(err < 0))
+			goto error;
 	}
 
 	if ((x->props.flags & XFRM_STATE_ESN))
@@ -618,7 +622,9 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 	esp_input_set_header(skb, seqhi);
 
 	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	ret = skb_to_sgvec(skb, sg, 0, skb->len);
+	if (unlikely(ret < 0))
+		goto out;
 
 	skb->ip_summed = CHECKSUM_NONE;
 
-- 
2.13.0

^ permalink raw reply related

* [PATCH v8 3/5] rxrpc: check return value of skb_to_sgvec always
From: Jason A. Donenfeld @ 2017-05-11 19:41 UTC (permalink / raw)
  To: netdev, linux-kernel, davem, kernel-hardening
  Cc: Jason A. Donenfeld, David Howells
In-Reply-To: <20170511194134.31183-1-Jason@zx2c4.com>

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: David Howells <dhowells@redhat.com>
---
 net/rxrpc/rxkad.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 1bb9b2ccc267..ecab9334e3c1 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -227,7 +227,9 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
 	len &= ~(call->conn->size_align - 1);
 
 	sg_init_table(sg, nsg);
-	skb_to_sgvec(skb, sg, 0, len);
+	err = skb_to_sgvec(skb, sg, 0, len);
+	if (unlikely(err < 0))
+		goto out;
 	skcipher_request_set_crypt(req, sg, sg, len, iv.x);
 	crypto_skcipher_encrypt(req);
 
@@ -342,7 +344,8 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
 		goto nomem;
 
 	sg_init_table(sg, nsg);
-	skb_to_sgvec(skb, sg, offset, 8);
+	if (unlikely(skb_to_sgvec(skb, sg, offset, 8) < 0))
+		goto nomem;
 
 	/* start the decryption afresh */
 	memset(&iv, 0, sizeof(iv));
@@ -434,7 +437,11 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
 	}
 
 	sg_init_table(sg, nsg);
-	skb_to_sgvec(skb, sg, offset, len);
+	if (unlikely(skb_to_sgvec(skb, sg, offset, len) < 0)) {
+		if (sg != _sg)
+			kfree(sg);
+		goto nomem;
+	}
 
 	/* decrypt from the session key */
 	token = call->conn->params.key->payload.data[0];
-- 
2.13.0

^ permalink raw reply related

* [PATCH v8 4/5] macsec: check return value of skb_to_sgvec always
From: Jason A. Donenfeld @ 2017-05-11 19:41 UTC (permalink / raw)
  To: netdev, linux-kernel, davem, kernel-hardening
  Cc: Jason A. Donenfeld, Sabrina Dubroca
In-Reply-To: <20170511194134.31183-1-Jason@zx2c4.com>

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: Sabrina Dubroca <sd@queasysnail.net>
---
 drivers/net/macsec.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index cdc347be68f2..dfcb1e9d2ab2 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -742,7 +742,12 @@ static struct sk_buff *macsec_encrypt(struct sk_buff *skb,
 	macsec_fill_iv(iv, secy->sci, pn);
 
 	sg_init_table(sg, ret);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	ret = skb_to_sgvec(skb, sg, 0, skb->len);
+	if (unlikely(ret < 0)) {
+		macsec_txsa_put(tx_sa);
+		kfree_skb(skb);
+		return ERR_PTR(ret);
+	}
 
 	if (tx_sc->encrypt) {
 		int len = skb->len - macsec_hdr_len(sci_present) -
@@ -952,7 +957,11 @@ static struct sk_buff *macsec_decrypt(struct sk_buff *skb,
 	macsec_fill_iv(iv, sci, ntohl(hdr->packet_number));
 
 	sg_init_table(sg, ret);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	ret = skb_to_sgvec(skb, sg, 0, skb->len);
+	if (unlikely(ret < 0)) {
+		kfree_skb(skb);
+		return ERR_PTR(ret);
+	}
 
 	if (hdr->tci_an & MACSEC_TCI_E) {
 		/* confidentiality: ethernet + macsec header
-- 
2.13.0

^ permalink raw reply related

* [PATCH v8 5/5] virtio_net: check return value of skb_to_sgvec always
From: Jason A. Donenfeld @ 2017-05-11 19:41 UTC (permalink / raw)
  To: netdev, linux-kernel, davem, kernel-hardening
  Cc: Jason A. Donenfeld, Michael S. Tsirkin, Jason Wang
In-Reply-To: <20170511194134.31183-1-Jason@zx2c4.com>

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Cc: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 9320d96a1632..13fbe4b349c2 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1150,7 +1150,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
 	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
 	struct virtnet_info *vi = sq->vq->vdev->priv;
-	unsigned num_sg;
+	int num_sg;
 	unsigned hdr_len = vi->hdr_len;
 	bool can_push;
 
@@ -1177,11 +1177,16 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 	if (can_push) {
 		__skb_push(skb, hdr_len);
 		num_sg = skb_to_sgvec(skb, sq->sg, 0, skb->len);
+                if (unlikely(num_sg < 0))
+			return num_sg;
 		/* Pull header back to avoid skew in tx bytes calculations. */
 		__skb_pull(skb, hdr_len);
 	} else {
 		sg_set_buf(sq->sg, hdr, hdr_len);
-		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
+		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
+		if (unlikely(num_sg < 0))
+			return num_sg;
+		num_sg++;
 	}
 	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
 }
-- 
2.13.0

^ permalink raw reply related

* Re: Fw: [Bug 195713] New: TCP recv queue grows huge
From: Eric Dumazet @ 2017-05-11 19:42 UTC (permalink / raw)
  To: Michael Madsen; +Cc: Stephen Hemminger, Eric Dumazet, netdev
In-Reply-To: <c5a4858f-1559-7c75-b224-99391e6bb6c4@nabto.com>

On Thu, 2017-05-11 at 21:29 +0200, Michael Madsen wrote:
> 
> On 05/11/2017 07:06 PM, Eric Dumazet wrote:
> > On Thu, 2017-05-11 at 09:47 -0700, Stephen Hemminger wrote:
> >> Begin forwarded message:
> >>
> >> Date: Thu, 11 May 2017 13:25:23 +0000
> >> From: bugzilla-daemon@bugzilla.kernel.org
> >> To: stephen@networkplumber.org
> >> Subject: [Bug 195713] New: TCP recv queue grows huge
> >>
> >>
> >> https://bugzilla.kernel.org/show_bug.cgi?id=195713
> >>
> >>              Bug ID: 195713
> >>             Summary: TCP recv queue grows huge
> >>             Product: Networking
> >>             Version: 2.5
> >>      Kernel Version: 3.13.0 4.4.0 4.9.0
> >>            Hardware: All
> >>                  OS: Linux
> >>                Tree: Mainline
> >>              Status: NEW
> >>            Severity: normal
> >>            Priority: P1
> >>           Component: IPV4
> >>            Assignee: stephen@networkplumber.org
> >>            Reporter: mkm@nabto.com
> >>          Regression: No
> >>
> >> I was testing how TCP handled advertising reductions of the window sizes
> >> especially Window Full events. To create this setup I made a slow TCP receiver
> >> and a fast TCP sender. To add some reality to the scenario I simulated 10ms
> >> delay on the loopback device using the netem tc module.
> >>
> >> Steps to reproduce:
> >> Bevare these steps will use all the memory on your system
> >>
> >> 1. create latency on loopback
> >>> sudo tc qdisc change dev lo root netem delay 0ms
> >> 2. slow tcp receiver:
> >>> nc -l 4242 | pv -L 1k
> >> 3. fast tcp sender:
> >>> nc 127.0.0.1 4242 < /dev/zero
> >> What to expect:
> >> It is expected that the TCP recv queue is not groving unbounded e.g. the
> >> following output from netstat:
> >>
> >>> netstat -an | grep 4242
> >>> tcp   5563486      0 127.0.0.1:4242          127.0.0.1:59113
> >>> ESTABLISHED
> >>> tcp        0 3415559 127.0.0.1:59113         127.0.0.1:4242
> >>> ESTABLISHED
> >> What is seen:
> >>
> >> The TCP receive queue grows until there is no more memory available on the
> >> system.
> >>
> >>> netstat -an | grep 4242
> >>> tcp   223786525      0 127.0.0.1:4242          127.0.0.1:59114
> >>> ESTABLISHED
> >>> tcp        0   4191037 127.0.0.1:59114         127.0.0.1:4242
> >>> ESTABLISHED
> >> Note: After the TCP recv queue reaches ~ 2^31 bytes netstat reports a 0 which
> >> is not correct, it has probably not been created with this bug in mind.
> >>
> >> Systems on which the bug reproducible:
> >>
> >>    * debian testing, kernel 4.9.0
> >>    * ubuntu 14.04, kernel 3.13.0
> >>    * ubuntu 16.04, kernel 4.4.0
> >>
> >> I have not testet on other systems than the above mentioned.
> >>
> >
> > Not reproducible on my test machine.
> >
> > Somehow some sysctl must have been set to an insane value by
> > mkm@nabto.com ?
> >
> > Please use/report ss -temoi instead of old netstat which does not
> > provide info.
> >
> > lpaa23:~# tc -s -d qd sh dev lo
> > qdisc netem 8002: root refcnt 2 limit 1000
> >   Sent 1153017 bytes 388 pkt (dropped 0, overlimits 0 requeues 0)
> >   backlog 0b 0p requeues 0
> >
> > lpaa23:~# ss -temoi dst :4242 or src :4242
> > State      Recv-Q Send-Q Local Address:Port                 Peer
> > Address:Port
> > ESTAB      0      3255206 127.0.0.1:35672                127.0.0.1:4242
> > timer:(persist,15sec,0) ino:3740676 sk:1 <->
> > 	 skmem:(r0,rb1060272,t0,tb4194304,f2650,w3319206,o0,bl0,d0) ts sack
> > cubic wscale:8,8 rto:230 backoff:7 rtt:20.879/26.142 mss:65483
> > rcvmss:536 advmss:65483 cwnd:19 ssthresh:19 bytes_acked:3258385
> > segs_out:86 segs_in:50 data_segs_out:68 send 476.7Mbps lastsnd:43940
> > lastrcv:163390 lastack:13500 pacing_rate 572.0Mbps delivery_rate
> > 11146.0Mbps busy:163390ms rwnd_limited:163380ms(100.0%) retrans:0/1
> > rcv_space:43690 notsent:3255206 minrtt:0.002
> > ESTAB      3022864 0      127.0.0.1:4242                 127.0.0.1:35672
> > ino:3703653 sk:2 <->
> > 	 skmem:(r3259664,rb3406910,t0,tb2626560,f752,w0,o0,bl0,d17) ts sack
> > cubic wscale:8,8 rto:210 rtt:0.019/0.009 ato:120 mss:21888 rcvmss:65483
> > advmss:65483 cwnd:10 bytes_received:3258384 segs_out:49 segs_in:86
> > data_segs_in:68 send 92160.0Mbps lastsnd:163390 lastrcv:43940
> > lastack:43940 rcv_rtt:0.239 rcv_space:61440 minrtt:0.019
> >
> >
> > lpaa23:~# uname -a
> > Linux lpaa23 4.11.0-smp-DEV #197 SMP @1494476384 x86_64 GNU/Linux
> >
> >
> >
> 
> I've made an error in the bugreport, sorry, the tc step should set a 
> nonzero delay e.g.
> tc qdisc change dev lo root netem delay 100ms
> 
> tc -s -d qd sh dev lo
> qdisc netem 8001: root refcnt 2 limit 1000 delay 100.0ms
>   Sent 2310729789 bytes 56051 pkt (dropped 0, overlimits 0 requeues 0)
>   backlog 0b 0p requeues 0
> 
> netstat -an | grep 4242
> tcp   1737737598      0 127.0.0.1:4242 127.0.0.1:47724         ESTABLISHED
> tcp        0 3734810 127.0.0.1:47724         127.0.0.1:4242 ESTABLISHED
> 
> ss -temoi dst :4242 or src :4242
> State      Recv-Q Send-Q Local Address:Port                 Peer 
> Address:Port
> ESTAB      1771226600 0      127.0.0.1:4242 
> 127.0.0.1:47724                 uid:1000 ino:248318 sk:21 <->
> skmem:(r4292138050,rb5633129,t40,tb2626560,f3006,w0,o0,bl0,d0) ts sack 
> cubic wscale:7,7 rto:600 rtt:200.15/100.075 ato:40 mss:21888 cwnd:10 
> bytes_received:1771576125 segs_out:13932 segs_in:27728 
> data_segs_in:27726 send 8.7Mbps lastsnd:132144 lastrcv:4 lastack:4852 
> pacing_rate 17.5Mbps rcv_rtt:202 rcv_space:188413 minrtt:200.15
> ESTAB      0      3866200 127.0.0.1:47724 
> 127.0.0.1:4242                  timer:(on,372ms,0) uid:1000 ino:246613 
> sk:22 <->
> skmem:(r0,rb1061808,t4,tb4194304,f267688,w3943000,o0,bl0,d0) ts sack 
> cubic wscale:7,7 rto:404 rtt:200.112/0.058 mss:65483 cwnd:89 
> bytes_acked:1769019586 segs_out:27732 segs_in:13913 data_segs_out:27730 
> send 233.0Mbps lastsnd:32 lastrcv:26247708 lastack:32 pacing_rate 
> 466.0Mbps unacked:44 rcv_space:43690 notsent:1047728 minrtt:200.011
> 
> uname -a
> Linux mkm 4.9.0-2-amd64 #1 SMP Debian 4.9.18-1 (2017-03-30) x86_64 GNU/Linux

Oh this is a bug in netem, using skb_orphan_partial() even for packets
that might loopback to this host.

I will send a fix, thanks for the report.

^ permalink raw reply

* Re: [PATCH v2 1/2] net: Added mtu parameter to dev_forward_skb calls
From: Stephen Hemminger @ 2017-05-11 19:44 UTC (permalink / raw)
  To: Fredrik Markström
  Cc: Eric Dumazet, Daniel Borkmann, netdev, bridge, linux-kernel,
	Alexei Starovoitov, David S. Miller
In-Reply-To: <CAKdL+dS_-UiHA5tr7zP-5LtwDd0aXpZq5tpMYf9kr7hu2SwdGQ@mail.gmail.com>

On Thu, 11 May 2017 21:10:11 +0200
Fredrik Markström <fredrik.markstrom@gmail.com> wrote:

> On Thu, May 11, 2017 at 6:01 PM, Stephen Hemminger
> <stephen@networkplumber.org> wrote:
> > On Thu, 11 May 2017 15:46:27 +0200
> > Fredrik Markstrom <fredrik.markstrom@gmail.com> wrote:
> >  
> >> From: Fredrik Markström <fredrik.markstrom@gmail.com>
> >>
> >> is_skb_forwardable() currently checks if the packet size is <= mtu of
> >> the receiving interface. This is not consistent with most of the hardware
> >> ethernet drivers that happily receives packets larger then MTU.  
> >
> > Wrong.  
> 
> What is "Wrong" ? I was initially skeptical to implement this patch,
> since it feels odd to have different MTU:s set on the two sides of a
> link. After consulting some IP people and the RFC:s I kind of changed
> my mind and thought I'd give it a shot. In the RFCs I couldn't find
> anything that defined when and when not a received packet should be
> dropped.
> 
> >
> > Hardware interfaces are free to drop any packet greater than MTU (actually MTU + VLAN).
> > The actual limit is a function of the hardware. Some hardware can only limit by
> > power of 2; some can only limit frames larger than 1500; some have no limiting at all.  
> 
> Agreed. The purpose of these patches is to be able to configure an
> veth interface to mimic these different behaviors. Non of the Ethernet
> interfaces I have access to drops packets due to them being larger
> then the configured MTU like veth does.
> 
> Being able to mimic real Ethernet hardware is useful when
> consolidating hardware using containers/namespaces.
> 
> In a reply to a comment from David Miller in my previous version of
> the patch I attached the example below to demonstrate the case in
> detail.
> 
> This works with all ethernet hardware setups I have access to:
> 

Why not just use an iptables rule to enforce what ever semantic you
want?

^ permalink raw reply

* Re: [PATCH v5 13/17] dt-bindings: qca7000: rename binding
From: Michael Heimpold @ 2017-05-11 19:48 UTC (permalink / raw)
  To: Stefan Wahren
  Cc: Rob Herring, David S. Miller, Mark Rutland, Greg Kroah-Hartman,
	Jiri Slaby, linux-serial-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1494406408-31760-14-git-send-email-stefan.wahren-eS4NqCHxEME@public.gmane.org>

Hi,

Am Mittwoch, 10. Mai 2017, 10:53:24 CEST schrieb Stefan Wahren:
> Before we can merge the QCA7000 UART binding the document needs to be
> renamed.
> 
> Signed-off-by: Stefan Wahren <stefan.wahren-eS4NqCHxEME@public.gmane.org>
> ---
>  .../devicetree/bindings/net/{qca-qca7000-spi.txt => qca-qca7000.txt}      |
> 0 1 file changed, 0 insertions(+), 0 deletions(-)
>  rename Documentation/devicetree/bindings/net/{qca-qca7000-spi.txt =>
> qca-qca7000.txt} (100%)
> 
> diff --git a/Documentation/devicetree/bindings/net/qca-qca7000-spi.txt
> b/Documentation/devicetree/bindings/net/qca-qca7000.txt similarity index
> 100%
> rename from Documentation/devicetree/bindings/net/qca-qca7000-spi.txt
> rename to Documentation/devicetree/bindings/net/qca-qca7000.txt

I was told here [1], that it is prefered when the filename matchs the
compatible string, so using "qca,qca7000.txt" should be better.

Regards,
Michael


[1]
https://www.spinics.net/lists/devicetree/msg124088.html
--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: Fw: [Bug 195713] New: TCP recv queue grows huge
From: Michael Madsen @ 2017-05-11 19:29 UTC (permalink / raw)
  To: Eric Dumazet, Stephen Hemminger; +Cc: Eric Dumazet, netdev
In-Reply-To: <1494522379.7796.112.camel@edumazet-glaptop3.roam.corp.google.com>



On 05/11/2017 07:06 PM, Eric Dumazet wrote:
> On Thu, 2017-05-11 at 09:47 -0700, Stephen Hemminger wrote:
>> Begin forwarded message:
>>
>> Date: Thu, 11 May 2017 13:25:23 +0000
>> From: bugzilla-daemon@bugzilla.kernel.org
>> To: stephen@networkplumber.org
>> Subject: [Bug 195713] New: TCP recv queue grows huge
>>
>>
>> https://bugzilla.kernel.org/show_bug.cgi?id=195713
>>
>>              Bug ID: 195713
>>             Summary: TCP recv queue grows huge
>>             Product: Networking
>>             Version: 2.5
>>      Kernel Version: 3.13.0 4.4.0 4.9.0
>>            Hardware: All
>>                  OS: Linux
>>                Tree: Mainline
>>              Status: NEW
>>            Severity: normal
>>            Priority: P1
>>           Component: IPV4
>>            Assignee: stephen@networkplumber.org
>>            Reporter: mkm@nabto.com
>>          Regression: No
>>
>> I was testing how TCP handled advertising reductions of the window sizes
>> especially Window Full events. To create this setup I made a slow TCP receiver
>> and a fast TCP sender. To add some reality to the scenario I simulated 10ms
>> delay on the loopback device using the netem tc module.
>>
>> Steps to reproduce:
>> Bevare these steps will use all the memory on your system
>>
>> 1. create latency on loopback
>>> sudo tc qdisc change dev lo root netem delay 0ms
>> 2. slow tcp receiver:
>>> nc -l 4242 | pv -L 1k
>> 3. fast tcp sender:
>>> nc 127.0.0.1 4242 < /dev/zero
>> What to expect:
>> It is expected that the TCP recv queue is not groving unbounded e.g. the
>> following output from netstat:
>>
>>> netstat -an | grep 4242
>>> tcp   5563486      0 127.0.0.1:4242          127.0.0.1:59113
>>> ESTABLISHED
>>> tcp        0 3415559 127.0.0.1:59113         127.0.0.1:4242
>>> ESTABLISHED
>> What is seen:
>>
>> The TCP receive queue grows until there is no more memory available on the
>> system.
>>
>>> netstat -an | grep 4242
>>> tcp   223786525      0 127.0.0.1:4242          127.0.0.1:59114
>>> ESTABLISHED
>>> tcp        0   4191037 127.0.0.1:59114         127.0.0.1:4242
>>> ESTABLISHED
>> Note: After the TCP recv queue reaches ~ 2^31 bytes netstat reports a 0 which
>> is not correct, it has probably not been created with this bug in mind.
>>
>> Systems on which the bug reproducible:
>>
>>    * debian testing, kernel 4.9.0
>>    * ubuntu 14.04, kernel 3.13.0
>>    * ubuntu 16.04, kernel 4.4.0
>>
>> I have not testet on other systems than the above mentioned.
>>
>
> Not reproducible on my test machine.
>
> Somehow some sysctl must have been set to an insane value by
> mkm@nabto.com ?
>
> Please use/report ss -temoi instead of old netstat which does not
> provide info.
>
> lpaa23:~# tc -s -d qd sh dev lo
> qdisc netem 8002: root refcnt 2 limit 1000
>   Sent 1153017 bytes 388 pkt (dropped 0, overlimits 0 requeues 0)
>   backlog 0b 0p requeues 0
>
> lpaa23:~# ss -temoi dst :4242 or src :4242
> State      Recv-Q Send-Q Local Address:Port                 Peer
> Address:Port
> ESTAB      0      3255206 127.0.0.1:35672                127.0.0.1:4242
> timer:(persist,15sec,0) ino:3740676 sk:1 <->
> 	 skmem:(r0,rb1060272,t0,tb4194304,f2650,w3319206,o0,bl0,d0) ts sack
> cubic wscale:8,8 rto:230 backoff:7 rtt:20.879/26.142 mss:65483
> rcvmss:536 advmss:65483 cwnd:19 ssthresh:19 bytes_acked:3258385
> segs_out:86 segs_in:50 data_segs_out:68 send 476.7Mbps lastsnd:43940
> lastrcv:163390 lastack:13500 pacing_rate 572.0Mbps delivery_rate
> 11146.0Mbps busy:163390ms rwnd_limited:163380ms(100.0%) retrans:0/1
> rcv_space:43690 notsent:3255206 minrtt:0.002
> ESTAB      3022864 0      127.0.0.1:4242                 127.0.0.1:35672
> ino:3703653 sk:2 <->
> 	 skmem:(r3259664,rb3406910,t0,tb2626560,f752,w0,o0,bl0,d17) ts sack
> cubic wscale:8,8 rto:210 rtt:0.019/0.009 ato:120 mss:21888 rcvmss:65483
> advmss:65483 cwnd:10 bytes_received:3258384 segs_out:49 segs_in:86
> data_segs_in:68 send 92160.0Mbps lastsnd:163390 lastrcv:43940
> lastack:43940 rcv_rtt:0.239 rcv_space:61440 minrtt:0.019
>
>
> lpaa23:~# uname -a
> Linux lpaa23 4.11.0-smp-DEV #197 SMP @1494476384 x86_64 GNU/Linux
>
>
>

I've made an error in the bugreport, sorry, the tc step should set a 
nonzero delay e.g.
tc qdisc change dev lo root netem delay 100ms

tc -s -d qd sh dev lo
qdisc netem 8001: root refcnt 2 limit 1000 delay 100.0ms
  Sent 2310729789 bytes 56051 pkt (dropped 0, overlimits 0 requeues 0)
  backlog 0b 0p requeues 0

netstat -an | grep 4242
tcp   1737737598      0 127.0.0.1:4242 127.0.0.1:47724         ESTABLISHED
tcp        0 3734810 127.0.0.1:47724         127.0.0.1:4242 ESTABLISHED

ss -temoi dst :4242 or src :4242
State      Recv-Q Send-Q Local Address:Port                 Peer 
Address:Port
ESTAB      1771226600 0      127.0.0.1:4242 
127.0.0.1:47724                 uid:1000 ino:248318 sk:21 <->
skmem:(r4292138050,rb5633129,t40,tb2626560,f3006,w0,o0,bl0,d0) ts sack 
cubic wscale:7,7 rto:600 rtt:200.15/100.075 ato:40 mss:21888 cwnd:10 
bytes_received:1771576125 segs_out:13932 segs_in:27728 
data_segs_in:27726 send 8.7Mbps lastsnd:132144 lastrcv:4 lastack:4852 
pacing_rate 17.5Mbps rcv_rtt:202 rcv_space:188413 minrtt:200.15
ESTAB      0      3866200 127.0.0.1:47724 
127.0.0.1:4242                  timer:(on,372ms,0) uid:1000 ino:246613 
sk:22 <->
skmem:(r0,rb1061808,t4,tb4194304,f267688,w3943000,o0,bl0,d0) ts sack 
cubic wscale:7,7 rto:404 rtt:200.112/0.058 mss:65483 cwnd:89 
bytes_acked:1769019586 segs_out:27732 segs_in:13913 data_segs_out:27730 
send 233.0Mbps lastsnd:32 lastrcv:26247708 lastack:32 pacing_rate 
466.0Mbps unacked:44 rcv_space:43690 notsent:1047728 minrtt:200.011

uname -a
Linux mkm 4.9.0-2-amd64 #1 SMP Debian 4.9.18-1 (2017-03-30) x86_64 GNU/Linux

^ permalink raw reply

* [PATCH net] samples/bpf: run cleanup routines when receiving SIGTERM
From: Andy Gospodarek @ 2017-05-11 19:52 UTC (permalink / raw)
  To: netdev; +Cc: shahid.habib, Andy Gospodarek

Shahid Habib noticed that when xdp1 was killed from a different console the xdp
program was not cleaned-up properly in the kernel and it continued to forward
traffic.

Most of the applications in samples/bpf cleanup properly, but only when getting
SIGINT.  Since kill defaults to using SIGTERM, add support to cleanup when the
application receives either SIGINT or SIGTERM.

Signed-off-by: Andy Gospodarek <andy@greyhouse.net>
Reported-by: Shahid Habib <shahid.habib@broadcom.com>
---
 samples/bpf/cookie_uid_helper_example.c | 4 +++-
 samples/bpf/offwaketime_user.c          | 1 +
 samples/bpf/sampleip_user.c             | 1 +
 samples/bpf/trace_event_user.c          | 1 +
 samples/bpf/tracex2_user.c              | 1 +
 samples/bpf/xdp1_user.c                 | 1 +
 samples/bpf/xdp_tx_iptunnel_user.c      | 1 +
 7 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c
index b08ab4e..9d751e2 100644
--- a/samples/bpf/cookie_uid_helper_example.c
+++ b/samples/bpf/cookie_uid_helper_example.c
@@ -306,7 +306,9 @@ int main(int argc, char *argv[])
 	prog_attach_iptables(argv[2]);
 	if (cfg_test_traffic) {
 		if (signal(SIGINT, finish) == SIG_ERR)
-			error(1, errno, "register handler failed");
+			error(1, errno, "register SIGINT handler failed");
+		if (signal(SIGTERM, finish) == SIG_ERR)
+			error(1, errno, "register SIGTERM handler failed");
 		while (!test_finish) {
 			print_table();
 			printf("\n");
diff --git a/samples/bpf/offwaketime_user.c b/samples/bpf/offwaketime_user.c
index 9cce2a6..512f87a 100644
--- a/samples/bpf/offwaketime_user.c
+++ b/samples/bpf/offwaketime_user.c
@@ -100,6 +100,7 @@ int main(int argc, char **argv)
 	setrlimit(RLIMIT_MEMLOCK, &r);
 
 	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
 
 	if (load_kallsyms()) {
 		printf("failed to process /proc/kallsyms\n");
diff --git a/samples/bpf/sampleip_user.c b/samples/bpf/sampleip_user.c
index be59d7d..4ed690b 100644
--- a/samples/bpf/sampleip_user.c
+++ b/samples/bpf/sampleip_user.c
@@ -180,6 +180,7 @@ int main(int argc, char **argv)
 		return 1;
 	}
 	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
 
 	/* do sampling */
 	printf("Sampling at %d Hertz for %d seconds. Ctrl-C also ends.\n",
diff --git a/samples/bpf/trace_event_user.c b/samples/bpf/trace_event_user.c
index 0c5561d..fa43364 100644
--- a/samples/bpf/trace_event_user.c
+++ b/samples/bpf/trace_event_user.c
@@ -192,6 +192,7 @@ int main(int argc, char **argv)
 	setrlimit(RLIMIT_MEMLOCK, &r);
 
 	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
 
 	if (load_kallsyms()) {
 		printf("failed to process /proc/kallsyms\n");
diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c
index 7fee0f1..7321a3f 100644
--- a/samples/bpf/tracex2_user.c
+++ b/samples/bpf/tracex2_user.c
@@ -127,6 +127,7 @@ int main(int ac, char **argv)
 	}
 
 	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
 
 	/* start 'ping' in the background to have some kfree_skb events */
 	f = popen("ping -c5 localhost", "r");
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
index 378850c..c5dc13f 100644
--- a/samples/bpf/xdp1_user.c
+++ b/samples/bpf/xdp1_user.c
@@ -102,6 +102,7 @@ int main(int argc, char **argv)
 	}
 
 	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
 
 	if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) {
 		printf("link set xdp fd failed\n");
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
index 92b8bde..cf09f08 100644
--- a/samples/bpf/xdp_tx_iptunnel_user.c
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -239,6 +239,7 @@ int main(int argc, char **argv)
 	}
 
 	signal(SIGINT, int_exit);
+	signal(SIGTERM, int_exit);
 
 	while (min_port <= max_port) {
 		vip.dport = htons(min_port++);
-- 
2.7.4

^ permalink raw reply related

* Re: [PATCH][V2] ethernet: aquantia: remove redundant checks on error status
From: Lino Sanfilippo @ 2017-05-11 19:55 UTC (permalink / raw)
  To: Colin King, Pavel Belous, David S . Miller, David VomLehn,
	Alexander Loktionov, Dmitry Bezrukov, Dmitrii Tarakanov, netdev
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <20170511182940.18774-1-colin.king@canonical.com>

Hi,

On 11.05.2017 20:29, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
> 
> The error status err is initialized as zero and then being checked
> several times to see if it is less than zero even when it has not
> been updated.  It may seem that the err should be assigned to the
> return code of the call to the various *offload_en_set calls and
> then we check for failure, however, these functions are void and
> never actually return any status.  
> 
> Since these error checks are redundant we can remove these
> as well as err and the error exit label err_exit.
> 
> Detected by CoverityScan, CID#1398313 and CID#1398306 ("Logically
> dead code")
> 
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

FWIW:


Reviewed-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>

Regards,
Lino

^ permalink raw reply

* Re: [PATCH][V2] ethernet: aquantia: remove redundant checks on error status
From: Pavel Belous @ 2017-05-11 20:02 UTC (permalink / raw)
  To: Colin King, David S . Miller, David VomLehn, Alexander Loktionov,
	Dmitry Bezrukov, Dmitrii Tarakanov, netdev
  Cc: kernel-janitors, linux-kernel
In-Reply-To: <20170511182940.18774-1-colin.king@canonical.com>



On 11.05.2017 21:29, Colin King wrote:
> From: Colin Ian King <colin.king@canonical.com>
>
> The error status err is initialized as zero and then being checked
> several times to see if it is less than zero even when it has not
> been updated.  It may seem that the err should be assigned to the
> return code of the call to the various *offload_en_set calls and
> then we check for failure, however, these functions are void and
> never actually return any status.
>
> Since these error checks are redundant we can remove these
> as well as err and the error exit label err_exit.
>
> Detected by CoverityScan, CID#1398313 and CID#1398306 ("Logically
> dead code")
>
> Signed-off-by: Colin Ian King <colin.king@canonical.com>

Acked-by: Pavel Belous <pavel.belous@aquantia.com>

Regards,
Pavel

^ permalink raw reply

* Advice on user space application integration with tc
From: Morgan Yang @ 2017-05-11 20:45 UTC (permalink / raw)
  To: netdev

Hi All:

I want to build a solution that leverages the filtering and actions of
tc in kernel space, but have the ability to hook  to a userspace
application that can additional packet processing (such as payload
masking). I'm curious what are the best ways to go about doing that? I
have been looking into tc-skbmod and tc-pedit, but as good as they
are, they would require newer kernels. I have also tried using tc to
mirror filterd packets to a dummy or tap interface, and have the
userspace application pick up there, but the performance has been
supar. I'm hoping to have a solution that avoids the extra mirroring.
Much Thanks
Morgan Yang

^ permalink raw reply

* Re: [PATCH] ip: mpls: fix printing of mpls labels
From: David Ahern @ 2017-05-11 20:52 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev, roopa
In-Reply-To: <20170511110941.344a3011@xeon-e3>

On 5/11/17 11:09 AM, Stephen Hemminger wrote:
> On Mon,  8 May 2017 23:04:13 -0700
> David Ahern <dsahern@gmail.com> wrote:
> 
>> If the kernel returns more labels than iproute2 expects, none of
>> the labels are printed and (null) is shown instead:
>>     $ ip -f mpls ro ls
>>     101 as to (null) via inet 172.16.2.2 dev virt12
>>     201 as to 202/203 via inet6 2001:db8:2::2 dev virt12
>>
>> Remove the use of MPLS_MAX_LABELS and rely on buffer length that is
>> passed to mpls_ntop. With this change ip can print the label stack
>> returned by the kernel up to 255 characters (limit is due to size of
>> buf passed in) which amounts to 31 labels with a separator.
>>
>> With this change the above is:
>>     $ ip/ip -f mpls ro ls
>>     101 as to 102/103/104/105/106/107/108/109/110 via inet 172.16.2.2 dev virt12
>>
>> Signed-off-by: David Ahern <dsahern@gmail.com>
> 
> Much better. Applied thanks.
> 

This is only one-half of the solution; the install side is harder. I'll
send something in the next few days.

^ permalink raw reply

* Re: [PATCH v2 5/7] bpf: Add verifier test case for alignment.
From: Alexander Alemayhu @ 2017-05-11 21:19 UTC (permalink / raw)
  To: David Miller; +Cc: daniel, ast, alexei.starovoitov, netdev
In-Reply-To: <20170511.120601.246881624526451669.davem@davemloft.net>

On Thu, May 11, 2017 at 12:06:01PM -0400, David Miller wrote:
> 
> +static int do_test_single(struct bpf_align_test *test)
> +{
> +	struct bpf_insn *prog = test->insns;
> +	int prog_type = test->prog_type;
> +	int prog_len, i;
> +	int fd_prog;
> +	int ret;
> +
> +	prog_len = probe_filter_length(prog);
> +	fd_prog = bpf_verify_program(prog_type ? : BPF_PROG_TYPE_SOCKET_FILTER,
> +				     prog, prog_len, 1, "GPL", 0,
> +				     bpf_vlog, sizeof(bpf_vlog));
> +	if (fd_prog < 0) {
> +		printf("Failed to load program.\n");
> +		printf("%s", bpf_vlog);
> +		ret = 1;
> +	} else {
> +		ret = 0;
> +		for (i = 0; i < MAX_MATCHES; i++) {
> +			const char *t, *m = test->matches[i];
> +
> +			if (!m)
> +				break;
> +			t = strstr(bpf_vlog, m);
> +			if (!t) {
> +				printf("Failed to find match: %s\n", m);
> +				ret = 1;
> +				printf("%s", bpf_vlog);
> +				break;
> +			}
> +		}
> +		/* printf("%s", bpf_vlog); */
Why is this commented out?

-- 
Mit freundlichen Grüßen

Alexander Alemayhu

^ permalink raw reply

* [net-dsa-mv88e6xxx] question about potential use of uninitialized variable
From: Gustavo A. R. Silva @ 2017-05-11 21:35 UTC (permalink / raw)
  To: Andrew Lunn, Vivien Didelot, Florian Fainelli; +Cc: netdev, linux-kernel


Hello everybody,

While looking into Coverity ID 1398130 I ran into the following piece  
of code at drivers/net/dsa/mv88e6xxx/chip.c:849:

  849static uint64_t _mv88e6xxx_get_ethtool_stat(struct mv88e6xxx_chip *chip,
  850                                            struct mv88e6xxx_hw_stat *s,
  851                                            int port, u16 bank1_select,
  852                                            u16 histogram)
  853{
  854        u32 low;
  855        u32 high = 0;
  856        u16 reg = 0;
  857        int err;
  858        u64 value;
  859
  860        switch (s->type) {
  861        case STATS_TYPE_PORT:
  862                err = mv88e6xxx_port_read(chip, port, s->reg, &reg);
  863                if (err)
  864                        return UINT64_MAX;
  865
  866                low = reg;
  867                if (s->sizeof_stat == 4) {
  868                        err = mv88e6xxx_port_read(chip, port,  
s->reg + 1, &reg);
  869                        if (err)
  870                                return UINT64_MAX;
  871                        high = reg;
  872                }
  873                break;
  874        case STATS_TYPE_BANK1:
  875                reg = bank1_select;
  876                /* fall through */
  877        case STATS_TYPE_BANK0:
  878                reg |= s->reg | histogram;
  879                mv88e6xxx_g1_stats_read(chip, reg, &low);
  880                if (s->sizeof_stat == 8)
  881                        mv88e6xxx_g1_stats_read(chip, reg + 1, &high);
  882        }
  883        value = (((u64)high) << 16) | low;
  884        return value;
  885}

My question here is if there is any chance for the execution path to  
directly jump from line 860 to line 883, hence ending up using the  
uninitialized variable _low_?

I'm trying to figure out if this is a false positive or something that  
needs to be fixed.

I'd really appreciate any comment on this.

Thank you!
--
Gustavo A. R. Silva

^ permalink raw reply

* Re: [PATCH v2 5/7] bpf: Add verifier test case for alignment.
From: David Miller @ 2017-05-11 21:53 UTC (permalink / raw)
  To: alexander; +Cc: daniel, ast, alexei.starovoitov, netdev
In-Reply-To: <20170511211945.GB20107@gmail.com>

From: Alexander Alemayhu <alexander@alemayhu.com>
Date: Thu, 11 May 2017 23:19:45 +0200

> Why is this commented out?

That was a debugging hack while developing this code, I'll
remove it.

Thanks.

^ permalink raw reply

* Re: [PATCH v5 17/17] net: qualcomm: add QCA7000 UART driver
From: Lino Sanfilippo @ 2017-05-11 21:58 UTC (permalink / raw)
  To: Stefan Wahren, Rob Herring, David S. Miller
  Cc: Mark Rutland, Greg Kroah-Hartman, Jiri Slaby,
	linux-serial-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	netdev-u79uwXL29TY76Z2rM5mHXA, devicetree-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <1494406408-31760-18-git-send-email-stefan.wahren-eS4NqCHxEME@public.gmane.org>

Hi,

On 10.05.2017 10:53, Stefan Wahren wrote:

> +static int qcauart_netdev_init(struct net_device *dev)
> +{
> +	struct qcauart *qca = netdev_priv(dev);
> +	size_t len;
> +
> +	/* Finish setting up the device info. */
> +	dev->mtu = QCAFRM_MAX_MTU;
> +	dev->type = ARPHRD_ETHER;
> +
> +	qca->rx_skb = netdev_alloc_skb_ip_align(qca->net_dev,
> +						qca->net_dev->mtu +
> +						VLAN_ETH_HLEN);
> +	if (!qca->rx_skb)
> +		return -ENOBUFS;
> +
> +	len = QCAFRM_HEADER_LEN + QCAFRM_MAX_LEN + QCAFRM_FOOTER_LEN;
> +	qca->tx_buffer = kmalloc(len, GFP_KERNEL);
> +	if (!qca->tx_buffer)
> +		return -ENOBUFS;

Freeing the rx_skb is missing here.

> +
> +static void qcauart_netdev_setup(struct net_device *dev)
> +{
> +	struct qcauart *qca;
> +
> +	dev->netdev_ops = &qcauart_netdev_ops;
> +	dev->watchdog_timeo = QCAUART_TX_TIMEOUT;
> +	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
> +	dev->tx_queue_len = 100;
> +
> +	/* MTU range: 46 - 1500 */
> +	dev->min_mtu = QCAFRM_MIN_MTU;
> +	dev->max_mtu = QCAFRM_MAX_MTU;
> +
> +	qca = netdev_priv(dev);
> +	memset(qca, 0, sizeof(struct qcauart));

Zeroing the private data is not necessary since it is already done
at device allocation.


> +
> +static int qca_uart_probe(struct serdev_device *serdev)
> +{
> +	struct net_device *qcauart_dev = alloc_etherdev(sizeof(struct qcauart));
> +	struct qcauart *qca;
> +	const char *mac;
> +	u32 speed = 115200;
> +	int ret;
> +
> +	if (!qcauart_dev)
> +		return -ENOMEM;
> +
> +	qcauart_netdev_setup(qcauart_dev);
> +
> +	qca = netdev_priv(qcauart_dev);
> +	if (!qca) {
> +		pr_err("qca_uart: Fail to retrieve private structure\n");
> +		ret = -ENOMEM;
> +		goto free;
> +	}
> +	qca->net_dev = qcauart_dev;
> +	qca->serdev = serdev;
> +	qcafrm_fsm_init_uart(&qca->frm_handle);
> +
> +	spin_lock_init(&qca->lock);
> +	INIT_WORK(&qca->tx_work, qcauart_transmit);
> +
> +	of_property_read_u32(serdev->dev.of_node, "current-speed", &speed);
> +
> +	mac = of_get_mac_address(serdev->dev.of_node);
> +
> +	if (mac)
> +		ether_addr_copy(qca->net_dev->dev_addr, mac);
> +
> +	if (!is_valid_ether_addr(qca->net_dev->dev_addr)) {
> +		eth_hw_addr_random(qca->net_dev);
> +		dev_info(&serdev->dev, "Using random MAC address: %pM\n",
> +			 qca->net_dev->dev_addr);
> +	}
> +
> +	netif_carrier_on(qca->net_dev);
> +	serdev_device_set_drvdata(serdev, qca);
> +	serdev_device_set_client_ops(serdev, &qca_serdev_ops);
> +
> +	ret = serdev_device_open(serdev);
> +	if (ret) {
> +		dev_err(&serdev->dev, "Unable to open device %s\n",
> +			qcauart_dev->name);
> +		goto free;
> +	}
> +
> +	speed = serdev_device_set_baudrate(serdev, speed);
> +	dev_info(&serdev->dev, "Using baudrate: %u\n", speed);
> +
> +	serdev_device_set_flow_control(serdev, false);
> +
> +	ret = register_netdev(qcauart_dev);
> +	if (ret) {
> +		dev_err(&serdev->dev, "Unable to register net device %s\n",
> +			qcauart_dev->name);

	serdev_device_close() ?

> +		goto free;
> +	}
> +
> +	return 0;
> +
> +free:
> +	free_netdev(qcauart_dev);
> +	return ret;
> +}
> +
> +static void qca_uart_remove(struct serdev_device *serdev)
> +{
> +	struct qcauart *qca = serdev_device_get_drvdata(serdev);

	Here you should make sure that the tx_work is not running/pending any
	more (e.g by calling cancel_delayed_work_sync()).

> +	/* Flush any pending characters in the driver. */
> +	serdev_device_close(serdev);
> +
> +	netif_carrier_off(qca->net_dev);
> +	unregister_netdev(qca->net_dev);
> +	free_netdev(qca->net_dev);
> +}

Regards,
Lino

--
To unsubscribe from this list: send the line "unsubscribe devicetree" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH net] netem: fix skb_orphan_partial()
From: Eric Dumazet @ 2017-05-11 22:24 UTC (permalink / raw)
  To: Michael Madsen, David Miller; +Cc: Stephen Hemminger, Eric Dumazet, netdev
In-Reply-To: <1494531770.7796.115.camel@edumazet-glaptop3.roam.corp.google.com>

From: Eric Dumazet <edumazet@google.com>

I should have known that lowering skb->truesize was dangerous :/

In case packets are not leaving the host via a standard Ethernet device,
but looped back to local sockets, bad things can happen, as reported
by Michael Madsen ( https://bugzilla.kernel.org/show_bug.cgi?id=195713 )

So instead of tweaking skb->truesize, lets change skb->destructor
and keep a reference on the owner socket via its sk_refcnt.

Fixes: f2f872f9272a ("netem: Introduce skb_orphan_partial() helper")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Michael Madsen <mkm@nabto.com>
---
 net/core/sock.c |   20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 79c6aee6af9b817bd7086f04ae8f46342a3bf4b6..e43e71d7856b385111cd4c4b1bd835a78c670c60 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1803,28 +1803,24 @@ EXPORT_SYMBOL(skb_set_owner_w);
  * delay queue. We want to allow the owner socket to send more
  * packets, as if they were already TX completed by a typical driver.
  * But we also want to keep skb->sk set because some packet schedulers
- * rely on it (sch_fq for example). So we set skb->truesize to a small
- * amount (1) and decrease sk_wmem_alloc accordingly.
+ * rely on it (sch_fq for example).
  */
 void skb_orphan_partial(struct sk_buff *skb)
 {
-	/* If this skb is a TCP pure ACK or already went here,
-	 * we have nothing to do. 2 is already a very small truesize.
-	 */
-	if (skb->truesize <= 2)
+	if (skb_is_tcp_pure_ack(skb))
 		return;
 
-	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
-	 * so we do not completely orphan skb, but transfert all
-	 * accounted bytes but one, to avoid unexpected reorders.
-	 */
 	if (skb->destructor == sock_wfree
 #ifdef CONFIG_INET
 	    || skb->destructor == tcp_wfree
 #endif
 		) {
-		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
-		skb->truesize = 1;
+		struct sock *sk = skb->sk;
+
+		if (atomic_inc_not_zero(&sk->sk_refcnt)) {
+			atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
+			skb->destructor = sock_efree;
+		}
 	} else {
 		skb_orphan(skb);
 	}

^ permalink raw reply related

* [PATCH] ravb: add wake-on-lan support via magic packet
From: Niklas Söderlund @ 2017-05-11 22:27 UTC (permalink / raw)
  To: Sergei Shtylyov; +Cc: netdev, linux-renesas-soc, Niklas Söderlund

WoL is enabled in the suspend callback by setting MagicPacket detection
and disabling all interrupts expect MagicPacket. In the resume path the
driver needs to reset the hardware to rearm the WoL logic, this prevents
the driver from simply restoring the registers and to take advantage of
that ravb was not suspended to reduce resume time. To reset the
hardware the driver closes the device, sets it in reset mode and reopens
the device just like it would do in a normal suspend/resume scenario
without WoL enabled, but it both closes and opens the device in the
resume callback since the device needs to be reset for WoL to work.

One quirk needed for WoL is that the module clock needs to be prevented
from being switched off by Runtime PM. To keep the clock alive the
suspend callback need to call clk_enable() directly to increase the
usage count of the clock. Then when Runtime PM decreases the clock usage
count it won't reach 0 and be switched off.

Signed-off-by: Niklas Söderlund <niklas.soderlund+renesas@ragnatech.se>
---

Tested on Salvator-X H3 and M3-W.

 drivers/net/ethernet/renesas/ravb.h      |   2 +
 drivers/net/ethernet/renesas/ravb_main.c | 108 +++++++++++++++++++++++++++++--
 2 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/renesas/ravb.h b/drivers/net/ethernet/renesas/ravb.h
index 0525bd696d5d02e5..96a27b00c90e212a 100644
--- a/drivers/net/ethernet/renesas/ravb.h
+++ b/drivers/net/ethernet/renesas/ravb.h
@@ -991,6 +991,7 @@ struct ravb_private {
 	struct net_device *ndev;
 	struct platform_device *pdev;
 	void __iomem *addr;
+	struct clk *clk;
 	struct mdiobb_ctrl mdiobb;
 	u32 num_rx_ring[NUM_RX_QUEUE];
 	u32 num_tx_ring[NUM_TX_QUEUE];
@@ -1033,6 +1034,7 @@ struct ravb_private {
 
 	unsigned no_avb_link:1;
 	unsigned avb_link_active_low:1;
+	unsigned wol_enabled:1;
 };
 
 static inline u32 ravb_read(struct net_device *ndev, enum ravb_reg reg)
diff --git a/drivers/net/ethernet/renesas/ravb_main.c b/drivers/net/ethernet/renesas/ravb_main.c
index 3cd7989c007dfe46..90913cf2477eb25d 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -680,6 +680,9 @@ static void ravb_emac_interrupt_unlocked(struct net_device *ndev)
 
 	ecsr = ravb_read(ndev, ECSR);
 	ravb_write(ndev, ecsr, ECSR);	/* clear interrupt */
+
+	if (ecsr & ECSR_MPD)
+		pm_wakeup_event(&priv->pdev->dev, 0);
 	if (ecsr & ECSR_ICD)
 		ndev->stats.tx_carrier_errors++;
 	if (ecsr & ECSR_LCHNG) {
@@ -1330,6 +1333,33 @@ static int ravb_get_ts_info(struct net_device *ndev,
 	return 0;
 }
 
+static void ravb_get_wol(struct net_device *ndev, struct ethtool_wolinfo *wol)
+{
+	struct ravb_private *priv = netdev_priv(ndev);
+
+	wol->supported = 0;
+	wol->wolopts = 0;
+
+	if (priv->clk) {
+		wol->supported = WAKE_MAGIC;
+		wol->wolopts = priv->wol_enabled ? WAKE_MAGIC : 0;
+	}
+}
+
+static int ravb_set_wol(struct net_device *ndev, struct ethtool_wolinfo *wol)
+{
+	struct ravb_private *priv = netdev_priv(ndev);
+
+	if (!priv->clk || wol->wolopts & ~WAKE_MAGIC)
+		return -EOPNOTSUPP;
+
+	priv->wol_enabled = !!(wol->wolopts & WAKE_MAGIC);
+
+	device_set_wakeup_enable(&priv->pdev->dev, priv->wol_enabled);
+
+	return 0;
+}
+
 static const struct ethtool_ops ravb_ethtool_ops = {
 	.nway_reset		= ravb_nway_reset,
 	.get_msglevel		= ravb_get_msglevel,
@@ -1343,6 +1373,8 @@ static const struct ethtool_ops ravb_ethtool_ops = {
 	.get_ts_info		= ravb_get_ts_info,
 	.get_link_ksettings	= ravb_get_link_ksettings,
 	.set_link_ksettings	= ravb_set_link_ksettings,
+	.get_wol		= ravb_get_wol,
+	.set_wol		= ravb_set_wol,
 };
 
 static inline int ravb_hook_irq(unsigned int irq, irq_handler_t handler,
@@ -2041,6 +2073,11 @@ static int ravb_probe(struct platform_device *pdev)
 
 	priv->chip_id = chip_id;
 
+	/* Get clock, if not found that's OK but Wake-On-Lan is unavailable */
+	priv->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(priv->clk))
+		priv->clk = NULL;
+
 	/* Set function */
 	ndev->netdev_ops = &ravb_netdev_ops;
 	ndev->ethtool_ops = &ravb_ethtool_ops;
@@ -2107,6 +2144,9 @@ static int ravb_probe(struct platform_device *pdev)
 	if (error)
 		goto out_napi_del;
 
+	if (priv->clk)
+		device_set_wakeup_capable(&pdev->dev, 1);
+
 	/* Print device information */
 	netdev_info(ndev, "Base address at %#x, %pM, IRQ %d.\n",
 		    (u32)ndev->base_addr, ndev->dev_addr, ndev->irq);
@@ -2160,15 +2200,69 @@ static int ravb_remove(struct platform_device *pdev)
 	return 0;
 }
 
+static int ravb_wol_setup(struct net_device *ndev)
+{
+	struct ravb_private *priv = netdev_priv(ndev);
+
+	/* Disable interrupts by clearing the interrupt masks. */
+	ravb_write(ndev, 0, RIC0);
+	ravb_write(ndev, 0, RIC2);
+	ravb_write(ndev, 0, TIC);
+
+	/* Only allow ECI interrupts */
+	synchronize_irq(priv->emac_irq);
+	napi_disable(&priv->napi[RAVB_NC]);
+	napi_disable(&priv->napi[RAVB_BE]);
+	ravb_write(ndev, ECSIPR_MPDIP, ECSIPR);
+
+	/* Enable MagicPacket */
+	ravb_modify(ndev, ECMR, ECMR_MPDE, ECMR_MPDE);
+
+	/* Increased clock usage so device won't be suspended */
+	clk_enable(priv->clk);
+
+	return enable_irq_wake(priv->emac_irq);
+}
+
+static int ravb_wol_restore(struct net_device *ndev)
+{
+	struct ravb_private *priv = netdev_priv(ndev);
+	int ret;
+
+	napi_enable(&priv->napi[RAVB_NC]);
+	napi_enable(&priv->napi[RAVB_BE]);
+
+	/* Disable MagicPacket */
+	ravb_modify(ndev, ECMR, ECMR_MPDE, 0);
+
+	ret = ravb_close(ndev);
+	if (ret < 0)
+		return ret;
+
+	/* Restore clock usage count */
+	clk_disable(priv->clk);
+
+	/* Set reset mode */
+	ravb_write(ndev, CCC_OPC_RESET, CCC);
+
+	return disable_irq_wake(priv->emac_irq);
+}
+
 static int __maybe_unused ravb_suspend(struct device *dev)
 {
 	struct net_device *ndev = dev_get_drvdata(dev);
-	int ret = 0;
+	struct ravb_private *priv = netdev_priv(ndev);
+	int ret;
 
-	if (netif_running(ndev)) {
-		netif_device_detach(ndev);
+	if (!netif_running(ndev))
+		return 0;
+
+	netif_device_detach(ndev);
+
+	if (priv->wol_enabled)
+		ret = ravb_wol_setup(ndev);
+	else
 		ret = ravb_close(ndev);
-	}
 
 	return ret;
 }
@@ -2179,6 +2273,12 @@ static int __maybe_unused ravb_resume(struct device *dev)
 	struct ravb_private *priv = netdev_priv(ndev);
 	int ret = 0;
 
+	if (priv->wol_enabled) {
+		ret = ravb_wol_restore(ndev);
+		if (ret)
+			return ret;
+	}
+
 	/* All register have been reset to default values.
 	 * Restore all registers which where setup at probe time and
 	 * reopen device if it was running before system suspended.
-- 
2.12.2

^ permalink raw reply related

* Re: [PATCH v2 1/7] bpf: Track alignment of register values in the verifier.
From: Alexei Starovoitov @ 2017-05-11 22:53 UTC (permalink / raw)
  To: David Miller, daniel; +Cc: alexei.starovoitov, netdev
In-Reply-To: <20170511.120516.2089919860112370473.davem@davemloft.net>

On 5/11/17 9:05 AM, David Miller wrote:
> +		had_id = (dst_reg->id != 0);
> +
>  		/* dst_reg stays as pkt_ptr type and since some positive
>  		 * integer value was added to the pointer, increment its 'id'
>  		 */
>  		dst_reg->id = ++env->id_gen;
>
> -		/* something was added to pkt_ptr, set range and off to zero */
> +		/* something was added to pkt_ptr, set range to zero */
> +		dst_reg->aux_off = dst_reg->off;

what about 2nd addition of a variable to pkt_ptr ?
aux_off sort-of remembers already accumulated offset in pkt_ptr, but
above line will hard assign it which doesn't seem right for the 2nd
addition.
Ex:
before first add, reg->off == 14
after first add, aux_off = 14, off = 0
then imm4 added, now we have reg->off=4, aux_off=14
now we do 2nd add of variable and
reg->aux_off becomes 4
and if we later do u64 load from the packet it will be rejected
due to (net_ip_align + 4) whereas it should have been ok
due to (net_ip_align + 14 + 4).


>  		dst_reg->off = 0;
>  		dst_reg->range = 0;
> +		if (had_id)
> +			dst_reg->aux_off_align = min(dst_reg->aux_off_align,
> +						     src_reg->min_align);
> +		else
> +			dst_reg->aux_off_align = src_reg->min_align;

all aux_off_align logic here and in other places looks fine to me,
since it's conservative.

^ permalink raw reply

* Re: [PATCH net-next] selftests/bpf: get rid of -D__x86_64__
From: Alexei Starovoitov @ 2017-05-11 22:58 UTC (permalink / raw)
  To: David Miller; +Cc: daniel, netdev
In-Reply-To: <20170511.150246.650724054596949813.davem@davemloft.net>

On 5/11/17 12:02 PM, David Miller wrote:
> From: Alexei Starovoitov <ast@fb.com>
> Date: Thu, 4 May 2017 16:34:08 -0700
>
>> Hence I think the cleanest solution is to have bpf arch's types.h
>> either installed with llvm/gcc or picked from selftests's dir.
>
> Something like this?

yes :)
Ack.

> ====================
> [PATCH] bpf: Provide a linux/types.h override for bpf selftests.
>
> We do not want to use the architecture's type.h header when
> building BPF programs which are always 64-bit.
>
> Signed-off-by: David S. Miller <davem@davemloft.net>
> ---
>  tools/testing/selftests/bpf/Makefile                   | 3 ++-
>  tools/testing/selftests/bpf/include/uapi/linux/types.h | 6 ++++++
>  2 files changed, 8 insertions(+), 1 deletion(-)
>  create mode 100644 tools/testing/selftests/bpf/include/uapi/linux/types.h
>
> diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
> index f92f27d..f389b02 100644
> --- a/tools/testing/selftests/bpf/Makefile
> +++ b/tools/testing/selftests/bpf/Makefile
> @@ -35,6 +35,7 @@ $(BPFOBJ): force
>  CLANG ?= clang
>
>  %.o: %.c
> -	$(CLANG) -I. -I../../../include/uapi -I../../../../samples/bpf/ \
> +	$(CLANG) -I. -I./include/uapi -I../../../include/uapi \

Can we than move gnu/stubs.h into include/uapi as well and remove
the first -I. ?
Or keep them separate, since this linux/types.h is bpf's arch types.h
whereas gnu/stubs.h is a hack for glibc /usr/include/features.h ?
I'm fine whichever way including keeping this patch as-is.

^ permalink raw reply

* [PATCH net v2 2/2] xdp: refine xdp api with regards to generic xdp
From: Daniel Borkmann @ 2017-05-11 23:04 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, kubakici, john.fastabend, netdev,
	Daniel Borkmann
In-Reply-To: <cover.1494542162.git.daniel@iogearbox.net>

While working on the iproute2 generic XDP frontend, I noticed that
as of right now it's possible to have native *and* generic XDP
programs loaded both at the same time for the case when a driver
supports native XDP.

The intended model for generic XDP from b5cdae3291f7 ("net: Generic
XDP") is, however, that only one out of the two can be present at
once which is also indicated as such in the XDP netlink dump part.
The main rationale for generic XDP is to ease accessibility (in
case a driver does not yet have XDP support) and to generically
provide a semantical model as an example for driver developers
wanting to add XDP support. The generic XDP option for an XDP
aware driver can still be useful for comparing and testing both
implementations.

However, it is not intended to have a second XDP processing stage
or layer with exactly the same functionality of the first native
stage. Only reason could be to have a partial fallback for future
XDP features that are not supported yet in the native implementation
and we probably also shouldn't strive for such fallback and instead
encourage native feature support in the first place. Given there's
currently no such fallback issue or use case, lets not go there yet
if we don't need to.

Therefore, change semantics for loading XDP and bail out if the
user tries to load a generic XDP program when a native one is
present and vice versa. Another alternative to bailing out would
be to handle the transition from one flavor to another gracefully,
but that would require to bring the device down, exchange both
types of programs, and bring it up again in order to avoid a tiny
window where a packet could hit both hooks. Given this complicates
the logic for just a debugging feature in the native case, I went
with the simpler variant.

For the dump, remove IFLA_XDP_FLAGS that was added with b5cdae3291f7
and reuse IFLA_XDP_ATTACHED for indicating the mode. Dumping all
or just a subset of flags that were used for loading the XDP prog
is suboptimal in the long run since not all flags are useful for
dumping and if we start to reuse the same flag definitions for
load and dump, then we'll waste bit space. What we really just
want is to dump the mode for now.

Current IFLA_XDP_ATTACHED semantics are: nothing was installed (0),
a program is running at the native driver layer (1). Thus, add a
mode that says that a program is running at generic XDP layer (2).
Applications will handle this fine in that older binaries will
just indicate that something is attached at XDP layer, effectively
this is similar to IFLA_XDP_FLAGS attr that we would have had
modulo the redundancy.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/linux/netdevice.h    |  8 +++++--
 include/uapi/linux/if_link.h |  7 ++++++
 net/core/dev.c               | 55 +++++++++++++++++++++++++++++---------------
 net/core/rtnetlink.c         | 40 +++++++++++++++-----------------
 4 files changed, 67 insertions(+), 43 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9c23bd2..3f39d27 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3296,11 +3296,15 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
 			   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
-int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
-		      int fd, u32 flags);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
+
+typedef int (*xdp_op_t)(struct net_device *dev, struct netdev_xdp *xdp);
+int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
+		      int fd, u32 flags);
+bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op);
+
 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
 bool is_skb_forwardable(const struct net_device *dev,
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 549ac8a..15ac203 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -894,6 +894,13 @@ enum {
 					 XDP_FLAGS_SKB_MODE | \
 					 XDP_FLAGS_DRV_MODE)
 
+/* These are stored into IFLA_XDP_ATTACHED on dump. */
+enum {
+	XDP_ATTACHED_NONE = 0,
+	XDP_ATTACHED_DRV,
+	XDP_ATTACHED_SKB,
+};
+
 enum {
 	IFLA_XDP_UNSPEC,
 	IFLA_XDP_FD,
diff --git a/net/core/dev.c b/net/core/dev.c
index e56cb71..fca407b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6852,6 +6852,32 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 }
 EXPORT_SYMBOL(dev_change_proto_down);
 
+bool __dev_xdp_attached(struct net_device *dev, xdp_op_t xdp_op)
+{
+	struct netdev_xdp xdp;
+
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_QUERY_PROG;
+
+	/* Query must always succeed. */
+	WARN_ON(xdp_op(dev, &xdp) < 0);
+	return xdp.prog_attached;
+}
+
+static int dev_xdp_install(struct net_device *dev, xdp_op_t xdp_op,
+			   struct netlink_ext_ack *extack,
+			   struct bpf_prog *prog)
+{
+	struct netdev_xdp xdp;
+
+	memset(&xdp, 0, sizeof(xdp));
+	xdp.command = XDP_SETUP_PROG;
+	xdp.extack = extack;
+	xdp.prog = prog;
+
+	return xdp_op(dev, &xdp);
+}
+
 /**
  *	dev_change_xdp_fd - set or clear a bpf program for a device rx path
  *	@dev: device
@@ -6864,43 +6890,34 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 		      int fd, u32 flags)
 {
-	int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp);
 	const struct net_device_ops *ops = dev->netdev_ops;
 	struct bpf_prog *prog = NULL;
-	struct netdev_xdp xdp;
+	xdp_op_t xdp_op, xdp_chk;
 	int err;
 
 	ASSERT_RTNL();
 
-	xdp_op = ops->ndo_xdp;
+	xdp_op = xdp_chk = ops->ndo_xdp;
 	if (!xdp_op && (flags & XDP_FLAGS_DRV_MODE))
 		return -EOPNOTSUPP;
 	if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE))
 		xdp_op = generic_xdp_install;
+	if (xdp_op == xdp_chk)
+		xdp_chk = generic_xdp_install;
 
 	if (fd >= 0) {
-		if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) {
-			memset(&xdp, 0, sizeof(xdp));
-			xdp.command = XDP_QUERY_PROG;
-
-			err = xdp_op(dev, &xdp);
-			if (err < 0)
-				return err;
-			if (xdp.prog_attached)
-				return -EBUSY;
-		}
+		if (xdp_chk && __dev_xdp_attached(dev, xdp_chk))
+			return -EEXIST;
+		if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
+		    __dev_xdp_attached(dev, xdp_op))
+			return -EBUSY;
 
 		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
 		if (IS_ERR(prog))
 			return PTR_ERR(prog);
 	}
 
-	memset(&xdp, 0, sizeof(xdp));
-	xdp.command = XDP_SETUP_PROG;
-	xdp.extack = extack;
-	xdp.prog = prog;
-
-	err = xdp_op(dev, &xdp);
+	err = dev_xdp_install(dev, xdp_op, extack, prog);
 	if (err < 0 && prog)
 		bpf_prog_put(prog);
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index dda9f16..d7f82c3 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -899,8 +899,7 @@ static size_t rtnl_port_size(const struct net_device *dev,
 static size_t rtnl_xdp_size(void)
 {
 	size_t xdp_size = nla_total_size(0) +	/* nest IFLA_XDP */
-			  nla_total_size(1) +	/* XDP_ATTACHED */
-			  nla_total_size(4);	/* XDP_FLAGS */
+			  nla_total_size(1);	/* XDP_ATTACHED */
 
 	return xdp_size;
 }
@@ -1247,37 +1246,34 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
 	return 0;
 }
 
+static u8 rtnl_xdp_attached_mode(struct net_device *dev)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	ASSERT_RTNL();
+
+	if (rcu_access_pointer(dev->xdp_prog))
+		return XDP_ATTACHED_SKB;
+	if (ops->ndo_xdp && __dev_xdp_attached(dev, ops->ndo_xdp))
+		return XDP_ATTACHED_DRV;
+
+	return XDP_ATTACHED_NONE;
+}
+
 static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
 {
 	struct nlattr *xdp;
-	u32 xdp_flags = 0;
-	u8 val = 0;
 	int err;
 
 	xdp = nla_nest_start(skb, IFLA_XDP);
 	if (!xdp)
 		return -EMSGSIZE;
-	if (rcu_access_pointer(dev->xdp_prog)) {
-		xdp_flags = XDP_FLAGS_SKB_MODE;
-		val = 1;
-	} else if (dev->netdev_ops->ndo_xdp) {
-		struct netdev_xdp xdp_op = {};
-
-		xdp_op.command = XDP_QUERY_PROG;
-		err = dev->netdev_ops->ndo_xdp(dev, &xdp_op);
-		if (err)
-			goto err_cancel;
-		val = xdp_op.prog_attached;
-	}
-	err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val);
+
+	err = nla_put_u8(skb, IFLA_XDP_ATTACHED,
+			 rtnl_xdp_attached_mode(dev));
 	if (err)
 		goto err_cancel;
 
-	if (xdp_flags) {
-		err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags);
-		if (err)
-			goto err_cancel;
-	}
 	nla_nest_end(skb, xdp);
 	return 0;
 
-- 
1.9.3

^ permalink raw reply related

* [PATCH net v2 0/2] Two generic xdp related follow-ups
From: Daniel Borkmann @ 2017-05-11 23:04 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, kubakici, john.fastabend, netdev,
	Daniel Borkmann

Two follow-ups for the generic XDP API, would be great if
both could still be considered, since the XDP API is not
frozen yet. For details please see individual patches.

v1 -> v2:
  - Implemented feedback from Jakub Kicinski (reusing
    attribute on dump), thanks!
  - Rest as is.

Thanks!

Daniel Borkmann (2):
  xdp: add flag to enforce driver mode
  xdp: refine xdp api with regards to generic xdp

 include/linux/netdevice.h          |  8 ++++--
 include/uapi/linux/if_link.h       | 13 +++++++--
 net/core/dev.c                     | 57 +++++++++++++++++++++++++-------------
 net/core/rtnetlink.c               | 45 +++++++++++++++---------------
 samples/bpf/xdp1_user.c            |  8 ++++--
 samples/bpf/xdp_tx_iptunnel_user.c |  7 ++++-
 6 files changed, 90 insertions(+), 48 deletions(-)

-- 
1.9.3

^ permalink raw reply

* [PATCH net v2 1/2] xdp: add flag to enforce driver mode
From: Daniel Borkmann @ 2017-05-11 23:04 UTC (permalink / raw)
  To: davem; +Cc: alexei.starovoitov, kubakici, john.fastabend, netdev,
	Daniel Borkmann
In-Reply-To: <cover.1494542162.git.daniel@iogearbox.net>

After commit b5cdae3291f7 ("net: Generic XDP") we automatically fall
back to a generic XDP variant if the driver does not support native
XDP. Allow for an option where the user can specify that always the
native XDP variant should be selected and in case it's not supported
by a driver, just bail out.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/if_link.h       | 6 ++++--
 net/core/dev.c                     | 2 ++
 net/core/rtnetlink.c               | 5 +++++
 samples/bpf/xdp1_user.c            | 8 ++++++--
 samples/bpf/xdp_tx_iptunnel_user.c | 7 ++++++-
 5 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 8e56ac7..549ac8a 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -888,9 +888,11 @@ enum {
 /* XDP section */
 
 #define XDP_FLAGS_UPDATE_IF_NOEXIST	(1U << 0)
-#define XDP_FLAGS_SKB_MODE		(2U << 0)
+#define XDP_FLAGS_SKB_MODE		(1U << 1)
+#define XDP_FLAGS_DRV_MODE		(1U << 2)
 #define XDP_FLAGS_MASK			(XDP_FLAGS_UPDATE_IF_NOEXIST | \
-					 XDP_FLAGS_SKB_MODE)
+					 XDP_FLAGS_SKB_MODE | \
+					 XDP_FLAGS_DRV_MODE)
 
 enum {
 	IFLA_XDP_UNSPEC,
diff --git a/net/core/dev.c b/net/core/dev.c
index 96cf83d..e56cb71 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6873,6 +6873,8 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
 	ASSERT_RTNL();
 
 	xdp_op = ops->ndo_xdp;
+	if (!xdp_op && (flags & XDP_FLAGS_DRV_MODE))
+		return -EOPNOTSUPP;
 	if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE))
 		xdp_op = generic_xdp_install;
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index bcb0f610..dda9f16 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2199,6 +2199,11 @@ static int do_setlink(const struct sk_buff *skb,
 				err = -EINVAL;
 				goto errout;
 			}
+			if ((xdp_flags & XDP_FLAGS_SKB_MODE) &&
+			    (xdp_flags & XDP_FLAGS_DRV_MODE)) {
+				err = -EINVAL;
+				goto errout;
+			}
 		}
 
 		if (xdp[IFLA_XDP_FD]) {
diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c
index 378850c..17be9ea 100644
--- a/samples/bpf/xdp1_user.c
+++ b/samples/bpf/xdp1_user.c
@@ -62,13 +62,14 @@ static void usage(const char *prog)
 	fprintf(stderr,
 		"usage: %s [OPTS] IFINDEX\n\n"
 		"OPTS:\n"
-		"    -S    use skb-mode\n",
+		"    -S    use skb-mode\n"
+		"    -N    enforce native mode\n",
 		prog);
 }
 
 int main(int argc, char **argv)
 {
-	const char *optstr = "S";
+	const char *optstr = "SN";
 	char filename[256];
 	int opt;
 
@@ -77,6 +78,9 @@ int main(int argc, char **argv)
 		case 'S':
 			xdp_flags |= XDP_FLAGS_SKB_MODE;
 			break;
+		case 'N':
+			xdp_flags |= XDP_FLAGS_DRV_MODE;
+			break;
 		default:
 			usage(basename(argv[0]));
 			return 1;
diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c
index 92b8bde..631cdcc 100644
--- a/samples/bpf/xdp_tx_iptunnel_user.c
+++ b/samples/bpf/xdp_tx_iptunnel_user.c
@@ -79,6 +79,8 @@ static void usage(const char *cmd)
 	printf("    -m <dest-MAC> Used in sending the IP Tunneled pkt\n");
 	printf("    -T <stop-after-X-seconds> Default: 0 (forever)\n");
 	printf("    -P <IP-Protocol> Default is TCP\n");
+	printf("    -S use skb-mode\n");
+	printf("    -N enforce native mode\n");
 	printf("    -h Display this help\n");
 }
 
@@ -138,7 +140,7 @@ int main(int argc, char **argv)
 {
 	unsigned char opt_flags[256] = {};
 	unsigned int kill_after_s = 0;
-	const char *optstr = "i:a:p:s:d:m:T:P:Sh";
+	const char *optstr = "i:a:p:s:d:m:T:P:SNh";
 	int min_port = 0, max_port = 0;
 	struct iptnl_info tnl = {};
 	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
@@ -206,6 +208,9 @@ int main(int argc, char **argv)
 		case 'S':
 			xdp_flags |= XDP_FLAGS_SKB_MODE;
 			break;
+		case 'N':
+			xdp_flags |= XDP_FLAGS_DRV_MODE;
+			break;
 		default:
 			usage(argv[0]);
 			return 1;
-- 
1.9.3

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox