Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH v5 5/5] virtio_net: check return value of skb_to_sgvec always
From: Jason A. Donenfeld @ 2017-04-25 15:52 UTC (permalink / raw)
  To: netdev, linux-kernel, David.Laight, kernel-hardening, davem
  Cc: Jason A. Donenfeld
In-Reply-To: <20170425155215.4835-1-Jason@zx2c4.com>

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 drivers/net/virtio_net.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f36584616e7d..1709fd0b4bf7 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1081,7 +1081,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
 	const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
 	struct virtnet_info *vi = sq->vq->vdev->priv;
-	unsigned num_sg;
+	int num_sg;
 	unsigned hdr_len = vi->hdr_len;
 	bool can_push;
 
@@ -1114,6 +1114,8 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 		sg_set_buf(sq->sg, hdr, hdr_len);
 		num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
 	}
+	if (unlikely(num_sg < 0))
+		return num_sg;
 	return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
 }
 
-- 
2.12.2

^ permalink raw reply related

* Re: [PATCH net-next 0/6] qed/qede: VF tunnelling support
From: David Miller @ 2017-04-25 15:52 UTC (permalink / raw)
  To: manish.chopra; +Cc: netdev, Yuval.Mintz
In-Reply-To: <20170424170049.16027-1-manish.chopra@cavium.com>

From: Manish Chopra <manish.chopra@cavium.com>
Date: Mon, 24 Apr 2017 10:00:43 -0700

> With this series VFs can run vxlan/geneve/gre tunnels over it.
> Please consider applying this series to "net-next"

Series applied.

^ permalink raw reply

* [PATCH v5 2/5] ipsec: check return value of skb_to_sgvec always
From: Jason A. Donenfeld @ 2017-04-25 15:52 UTC (permalink / raw)
  To: netdev, linux-kernel, David.Laight, kernel-hardening, davem
  Cc: Jason A. Donenfeld
In-Reply-To: <20170425155215.4835-1-Jason@zx2c4.com>

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
 net/ipv4/ah4.c  |  8 ++++++--
 net/ipv4/esp4.c | 30 ++++++++++++++++++++----------
 net/ipv6/ah6.c  |  8 ++++++--
 net/ipv6/esp6.c | 31 +++++++++++++++++++++----------
 4 files changed, 53 insertions(+), 24 deletions(-)

diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 22377c8ff14b..e8f862358518 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -220,7 +220,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
 	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags + sglists);
-	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out_free;
 
 	if (x->props.flags & XFRM_STATE_ESN) {
 		/* Attach seqhi sg right after packet payload */
@@ -393,7 +395,9 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
 	skb_push(skb, ihl);
 
 	sg_init_table(sg, nfrags + sglists);
-	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out_free;
 
 	if (x->props.flags & XFRM_STATE_ESN) {
 		/* Attach seqhi sg right after packet payload */
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index b1e24446e297..42cb09cc8533 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -360,9 +360,13 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 			esph = esp_output_set_extra(skb, esph, extra);
 
 			sg_init_table(sg, nfrags);
-			skb_to_sgvec(skb, sg,
-				     (unsigned char *)esph - skb->data,
-				     assoclen + ivlen + clen + alen);
+			err = skb_to_sgvec(skb, sg,
+				           (unsigned char *)esph - skb->data,
+				           assoclen + ivlen + clen + alen);
+			if (unlikely(err < 0)) {
+				spin_unlock_bh(&x->lock);
+				goto error;
+			}
 
 			allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
 
@@ -381,11 +385,13 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 			pfrag->offset = pfrag->offset + allocsize;
 
 			sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
-			skb_to_sgvec(skb, dsg,
-				     (unsigned char *)esph - skb->data,
-				     assoclen + ivlen + clen + alen);
+			err = skb_to_sgvec(skb, dsg,
+				           (unsigned char *)esph - skb->data,
+				           assoclen + ivlen + clen + alen);
 
 			spin_unlock_bh(&x->lock);
+			if (unlikely(err < 0))
+				goto error;
 
 			goto skip_cow2;
 		}
@@ -422,9 +428,11 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	esph = esp_output_set_extra(skb, esph, extra);
 
 	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg,
-		     (unsigned char *)esph - skb->data,
-		     assoclen + ivlen + clen + alen);
+	err = skb_to_sgvec(skb, sg,
+		           (unsigned char *)esph - skb->data,
+		           assoclen + ivlen + clen + alen);
+	if (unlikely(err < 0))
+		goto error;
 
 skip_cow2:
 	if ((x->props.flags & XFRM_STATE_ESN))
@@ -658,7 +666,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	esp_input_set_header(skb, seqhi);
 
 	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	err = skb_to_sgvec(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out;
 
 	skb->ip_summed = CHECKSUM_NONE;
 
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index dda6035e3b84..755f38271dd5 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -423,7 +423,9 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
 	ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 
 	sg_init_table(sg, nfrags + sglists);
-	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out_free;
 
 	if (x->props.flags & XFRM_STATE_ESN) {
 		/* Attach seqhi sg right after packet payload */
@@ -606,7 +608,9 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb)
 	ip6h->hop_limit   = 0;
 
 	sg_init_table(sg, nfrags + sglists);
-	skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	err = skb_to_sgvec_nomark(skb, sg, 0, skb->len);
+	if (unlikely(err < 0))
+		goto out_free;
 
 	if (x->props.flags & XFRM_STATE_ESN) {
 		/* Attach seqhi sg right after packet payload */
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index ff54faa75631..017e2c2d36e1 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -339,9 +339,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 			esph = esp_output_set_esn(skb, esph, seqhi);
 
 			sg_init_table(sg, nfrags);
-			skb_to_sgvec(skb, sg,
-				     (unsigned char *)esph - skb->data,
-				     assoclen + ivlen + clen + alen);
+			err = skb_to_sgvec(skb, sg,
+				           (unsigned char *)esph - skb->data,
+				           assoclen + ivlen + clen + alen);
+			if (unlikely(err < 0)) {
+				spin_unlock_bh(&x->lock);
+				goto error;
+			}
 
 			allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
 
@@ -360,12 +364,15 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 			pfrag->offset = pfrag->offset + allocsize;
 
 			sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
-			skb_to_sgvec(skb, dsg,
-				     (unsigned char *)esph - skb->data,
-				     assoclen + ivlen + clen + alen);
+			err = skb_to_sgvec(skb, dsg,
+				           (unsigned char *)esph - skb->data,
+				           assoclen + ivlen + clen + alen);
 
 			spin_unlock_bh(&x->lock);
 
+			if (unlikely(err < 0))
+				goto error;
+
 			goto skip_cow2;
 		}
 	}
@@ -403,9 +410,11 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	esph = esp_output_set_esn(skb, esph, seqhi);
 
 	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg,
-		     (unsigned char *)esph - skb->data,
-		     assoclen + ivlen + clen + alen);
+	err = skb_to_sgvec(skb, sg,
+		           (unsigned char *)esph - skb->data,
+		           assoclen + ivlen + clen + alen);
+	if (unlikely(err < 0))
+		goto error;
 
 skip_cow2:
 	if ((x->props.flags & XFRM_STATE_ESN))
@@ -600,7 +609,9 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 	esp_input_set_header(skb, seqhi);
 
 	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	ret = skb_to_sgvec(skb, sg, 0, skb->len);
+	if (unlikely(ret < 0))
+		goto out;
 
 	skb->ip_summed = CHECKSUM_NONE;
 
-- 
2.12.2

^ permalink raw reply related

* Re: [PATCH] net: hso: fix module unloading
From: David Miller @ 2017-04-25 15:52 UTC (permalink / raw)
  To: andreas; +Cc: joe, peter, linux-usb, netdev, linux-kernel, letux-kernel
In-Reply-To: <1493061519-15785-1-git-send-email-andreas@kemnade.info>

From: Andreas Kemnade <andreas@kemnade.info>
Date: Mon, 24 Apr 2017 21:18:39 +0200

> keep tty driver until usb driver is unregistered
> rmmod hso
> produces traces like this without that:
 ...
> Signed-off-by: Andreas Kemnade <andreas@kemnade.info>

Applied, thank you.

^ permalink raw reply

* Re: [PATCH v3 net] net: ipv6: regenerate host route if moved to gc list
From: David Ahern @ 2017-04-25 15:54 UTC (permalink / raw)
  To: Andrey Konovalov; +Cc: netdev, Dmitry Vyukov, mmanning, Martin KaFai Lau
In-Reply-To: <CAAeHK+yLe1PoFMqmGK0B6wXqfc0AZzRAmMLxks-vSkOp5Eq3VQ@mail.gmail.com>

On 4/25/17 6:50 AM, Andrey Konovalov wrote:
> I've been running syzkaller with your patch and got another report
> from ip6_pol_route.

In general the existing patch cleans up all of the ipv6 fib kasan and
WARN_ON traces that were seen?

> 
> It happened only once so far and I couldn't reproduce it.

Some similarity in the sense of a ipv4 route in the ipv6 fib. That's
only going to happen if it hits the dst gc list and then back.

This duplicates what Dmitry reported on March 3rd.

^ permalink raw reply

* Re: [PATCH] net: ethernet: ti: netcp_core: remove unused compl queue mapping
From: David Miller @ 2017-04-25 15:55 UTC (permalink / raw)
  To: ivan.khoronzhuk
  Cc: netdev, w-kwok2, m-karicheri2, linux-kernel, grygorii.strashko
In-Reply-To: <1493067246-3247-1-git-send-email-ivan.khoronzhuk@linaro.org>

From: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
Date: Mon, 24 Apr 2017 23:54:06 +0300

> This code is unused and probably was unintentionally left while
> moving completion queue mapping in submit function.
> 
> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>

Applied.

^ permalink raw reply

* Re: [PATCH net-next] selftests/net: Fix broken test case in psock_fanout
From: David Miller @ 2017-04-25 15:56 UTC (permalink / raw)
  To: maloneykernel; +Cc: netdev, maloney
In-Reply-To: <20170425012911.903-1-maloneykernel@gmail.com>

From: Mike Maloney <maloneykernel@gmail.com>
Date: Mon, 24 Apr 2017 21:29:11 -0400

> From: Mike Maloney <maloney@google.com>
> 
> The error return falue form sock_fanout_open is -1, not zero.  One test
> case was checking for 0 instead of -1.
> 
> Tested: Built and tested in clean client.
> Signed-off-by: Mike Maloney <maloney@google.com>

Applied, thanks.

^ permalink raw reply

* Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone
From: David Ahern @ 2017-04-25 15:56 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Mahesh Bandewar, Eric Dumazet, David Miller, Alexey Kuznetsov,
	James Morris, Hideaki YOSHIFUJI, Patrick McHardy, netdev, LKML,
	Cong Wang, syzkaller
In-Reply-To: <CACT4Y+YCPxKkaZBfAabxtAV71R0ghLT-vUKVewJpm9wb2GXaEQ@mail.gmail.com>

On 3/4/17 11:57 AM, Dmitry Vyukov wrote:
> ==================================================================
> BUG: KASAN: slab-out-of-bounds in rt6_dump_route+0x293/0x2f0
> net/ipv6/route.c:3551 at addr ffff88007e523694
> Read of size 4 by task syz-executor3/24426
> CPU: 2 PID: 24426 Comm: syz-executor3 Not tainted 4.10.0+ #293
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Call Trace:
>  __dump_stack lib/dump_stack.c:16 [inline]
>  dump_stack+0x2fb/0x3fd lib/dump_stack.c:52
>  kasan_object_err+0x1c/0x90 mm/kasan/report.c:166
>  print_address_description mm/kasan/report.c:208 [inline]
>  kasan_report_error mm/kasan/report.c:292 [inline]
>  kasan_report.part.2+0x1b0/0x460 mm/kasan/report.c:314
>  kasan_report mm/kasan/report.c:334 [inline]
>  __asan_report_load4_noabort+0x29/0x30 mm/kasan/report.c:334
>  rt6_dump_route+0x293/0x2f0 net/ipv6/route.c:3551
>  fib6_dump_node+0x101/0x1a0 net/ipv6/ip6_fib.c:315
>  fib6_walk_continue+0x4b3/0x620 net/ipv6/ip6_fib.c:1576
>  fib6_walk+0x91/0xf0 net/ipv6/ip6_fib.c:1621
>  fib6_dump_table net/ipv6/ip6_fib.c:374 [inline]
>  inet6_dump_fib+0x832/0xea0 net/ipv6/ip6_fib.c:447

My expectation is that this one is fixed with the ipv6 patch I have sent
(not yet committed). Are you seeing this backtrace with that patch?

^ permalink raw reply

* Re: [PATCH net] netvsc: fix calculation of available send sections
From: David Miller @ 2017-04-25 15:57 UTC (permalink / raw)
  To: stephen; +Cc: netdev, sthemmin
In-Reply-To: <20170425013338.2653-1-sthemmin@microsoft.com>

From: Stephen Hemminger <stephen@networkplumber.org>
Date: Mon, 24 Apr 2017 18:33:38 -0700

> My change (introduced in 4.11) to use find_first_clear_bit
> incorrectly assumed that the size argument was words, not bits.
> The effect was only a small limited number of the available send
> sections were being actually used. This can cause performance loss
> with some workloads.
> 
> Since map_words is now used only during initialization, it can
> be on stack instead of in per-device data.
> 
> Fixes: b58a185801da ("netvsc: simplify get next send section")
> Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>

Applied, thanks.

^ permalink raw reply

* Re: net: heap out-of-bounds in fib6_clean_node/rt6_fill_node/fib6_age/fib6_prune_clone
From: David Ahern @ 2017-04-25 15:57 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Eric Dumazet, Mahesh Bandewar, Eric Dumazet, David Miller,
	Alexey Kuznetsov, James Morris, Hideaki YOSHIFUJI,
	Patrick McHardy, netdev, LKML, Cong Wang, syzkaller
In-Reply-To: <CACT4Y+aDzAYs9mbNGeFEkYSB2wTwomZTGEKn4hDH0PAb4uiLqw@mail.gmail.com>

On 3/7/17 2:21 AM, Dmitry Vyukov wrote:
> ------------[ cut here ]------------
> WARNING: CPU: 2 PID: 3990 at net/ipv6/ip6_fib.c:991
> fib6_add+0x2e12/0x3290 net/ipv6/ip6_fib.c:991 net/ipv6/ip6_fib.c:991
> Kernel panic - not syncing: panic_on_warn set ...
> 
> CPU: 2 PID: 3990 Comm: kworker/2:4 Not tainted 4.11.0-rc1+ #311
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
> Workqueue: ipv6_addrconf addrconf_dad_work
> Call Trace:
>  __dump_stack lib/dump_stack.c:16 [inline]
>  __dump_stack lib/dump_stack.c:16 [inline] lib/dump_stack.c:52
>  dump_stack+0x2fb/0x3fd lib/dump_stack.c:52 lib/dump_stack.c:52
>  panic+0x20f/0x426 kernel/panic.c:180 kernel/panic.c:180
>  __warn+0x1c4/0x1e0 kernel/panic.c:541 kernel/panic.c:541
>  warn_slowpath_null+0x2c/0x40 kernel/panic.c:584 kernel/panic.c:584
>  fib6_add+0x2e12/0x3290 net/ipv6/ip6_fib.c:991 net/ipv6/ip6_fib.c:991
>  __ip6_ins_rt+0x60/0x80 net/ipv6/route.c:948 net/ipv6/route.c:948
>  ip6_ins_rt+0x19b/0x220 net/ipv6/route.c:959 net/ipv6/route.c:959
>  __ipv6_ifa_notify+0x62e/0x7a0 net/ipv6/addrconf.c:5485 net/ipv6/addrconf.c:5485
>  ipv6_ifa_notify+0xdf/0x1d0 net/ipv6/addrconf.c:5518 net/ipv6/addrconf.c:5518
>  addrconf_dad_completed+0xe6/0x950 net/ipv6/addrconf.c:3983
> net/ipv6/addrconf.c:3983
>  addrconf_dad_begin net/ipv6/addrconf.c:3797 [inline]

Similarly for this one.

^ permalink raw reply

* Re: [PATCH net-next] bpf: map_get_next_key to return first key on NULL
From: David Miller @ 2017-04-25 15:57 UTC (permalink / raw)
  To: ast; +Cc: daniel, qinteng, netdev, kernel-team
In-Reply-To: <20170425020037.1114047-1-ast@fb.com>

From: Alexei Starovoitov <ast@fb.com>
Date: Mon, 24 Apr 2017 19:00:37 -0700

> From: Teng Qin <qinteng@fb.com>
> 
> When iterating through a map, we need to find a key that does not exist
> in the map so map_get_next_key will give us the first key of the map.
> This often requires a lot of guessing in production systems.
> 
> This patch makes map_get_next_key return the first key when the key
> pointer in the parameter is NULL.
> 
> Signed-off-by: Teng Qin <qinteng@fb.com>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> Acked-by: Daniel Borkmann <daniel@iogearbox.net>

Applied, thanks Alexei.

^ permalink raw reply

* Re: [PATCH net 1/1] qed: Fix error in the dcbx app meta data initialization.
From: David Miller @ 2017-04-25 15:58 UTC (permalink / raw)
  To: sudarsana.kalluru; +Cc: netdev, Yuval.Mintz
In-Reply-To: <20170425035910.29296-1-sudarsana.kalluru@cavium.com>

From: Sudarsana Reddy Kalluru <sudarsana.kalluru@cavium.com>
Date: Mon, 24 Apr 2017 20:59:10 -0700

> DCBX app_data array is initialized with the incorrect values for
> personality field. This would  prevent offloaded protocols from
> honoring the PFC.
> 
> Signed-off-by: Sudarsana Reddy Kalluru <Sudarsana.Kalluru@cavium.com>

Applied.

^ permalink raw reply

* Re: [PATCH net] macsec: avoid heap overflow in skb_to_sgvec on receive
From: Jason A. Donenfeld @ 2017-04-25 15:59 UTC (permalink / raw)
  To: Sabrina Dubroca; +Cc: Netdev
In-Reply-To: <20170425155140.GA13414@bistromath.localdomain>

On Tue, Apr 25, 2017 at 5:51 PM, Sabrina Dubroca <sd@queasysnail.net> wrote:
> 2017-04-25, 17:39:09 +0200, Jason A. Donenfeld wrote:
>> Hi Sabrina,
>>
>> I think I may have beaten you to the punch here by a few minutes. :)
>
> I said I was going to post a patch.
> Mail headers seem to disagree with you ;)

Oh, whoops, sorry -- thought you wanted me to. Misread, but happy to
have done it anyway.

> Unless I missed something, encrypt was already handling fragments
> correctly. An skb with ->frag_list should have no skb_tailroom, so it
> will be linearized skb_copy_expand().

You're right. In that case, you might as well add back the FRAGLIST
annotation to the FEATURES, so that packets with frag_lists aren't
copied twice -- once upon linearization into xmit and again when
calling copy_expand in case they need more head or tail space. In
general, too, using the precise number of frags saves memory. But
above all, it seems to me the important thing is that it's very
_clear_ there isn't going to be an overflow, that the code doesn't
rely on so many odd particulars that then get violated later in its
life. Using the explicit length makes things a lot more clear. So
maybe better to go with mine?

^ permalink raw reply

* Re: [PATCH net-next] qed: fix invalid use of sizeof in qed_alloc_qm_data()
From: David Miller @ 2017-04-25 16:00 UTC (permalink / raw)
  To: weiyj.lk; +Cc: Yuval.Mintz, Ariel.Elior, weiyongjun1, everest-linux-l2, netdev
In-Reply-To: <20170425070718.14790-1-weiyj.lk@gmail.com>

From: Wei Yongjun <weiyj.lk@gmail.com>
Date: Tue, 25 Apr 2017 07:07:18 +0000

> From: Wei Yongjun <weiyongjun1@huawei.com>
> 
> sizeof() when applied to a pointer typed expression gives the
> size of the pointer, not that of the pointed data.
> 
> Signed-off-by: Wei Yongjun <weiyongjun1@huawei.com>

Applied with Yuval's Fixes: tag added.

^ permalink raw reply

* Re: [PATCH net-next v8 2/3] net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch
From: Jiri Pirko @ 2017-04-25 16:04 UTC (permalink / raw)
  To: Jamal Hadi Salim; +Cc: davem, xiyou.wangcong, eric.dumazet, netdev
In-Reply-To: <5e54edd8-3943-6f09-490f-ff04b83077f6@mojatatu.com>

Tue, Apr 25, 2017 at 03:01:22PM CEST, jhs@mojatatu.com wrote:
>On 17-04-25 08:13 AM, Jiri Pirko wrote:
>> Tue, Apr 25, 2017 at 01:54:06PM CEST, jhs@mojatatu.com wrote:
>
>
>[..]
>
>> > -#define TCAA_MAX 1
>> > +/* tcamsg flags stored in attribute TCA_ROOT_FLAGS
>> > + *
>> > + * TCA_FLAG_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO
>> > + * actions in a dump. All dump responses will contain the number of actions
>> > + * being dumped stored in for user app's consumption in TCA_ROOT_COUNT
>> > + *
>> > + */
>> > +#define TCA_FLAG_LARGE_DUMP_ON		(1 << 0)
>> 
>> BIT (I think I mentioned this before)
>> 
>
>You did - but i took it out about two submissions back (per cover
>letter) because it is no part of UAPI today. I noticed devlink was
>using it but they defined  their own variant.
>So if i added this, iproute2 doesnt compile. I could fix iproute2
>to move it somewhere to a common header then restore this.

So fix iproute2. It is always first kernel, then iproute2.


>
>> > +#define VALID_TCA_FLAGS TCA_FLAG_LARGE_DUMP_ON
>> 
>> Again, namespace please. "TCA_ROOT_FLAGS_VALID"
>
>Good point.
>
>> Why is this UAPI?
>> 
>
>Should not be. I will fix in next update.
>
>
>> 
>> > 
>> > /* New extended info filters for IFLA_EXT_MASK */
>> > #define RTEXT_FILTER_VF		(1 << 0)
>> > diff --git a/net/sched/act_api.c b/net/sched/act_api.c
>> > index 9ce22b7..2756213 100644
>> > --- a/net/sched/act_api.c
>> > +++ b/net/sched/act_api.c
>> > @@ -83,6 +83,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>> > 			   struct netlink_callback *cb)
>> > {
>> > 	int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
>> > +	u32 act_flags = cb->args[2];
>> > 	struct nlattr *nest;
>> > 
>> > 	spin_lock_bh(&hinfo->lock);
>> > @@ -111,14 +112,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
>> > 			}
>> > 			nla_nest_end(skb, nest);
>> > 			n_i++;
>> > -			if (n_i >= TCA_ACT_MAX_PRIO)
>> > +			if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) &&
>> > +			    n_i >= TCA_ACT_MAX_PRIO)
>> > 				goto done;
>> > 		}
>> > 	}
>> > done:
>> > 	spin_unlock_bh(&hinfo->lock);
>> > -	if (n_i)
>> > +	if (n_i) {
>> > 		cb->args[0] += n_i;
>> > +		if (act_flags & TCA_FLAG_LARGE_DUMP_ON)
>> > +			cb->args[1] = n_i;
>> > +	}
>> > 	return n_i;
>> > 
>> > nla_put_failure:
>> > @@ -993,11 +998,15 @@ static int tcf_action_add(struct net *net, struct nlattr *nla,
>> > 	return tcf_add_notify(net, n, &actions, portid);
>> > }
>> > 
>> > +static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = {
>> > +	[TCA_ROOT_FLAGS]      = { .type = NLA_U32 },
>> > +};
>> > +
>> > static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> > 			 struct netlink_ext_ack *extack)
>> > {
>> > 	struct net *net = sock_net(skb->sk);
>> > -	struct nlattr *tca[TCAA_MAX + 1];
>> > +	struct nlattr *tca[TCA_ROOT_MAX + 1];
>> > 	u32 portid = skb ? NETLINK_CB(skb).portid : 0;
>> > 	int ret = 0, ovr = 0;
>> > 
>> > @@ -1005,7 +1014,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> > 	    !netlink_capable(skb, CAP_NET_ADMIN))
>> > 		return -EPERM;
>> > 
>> > -	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, NULL,
>> > +	ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL,
>> > 			  extack);
>> > 	if (ret < 0)
>> > 		return ret;
>> > @@ -1046,16 +1055,12 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n,
>> > 	return ret;
>> > }
>> > 
>> > -static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
>> > +static struct nlattr *find_dump_kind(struct nlattr **nla)
>> > {
>> > 	struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
>> > 	struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
>> > -	struct nlattr *nla[TCAA_MAX + 1];
>> > 	struct nlattr *kind;
>> > 
>> > -	if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX,
>> > -			NULL, NULL) < 0)
>> > -		return NULL;
>> > 	tb1 = nla[TCA_ACT_TAB];
>> > 	if (tb1 == NULL)
>> > 		return NULL;
>> > @@ -1073,6 +1078,16 @@ static struct nlattr *find_dump_kind(const struct nlmsghdr *n)
>> > 	return kind;
>> > }
>> > 
>> > +static inline bool tca_flags_valid(u32 act_flags)
>> > +{
>> > +	u32 invalid_flags_mask  = ~VALID_TCA_FLAGS;
>> > +
>> > +	if (act_flags & invalid_flags_mask)
>> > +		return false;
>> 
>> I don't see how this resolves anything. VALID_TCA_FLAGS is set in stone
>> not going to change anytime in future, right?
>
>Every time a new bit gets added VALID_TCA_FLAGS changes.

You mean flag that user can set? If that is the case, you are breaking
UAPI for newer app running on older kernel.


>
>> Then the TCA_ROOT_FLAGS
>> attr will always contain only one flag, right?
>
>Not true - forever. Just in this patch discussion:
>I have added 2 other flags since removed. So I cant predict the
>future. You keep coming back to this assumption there will always
>ever be this one flag. I am not following how you reach this
>conclusion.

I read this paragraph 5 times, still I don't get what you say :/


>
>> Then, I don't see why do we need this dance... u8 flag attr resolves
>> this in elegant way. I know I sound like a broken record. This is the
>> last time I'm commenting on this. I give up.
>> 
>
>Why is a u8 better than u32 Jiri?
>The TLV space consumed is still 64 bits in both cases. If I define u8,
>u16, u32 - it is still 64 bits being alloced. If i use a u8 then i have
>24 bits which are pads - given current discussions I can never ever use
>again!

I don't care, use u8 or u32. Just 1 attr - 1 flag.


>
>If i keep 32 bits I am free to use those without ever changing the
>data structure (except for the restrictions to now make sure nothing
>gets set).

what datastructure? I'm confused.

>
>cheers,
>jamal

^ permalink raw reply

* Re: qed*: debug infrastructures
From: Florian Fainelli @ 2017-04-25 16:13 UTC (permalink / raw)
  To: Elior, Ariel, David Miller
  Cc: netdev@vger.kernel.org, Mintz, Yuval, Tayar, Tomer, Dupuis, Chad
In-Reply-To: <CY1PR0701MB1337FA26F1503FC9928B0340901F0@CY1PR0701MB1337.namprd07.prod.outlook.com>

On 04/24/2017 10:38 AM, Elior, Ariel wrote:
> Hi Dave,
> 
> According to the recent messages on the list indicating debugfs is not the way
> to go, I am looking for some guidance on what is. dpipe approach was
> mentioned as favorable, but I wanted to make sure molding our debug features to
> this infrastructure will result in something acceptable. A few points:
> 
> 1.
> One of our HW debug features is a signal recording feature. HW is configured to
> output specific signals, which are then continuously dumped into a cyclic
> buffer on host. There are ~8000 signals, which can be logically divided to ~100
> groups. I believe this can be modeled in dpipe (or similar tool) as a set of
> ~100 tables with ~80 entries each, and user would be able to see them all and
> choose what they like. The output data itself is binary, and meaningless to
> "the user". The amount of data is basically as large a contiguous buffer as
> driver can allocate, i.e. usually 4MB. When user selects the signals, and sets
> meta data regarding to mode of operations, some device configuration will have
> to take place. Does that sound reasonable?
> This debug feature already exists out of tree for bnx2x and qed* drivers and is
> *very* effective in field deployments. I would very much like to see this as an
> in-tree feature via some infrastructure or another.

By signals do you mean actual electrical signals coming from your chip's
pins/balls? FWIW, there is kind of something similar existing with ARM
Ltd's Embedded Trace Module which allows capturing a SoC's CPU activity
(loads/stores/pc progression etc.) using a high speed interface. You may
want to look at drivers/hwtracing/{coresight,stm}/ for examples on how
this interfaces with Linux' perf events subsystems. There are existing
trace buffers infrastructures and it sounds like your use case fits
nicely within the perf events framework here where you may want the
entire set or subset of these 8000 signals to be recorded in a buffer.


> 
> 2.
> Sometimes we want to debug the probe or removal flow of the driver. ethtool has
> the disadvantage of only being available once network device is available. If a
> bug stops the load flow before the ethtool debug paths are available, there is
> no way to collect a dump. Similarly, removal flows which hit a bug but do remove
> the network device, can't be debugged from ethtool. Does dpipe suffer from the
> same problem? qed* (like mlx*) has a common-functionality module. This allows
> creating debugfs nodes even before the network drivers are probed, providing a
> solution for this (debug nodes are also available after network driver removal).
> If dpipe does hold an answer here (e.g. provide preconfiguration which would
> kick in when network device registers) then we might want to port all of our
> register dump logic over there for this advantage. Does that sound reasonable?

Can you consider using tracing for that particular purpose with trace
filters that are specific to the device of interest? That should allow
you to get events down from the Linux device driver model all the way to
when your driver actually gets initialized. Is that an option?
-- 
Florian

^ permalink raw reply

* Re: IGMP on IPv6
From: Murali Karicheri @ 2017-04-25 16:16 UTC (permalink / raw)
  To: Cong Wang; +Cc: Hangbin Liu, open list:TI NETCP ETHERNET DRIVER, David Miller
In-Reply-To: <CAM_iQpWJ_oPSWV-PBeBGj5LoddMHTmrimLbViXLJ-VKR21e4mQ@mail.gmail.com>

On 04/18/2017 06:37 PM, Cong Wang wrote:
> On Tue, Apr 18, 2017 at 10:20 AM, Murali Karicheri <m-karicheri2@ti.com> wrote:
>> On 04/18/2017 01:12 PM, Murali Karicheri wrote:
>>> On 04/17/2017 05:38 PM, Cong Wang wrote:
>>>> Hello,
>>>>
>>>> On Thu, Apr 13, 2017 at 9:36 AM, Murali Karicheri <m-karicheri2@ti.com> wrote:
>>>>> On 03/22/2017 11:04 AM, Murali Karicheri wrote:
>>>>>> This is going directly to the slave Ethernet interface.
>>>>>>
>>>>>> When I put a WARN_ONCE, I found this is coming directly from
>>>>>> mld_ifc_timer_expire() -> mld_sendpack() -> ip6_output()
>>>>>>
>>>>>> Do you think this is fixed in latest kernel at master? If so, could
>>>>>> you point me to some commits.
>>>>>>
>>>>>>
>>>>> Ping... I see this behavior is also seen on v4.9.x Kernel. Any clue if
>>>>> this is fixed by some commit or I need to debug? I see IGMPv6 has some
>>>>> fixes on the list to make it similar to IGMPv4. So can someone clarify this is
>>>>> is a bug at IGMPv6 code or I need to look into the HSR driver code?
>>>>> Since IGMPv4 is going over the HSR interface I am assuming this is a
>>>>> bug in the IGMPv6 code. But since I have not experience with this code
>>>>> can some expert comment please?
>>>>>
>>>>
>>>> How did you configure your network interfaces and IPv4/IPv6 multicast?
>>>> IOW, how did you reproduce this? For example, did you change your
>>>> HSR setup when this happened since you mentioned
>>>> NETDEV_CHANGEUPPER?
>>>>
>>> Thanks for responding! Really appreciate.
>>>
>>> I didn't set up anything explicitly for IPv4/IPv6 multicast. As part of
>>> my testing, I dump the packets going through the slave interfaces attached
>>> to the hsr interface (for example my Ethernet interfaces eth2 and eth3
>>> are attached to the hsr interface and I dump the packets at the egress
>>> of eth2 and eth3 in my driver along with that at hsr xmit function). As
>>> soon as I create the hsr interface, I see a bunch of packets going directly
>>> through the lower interface, not through the upper one (i.e hsr interface)
>>> and these are of eth_type = 86 dd. Please ignore my reference to
>>> NETDEV_CHANGEUPPER for now as it was wild guess.
> 
> OK. Note: I know nothing about HSR, I assume it is similar to bonding
> in your case?
> 

Similar in the sense it glues two standard Ethernet interfaces and run
HSR protocol frames over it to support redundancy.

>>>
>>> I have not done any debugging, but the WARN_ONCE which I have placed
>>> in the lower level driver looking for eth_type = 86 dd provided the
>>> above trace.
>>>
>> Here is the command I have used to create the hsr interface...
>>
>> ip link add name hsr0 type hsr slave1 eth2 slave2 eth3 supervision 45 version 1
> 
> Did you assign IPv4 and IPv6 addresses to the HSR master device?

No. I just used IPv4. From the trace mld_ifc_timer_expire() -> mld_sendpack() -> ip6_output()
do you know what is it trying to do? Is it some neighbor discovery message or something
going over the lower interface instead of the hsr interface?

Murali

> 


-- 
Murali Karicheri
Linux Kernel, Keystone

^ permalink raw reply

* [PATCH v4 net] net: ipv6: regenerate host route if moved to gc list
From: David Ahern @ 2017-04-25 16:17 UTC (permalink / raw)
  To: netdev; +Cc: dvyukov, andreyknvl, mmanning, kafai, eric.dumazet, David Ahern

Taking down the loopback device wreaks havoc on IPv6 routing. By
extension, taking down a VRF device wreaks havoc on its table.

Dmitry and Andrey both reported heap out-of-bounds reports in the IPv6
FIB code while running syzkaller fuzzer. The root cause is a dead dst
that is on the garbage list gets reinserted into the IPv6 FIB. While on
the gc (or perhaps when it gets added to the gc list) the dst->next is
set to an IPv4 dst. A subsequent walk of the ipv6 tables causes the
out-of-bounds access.

Andrey's reproducer was the key to getting to the bottom of this.

With IPv6, host routes for an address have the dst->dev set to the
loopback device. When the 'lo' device is taken down, rt6_ifdown initiates
a walk of the fib evicting routes with the 'lo' device which means all
host routes are removed. That process moves the dst which is attached to
an inet6_ifaddr to the gc list and marks it as dead.

The recent change to keep global IPv6 addresses added a new function,
fixup_permanent_addr, that is called on admin up. That function restarts
dad for an inet6_ifaddr and when it completes the host route attached
to it is inserted into the fib. Since the route was marked dead and
moved to the gc list, re-inserting the route causes the reported
out-of-bounds accesses. If the device with the address is taken down
or the address is removed, the WARN_ON in fib6_del is triggered.

All of those faults are fixed by regenerating the host route if the
existing one has been moved to the gc list, something that can be
determined by checking if the rt6i_ref counter is 0.

Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional")
Reported-by: Dmitry Vyukov <dvyukov@google.com>
Reported-by: Andrey Konovalov <andreyknvl@google.com>
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
---
v4
- move 'prev = ifp->rt;' under spinlock as requested by Eric

v3
- removed 'if (prev)' and just call ip6_rt_put; added comment about spinlock

v2
- change ifp->rt under spinlock vs cmpxchg
- add comment about rt6i_ref == 0

 net/ipv6/addrconf.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 80ce478c4851..0ea96c4d334d 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3271,14 +3271,24 @@ static void addrconf_gre_config(struct net_device *dev)
 static int fixup_permanent_addr(struct inet6_dev *idev,
 				struct inet6_ifaddr *ifp)
 {
-	if (!ifp->rt) {
-		struct rt6_info *rt;
+	/* rt6i_ref == 0 means the host route was removed from the
+	 * FIB, for example, if 'lo' device is taken down. In that
+	 * case regenerate the host route.
+	 */
+	if (!ifp->rt || !atomic_read(&ifp->rt->rt6i_ref)) {
+		struct rt6_info *rt, *prev;

 		rt = addrconf_dst_alloc(idev, &ifp->addr, false);
 		if (unlikely(IS_ERR(rt)))
 			return PTR_ERR(rt);

+		/* ifp->rt can be accessed outside of rtnl */
+		spin_lock(&ifp->lock);
+		prev = ifp->rt;
 		ifp->rt = rt;
+		spin_unlock(&ifp->lock);
+
+		ip6_rt_put(prev);
 	}

 	if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
-- 
2.1.4

^ permalink raw reply related

* Re: [PATCH v4 net-next] mdio_bus: Issue GPIO RESET to PHYs.
From: Lars-Peter Clausen @ 2017-04-25 16:22 UTC (permalink / raw)
  To: Roger Quadros, Andrew Lunn
  Cc: davem, Florian Fainelli, tony, nsekhar, jsarha, netdev,
	linux-omap, linux-kernel
In-Reply-To: <551ee7a0-d153-3a36-e025-2c1f70f866b7@ti.com>

On 04/24/2017 11:04 AM, Roger Quadros wrote:
> On 24/04/17 02:35, Andrew Lunn wrote:
>> On Fri, Apr 21, 2017 at 03:31:09PM +0200, Lars-Peter Clausen wrote:
>>> On 04/21/2017 03:15 PM, Roger Quadros wrote:
>>>> diff --git a/Documentation/devicetree/bindings/net/mdio.txt b/Documentation/devicetree/bindings/net/mdio.txt
>>>> new file mode 100644
>>>> index 0000000..4ffbbac
>>>> --- /dev/null
>>>> +++ b/Documentation/devicetree/bindings/net/mdio.txt
>>>> @@ -0,0 +1,33 @@
>>>> +Common MDIO bus properties.
>>>> +
>>>> +These are generic properties that can apply to any MDIO bus.
>>>> +
>>>> +Optional properties:
>>>> +- reset-gpios: List of one or more GPIOs that control the RESET lines
>>>> +  of the PHYs on that MDIO bus.
>>>> +- reset-delay-us: RESET pulse width in microseconds as per PHY datasheet.
>>>> +
>>>> +A list of child nodes, one per device on the bus is expected. These
>>>> +should follow the generic phy.txt, or a device specific binding document.
>>>> +
>>>> +Example :
>>>> +This example shows these optional properties, plus other properties
>>>> +required for the TI Davinci MDIO driver.
>>>> +
>>>> +	davinci_mdio: ethernet@0x5c030000 {
>>>> +		compatible = "ti,davinci_mdio";
>>>> +		reg = <0x5c030000 0x1000>;
>>>> +		#address-cells = <1>;
>>>> +		#size-cells = <0>;
>>>> +
>>>> +		reset-gpios = <&gpio2 5 GPIO_ACTIVE_LOW>;
>>>> +		reset-delay-us = <2>;   /* PHY datasheet states 1us min */
>>>
>>> If this is the reset line of the PHY shouldn't it be a property of the PHY
>>> node rather than of the MDIO controller node (which might have a reset on
>>> its own)?
>>>> +
>>>> +		ethphy0: ethernet-phy@1 {
>>>> +			reg = <1>;
>>>> +		};
>>>> +
>>>> +		ethphy1: ethernet-phy@3 {
>>>> +			reg = <3>;
>>>> +		};
>>
>> Hi Lars-Peter
>>
>> We discussed this when the first proposal was made. There are two
>> cases, to consider.
>>
>> 1) Here, one GPIO line resets all PHYs on the same MDIO bus. In this
>> example, two PHYs.
>>
>> 2) There is one GPIO line per PHY. That is a separate case, and as you
>> say, the reset line should probably be considered a PHY property, not
>> an MDIO property. However, it can be messy, since in order to probe
>> the MDIO bus, you probably need to take the PHY out of reset.
>>

But the DT binding documentation says something else "List of one or more
GPIOs that control the RESET lines of the PHYs on that MDIO bus".

>> Anyway, this patch addresses the first case, so should be accepted. If
>> anybody wants to address the second case, they are free to do so.

I think we all know that that's not going to happen. Once there is a working
kludge there is no incentive to do a proper implementation anymore.


> Thanks for the explanation Andrew.
> 
> For the second case, even if the RESET GPIO property is specified
> in the PHY node, the RESET *will* have to be done by the MDIO bus driver
> else the PHY might not be probed at all.

I'm not arguing with that, just that the hardware description should be
truthful to the hardware topology and not to the software topology, i.e. the
implementation details of the Linux kernel in this case. Reset GPIOs are not
the only resource that is connected to the PHY that needs to be enabled
before they can be enumerated. E.g. clocks and regulators fall into the same
realm. And while you might argue that with a on-SoC phy controller node
there wont be any conflicts in regard to the reset-gpios property, this not
so very true for the clocks property.

And MDIO is not really special in this regard, other discoverable buses
(like USB, SDIO, ULPI) have the very same issue. Having a standardized
binding approach where the resources are declared as part as the child child
is preferable in my opinion.

> 
> Whether we need additional code to just to make the DT look prettier is
> questionable and if required can come as a separate patch.

Unfortunately not, once it is merged it can't be changed anymore.

^ permalink raw reply

* Re: Network cooling device and how to control NIC speed on thermal condition
From: Florian Fainelli @ 2017-04-25 16:23 UTC (permalink / raw)
  To: Waldemar Rymarkiewicz, netdev; +Cc: linux-kernel
In-Reply-To: <CAHKzcEMQPQzg6PYCSuv7Ufad+4AZ2gnqMgz3-VH5NTeb_pv4zQ@mail.gmail.com>

Hello,

On 04/25/2017 01:36 AM, Waldemar Rymarkiewicz wrote:
> Hi,
> 
> I am not much aware of linux networking architecture so I'd like to
> ask first before will start to dig into the code. Appreciate any
> feedback.
> 
> I am looking on Linux thermal framework and on how to cool down the
> system effectively when it hits thermal condition. Already existing
> cooling methods cpu_cooling and clock_cooling are good. However, I
> wanted to go further and dynamically control also a switch ports'
> speed based on thermal condition. Lowering speed means less power,
> less power means lower temp.
> 
> Is there any in-kernel interface to configure switch port/NIC from other driver?

Well, there is mostly under the form of notifiers though. For instance
there are lots of devices that do converged FCoE/RoCE/Ethernet that have
a two headed set of drivers, one for normal ethernet, and another one
for RDMA/IB for instance. To some extent stacked devices (VLAN, bond,
team, etc.) also call back down into their lower device, but in an
abstracted way, at the net_device level of course (layering).

> 
> Is there any mechanism to power save, when port/interface is not
> really used (not much or low data traffic), embedded in networking
> stack  or is it a task for NIC driver itself ?

The thing we did (currently out of tree) in the Starfighter 2 switch
driver (drivers/net/dsa/bcm_sf2.c) is that any time a port is brought
up/down (a port = a network device) we recalculate the switch core
clock, and we also resize the buffers and that yields to a little bit of
power savings  here and there. I don't recall the numbers from the top
of my head, but it was significant enough our HW designers convinced me
into doing it ;)

> 
> I was thinking to create net_cooling device similarly to cpu_cooling
> device which cool down the system scaling down cpu freq.  net_cooling
> could lower down interface speed (or tune more parameters to achieve
> ).  Do you thing could this work form networking stack perspective?

This sounds like a good idea, but it could be very tricky to get right,
because even if you can somehow throttle your transmit activity (since
the host is in control), you can't do that without being disruptive to
the receive path (or not as effectively).

Unlike any kind of host driven activity: CPU run queue, block devices,
USB etc. (SPI, I2C and so on when no using slave driven interrupts) you
cannot simply apply a "duty cycle" pattern where you turn on your HW
just enough of time that is needed for you to set it up for transfer,
signal transfer completion and go back to sleep. Networking needs to be
able to asynchronously receive packets in a way that is usually not
predictable although it could be for very specific workloads though.

Another thing is that there is still a fair amount of energy that needs
to be spent in maintaining the link, and the HW design may be entirely
clocked based on the link speed. Depending on the HW architecture (store
and forward, cut through etc.) there would still be a cost associated
with maintaining RAMs in a state where they are operational and so on.

You could imagine writing a queuing discipline driver that would
throttle transmission based on temperature sensors present in your NIC,
you could definitively do this in a way that is completely device driver
agnostic by using Linux's thermal framework trip point and temperature
notifications.

For reception, if you are okay with dropping some packets, you could
implement something similar, but chances are that your NIC would still
need to receive packets, be able to fully process them before SW drops
them, at which point, you have a myriad of solutions about how not to
process incoming traffic.

Hope this helps

> 
> Any pointers  to the code or a doc highly appreciated.
> 
> Thanks,
> /Waldek
> 

-- 
Florian

^ permalink raw reply

* Re: [PATCH v2] net/packet: initialize val in packet_getsockopt()
From: Alexander Potapenko @ 2017-04-25 16:27 UTC (permalink / raw)
  To: David Miller
  Cc: Dmitriy Vyukov, Kostya Serebryany, Eric Dumazet, Alexey Kuznetsov,
	LKML, Networking
In-Reply-To: <20170425.114433.143144279134920277.davem@davemloft.net>

On Tue, Apr 25, 2017 at 5:44 PM, David Miller <davem@davemloft.net> wrote:
> From: Alexander Potapenko <glider@google.com>
> Date: Mon, 24 Apr 2017 14:59:14 +0200
>
>> In the case getsockopt() is called with PACKET_HDRLEN and optlen < 4
>> |val| remains uninitialized and the syscall may behave differently
>> depending on its value. This doesn't have security consequences (as the
>> uninit bytes aren't copied back), but it's still cleaner to initialize
>> |val| and ensure optlen is not less than sizeof(int).
>>
>> This bug has been detected with KMSAN.
>>
>> Signed-off-by: Alexander Potapenko <glider@google.com>
>> ---
>> v2: - if len < sizeof(int), make it 0
>
> No, you should signal an error if the len is too small.
According to manpages, only setsockopt() may return EINVAL.
Is it ok to change the behavior of getsockopt() to return EINVAL in
this case? (I.e. won't we break existing users that don't expect it?)
> Returning zero bytes to userspace silently makes the user think that
> he got the data he asked for.



-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Matthew Scott Sucherman, Paul Terence Manicle
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

^ permalink raw reply

* Re: [PATCH v4 net-next] mdio_bus: Issue GPIO RESET to PHYs.
From: Florian Fainelli @ 2017-04-25 16:31 UTC (permalink / raw)
  To: Lars-Peter Clausen, Roger Quadros, Andrew Lunn
  Cc: davem, tony, nsekhar, jsarha, netdev, linux-omap, linux-kernel
In-Reply-To: <5954130c-c08c-1555-4d34-83307ed68d92@metafoo.de>

On 04/25/2017 09:22 AM, Lars-Peter Clausen wrote:
> On 04/24/2017 11:04 AM, Roger Quadros wrote:
>> On 24/04/17 02:35, Andrew Lunn wrote:
>>> On Fri, Apr 21, 2017 at 03:31:09PM +0200, Lars-Peter Clausen wrote:
>>>> On 04/21/2017 03:15 PM, Roger Quadros wrote:
>>>>> diff --git a/Documentation/devicetree/bindings/net/mdio.txt b/Documentation/devicetree/bindings/net/mdio.txt
>>>>> new file mode 100644
>>>>> index 0000000..4ffbbac
>>>>> --- /dev/null
>>>>> +++ b/Documentation/devicetree/bindings/net/mdio.txt
>>>>> @@ -0,0 +1,33 @@
>>>>> +Common MDIO bus properties.
>>>>> +
>>>>> +These are generic properties that can apply to any MDIO bus.
>>>>> +
>>>>> +Optional properties:
>>>>> +- reset-gpios: List of one or more GPIOs that control the RESET lines
>>>>> +  of the PHYs on that MDIO bus.
>>>>> +- reset-delay-us: RESET pulse width in microseconds as per PHY datasheet.
>>>>> +
>>>>> +A list of child nodes, one per device on the bus is expected. These
>>>>> +should follow the generic phy.txt, or a device specific binding document.
>>>>> +
>>>>> +Example :
>>>>> +This example shows these optional properties, plus other properties
>>>>> +required for the TI Davinci MDIO driver.
>>>>> +
>>>>> +	davinci_mdio: ethernet@0x5c030000 {
>>>>> +		compatible = "ti,davinci_mdio";
>>>>> +		reg = <0x5c030000 0x1000>;
>>>>> +		#address-cells = <1>;
>>>>> +		#size-cells = <0>;
>>>>> +
>>>>> +		reset-gpios = <&gpio2 5 GPIO_ACTIVE_LOW>;
>>>>> +		reset-delay-us = <2>;   /* PHY datasheet states 1us min */
>>>>
>>>> If this is the reset line of the PHY shouldn't it be a property of the PHY
>>>> node rather than of the MDIO controller node (which might have a reset on
>>>> its own)?
>>>>> +
>>>>> +		ethphy0: ethernet-phy@1 {
>>>>> +			reg = <1>;
>>>>> +		};
>>>>> +
>>>>> +		ethphy1: ethernet-phy@3 {
>>>>> +			reg = <3>;
>>>>> +		};
>>>
>>> Hi Lars-Peter
>>>
>>> We discussed this when the first proposal was made. There are two
>>> cases, to consider.
>>>
>>> 1) Here, one GPIO line resets all PHYs on the same MDIO bus. In this
>>> example, two PHYs.
>>>
>>> 2) There is one GPIO line per PHY. That is a separate case, and as you
>>> say, the reset line should probably be considered a PHY property, not
>>> an MDIO property. However, it can be messy, since in order to probe
>>> the MDIO bus, you probably need to take the PHY out of reset.
>>>
> 
> But the DT binding documentation says something else "List of one or more
> GPIOs that control the RESET lines of the PHYs on that MDIO bus".

I agree, it should be defined more strictly as:

"One GPIO that controls the reset line of *all* PHYs populated on that
MDIO bus"

If there are separate lines, these automatically become properties of
the PHY nodes.

> 
>>> Anyway, this patch addresses the first case, so should be accepted. If
>>> anybody wants to address the second case, they are free to do so.
> 
> I think we all know that that's not going to happen. Once there is a working
> kludge there is no incentive to do a proper implementation anymore.
> 
> 
>> Thanks for the explanation Andrew.
>>
>> For the second case, even if the RESET GPIO property is specified
>> in the PHY node, the RESET *will* have to be done by the MDIO bus driver
>> else the PHY might not be probed at all.
> 
> I'm not arguing with that, just that the hardware description should be
> truthful to the hardware topology and not to the software topology, i.e. the
> implementation details of the Linux kernel in this case. Reset GPIOs are not
> the only resource that is connected to the PHY that needs to be enabled
> before they can be enumerated. E.g. clocks and regulators fall into the same
> realm. And while you might argue that with a on-SoC phy controller node
> there wont be any conflicts in regard to the reset-gpios property, this not
> so very true for the clocks property.

Agreed, but with the exception of the unfortunate choice of words here
(single vs. multiple) there is not a really a divergence in how the
shared reset line is represented compared to other similar control
busses, is there?

> 
> And MDIO is not really special in this regard, other discoverable buses
> (like USB, SDIO, ULPI) have the very same issue. Having a standardized
> binding approach where the resources are declared as part as the child child
> is preferable in my opinion.
> 
>>
>> Whether we need additional code to just to make the DT look prettier is
>> questionable and if required can come as a separate patch.
> 
> Unfortunately not, once it is merged it can't be changed anymore.

There are no in tree users yet, so let's get the different things fixed
right now.
-- 
Florian

^ permalink raw reply

* Re: [PATCH v2] net/packet: initialize val in packet_getsockopt()
From: David Miller @ 2017-04-25 16:32 UTC (permalink / raw)
  To: glider; +Cc: dvyukov, kcc, edumazet, kuznet, linux-kernel, netdev
In-Reply-To: <CAG_fn=URNuXxc7wEaXd_p7B46RRCYYMTv5rOVc_U1CdvervzPQ@mail.gmail.com>

From: Alexander Potapenko <glider@google.com>
Date: Tue, 25 Apr 2017 18:27:04 +0200

> On Tue, Apr 25, 2017 at 5:44 PM, David Miller <davem@davemloft.net> wrote:
>> From: Alexander Potapenko <glider@google.com>
>> Date: Mon, 24 Apr 2017 14:59:14 +0200
>>
>>> In the case getsockopt() is called with PACKET_HDRLEN and optlen < 4
>>> |val| remains uninitialized and the syscall may behave differently
>>> depending on its value. This doesn't have security consequences (as the
>>> uninit bytes aren't copied back), but it's still cleaner to initialize
>>> |val| and ensure optlen is not less than sizeof(int).
>>>
>>> This bug has been detected with KMSAN.
>>>
>>> Signed-off-by: Alexander Potapenko <glider@google.com>
>>> ---
>>> v2: - if len < sizeof(int), make it 0
>>
>> No, you should signal an error if the len is too small.
> According to manpages, only setsockopt() may return EINVAL.
> Is it ok to change the behavior of getsockopt() to return EINVAL in
> this case? (I.e. won't we break existing users that don't expect it?)

They are currently getting corrupt data depending upon the endianness,
so -EINVAL is a serious improvement.

^ permalink raw reply

* Re: [PATCH v3 net] net: ipv6: regenerate host route if moved to gc list
From: Andrey Konovalov @ 2017-04-25 16:35 UTC (permalink / raw)
  To: David Ahern; +Cc: netdev, Dmitry Vyukov, mmanning, Martin KaFai Lau
In-Reply-To: <c86e9056-151c-6b9d-9475-99158de0874b@cumulusnetworks.com>

On Tue, Apr 25, 2017 at 5:54 PM, David Ahern <dsa@cumulusnetworks.com> wrote:
> On 4/25/17 6:50 AM, Andrey Konovalov wrote:
>> I've been running syzkaller with your patch and got another report
>> from ip6_pol_route.
>
> In general the existing patch cleans up all of the ipv6 fib kasan and
> WARN_ON traces that were seen?

Before applying your patch I was hitting reports related to fib6 all the time.
I've stopped seeing them for some time after I applied your patch.
However today another one was triggered (the one I sent above).

>
>
>>
>> It happened only once so far and I couldn't reproduce it.
>
> Some similarity in the sense of a ipv4 route in the ipv6 fib. That's
> only going to happen if it hits the dst gc list and then back.
>
> This duplicates what Dmitry reported on March 3rd.

^ permalink raw reply

* Re: [PATCH] macsec: dynamically allocate space for sglist
From: Sabrina Dubroca @ 2017-04-25 16:36 UTC (permalink / raw)
  To: Jason A. Donenfeld; +Cc: Netdev, LKML, David Miller, stable, security
In-Reply-To: <20170425152300.3830-1-Jason@zx2c4.com>

2017-04-25, 17:23:00 +0200, Jason A. Donenfeld wrote:
> We call skb_cow_data, which is good anyway to ensure we can actually
> modify the skb as such (another error from prior). Now that we have the
> number of fragments required, we can safely allocate exactly that amount
> of memory.
> 
> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
> Cc: Sabrina Dubroca <sd@queasysnail.net>
> Cc: security@kernel.org
> Cc: stable@vger.kernel.org
> ---
>  drivers/net/macsec.c | 25 ++++++++++++++++++++-----
>  1 file changed, 20 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
> index dbab05afcdbe..56dafdee4c9c 100644
> --- a/drivers/net/macsec.c
> +++ b/drivers/net/macsec.c
[...]
> @@ -917,6 +926,7 @@ static struct sk_buff *macsec_decrypt(struct sk_buff *skb,
>  {
>  	int ret;
>  	struct scatterlist *sg;
> +	struct sk_buff *trailer;
>  	unsigned char *iv;
>  	struct aead_request *req;
>  	struct macsec_eth_header *hdr;
> @@ -927,7 +937,12 @@ static struct sk_buff *macsec_decrypt(struct sk_buff *skb,
>  	if (!skb)
>  		return ERR_PTR(-ENOMEM);
>  
> -	req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg);
> +	ret = skb_cow_data(skb, 0, &trailer);
> +	if (unlikely(ret < 0)) {
> +		kfree_skb(skb);
> +		return ERR_PTR(ret);
> +	}
> +	req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg, ret);
>  	if (!req) {
>  		kfree_skb(skb);
>  		return ERR_PTR(-ENOMEM);

There's a problem here (and in macsec_encrypt): you need to update the
call to sg_init_table, like I did in my patch.  Otherwise,
sg_init_table() is going to access sg[MAX_SKB_FRAGS], which may be
past what you allocated.

How did you test this? ;)

-- 
Sabrina

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox