Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH V2 net-next 0/6] virtio-net:  Add SCTP checksum offload support
From: Vladislav Yasevich @ 2018-05-02  2:07 UTC (permalink / raw)
  To: netdev
  Cc: linux-sctp, virtualization, virtio-dev, mst, jasowang, nhorman,
	marcelo.leitner, Vladislav Yasevich

Now that we have SCTP offload capabilities in the kernel, we can add
them to virtio as well.  First step is SCTP checksum.

We need a new feature in virtio to negotiate this support since
SCTP is excluded with the standard checksum and requires a little
bit extra.  This series proposes the VIRTIO_NET_F_SCTP_CSUM feature bit.

As the "little bit extra",  the kernel uses a new bit in the skb
(skb->csum_not_inet) to determine whether to use standard inet checksum
or the SCTP CRC32c checksum.  This bit has to be communicated between
the host and the guest.  This bit is carried in the vnet header.

Tap and macvtap support is added through an extra feature for the
TUNSETOFFLOAD ioctl.  Additionally macvtap will now correctly
do sctp checksumming if the receiver doesn't support SCTP offload.
This also turns on sctp offloading for macvlan devices.

As for the perf numbers, I am seeing about a 5% increase in vm-to-vm
and vm-to-host throughput which is the same as manually disabling
sctp checksumming, since this is exactly what we are emulating.
Sending outside the host, the increase is about 2.5-3%.

Since v1:
  - Fixed some old patch bugs that snuck in.
  - virtio feature bits moved to high bits
  - distinguish between GUEST and HOST features
  - Fixed offload control command to control sctp checksum offload
  - added ipvlan support

Vladislav Yasevich (6):
  virtio: Add support for SCTP checksum offloading
  sctp: Handle sctp packets with CHECKSUM_PARTIAL
  sctp: Build sctp offload support into the base kernel
  tun: Add support for SCTP checksum offload
  macvlan/macvtap: Add support for SCTP checksum offload.
  ipvlan: Add support for SCTP checksum offload

 drivers/net/ipvlan/ipvlan_main.c |  3 ++-
 drivers/net/macvlan.c            |  5 +++--
 drivers/net/tap.c                |  8 +++++---
 drivers/net/tun.c                |  7 ++++++-
 drivers/net/virtio_net.c         | 14 +++++++++++++-
 include/linux/virtio_net.h       |  6 ++++++
 include/net/sctp/sctp.h          |  5 -----
 include/uapi/linux/if_tun.h      |  1 +
 include/uapi/linux/virtio_net.h  |  5 +++++
 net/Kconfig                      |  1 +
 net/sctp/Kconfig                 |  1 -
 net/sctp/Makefile                |  3 ++-
 net/sctp/input.c                 | 11 ++++++++++-
 net/sctp/offload.c               |  4 +++-
 net/sctp/protocol.c              |  3 ---
 15 files changed, 57 insertions(+), 20 deletions(-)

-- 
2.9.5

^ permalink raw reply

* [PATCH V2 net-next 3/6] sctp: Build sctp offload support into the base kernel
From: Vladislav Yasevich @ 2018-05-02  2:07 UTC (permalink / raw)
  To: netdev
  Cc: virtio-dev, marcelo.leitner, nhorman, mst, virtualization,
	linux-sctp
In-Reply-To: <20180502020739.19239-1-vyasevic@redhat.com>

With the SCTP checksum offload support added to virtio, it is
now possible to get into a situation where SCTP is not present
in the kernel, but the feature is negotiated.  Handle this just
like we do IPv6 and other modular offloads.
Move the sctp offload out of the sctp module and into the base
kernel.

Signed-off-by: Vladislav Yasevich <vyasevic@redhat.com>
---
 include/net/sctp/sctp.h | 5 -----
 net/Kconfig             | 1 +
 net/sctp/Kconfig        | 1 -
 net/sctp/Makefile       | 3 ++-
 net/sctp/offload.c      | 4 +++-
 net/sctp/protocol.c     | 3 ---
 6 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 72c5b8f..625b45f 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -183,11 +183,6 @@ struct sctp_transport *sctp_epaddr_lookup_transport(
 int __net_init sctp_proc_init(struct net *net);
 
 /*
- * sctp/offload.c
- */
-int sctp_offload_init(void);
-
-/*
  * sctp/stream_sched.c
  */
 void sctp_sched_ops_init(void);
diff --git a/net/Kconfig b/net/Kconfig
index 0428f12..2773f98 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -64,6 +64,7 @@ config INET
 	bool "TCP/IP networking"
 	select CRYPTO
 	select CRYPTO_AES
+	select LIBCRC32C
 	---help---
 	  These are the protocols used on the Internet and on most local
 	  Ethernets. It is highly recommended to say Y here (this will enlarge
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index c740b18..d07477a 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -9,7 +9,6 @@ menuconfig IP_SCTP
 	select CRYPTO
 	select CRYPTO_HMAC
 	select CRYPTO_SHA1
-	select LIBCRC32C
 	---help---
 	  Stream Control Transmission Protocol
 
diff --git a/net/sctp/Makefile b/net/sctp/Makefile
index e845e45..ee206ca 100644
--- a/net/sctp/Makefile
+++ b/net/sctp/Makefile
@@ -5,6 +5,7 @@
 
 obj-$(CONFIG_IP_SCTP) += sctp.o
 obj-$(CONFIG_INET_SCTP_DIAG) += sctp_diag.o
+obj-$(CONFIG_INET) += offload.o
 
 sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  protocol.o endpointola.o associola.o \
@@ -12,7 +13,7 @@ sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \
 	  inqueue.o outqueue.o ulpqueue.o \
 	  tsnmap.o bind_addr.o socket.o primitive.o \
 	  output.o input.o debug.o stream.o auth.o \
-	  offload.o stream_sched.o stream_sched_prio.o \
+	  stream_sched.o stream_sched_prio.o \
 	  stream_sched_rr.o stream_interleave.o
 
 sctp_diag-y := diag.o
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 123e9f2..c61cbde 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -107,7 +107,7 @@ static const struct skb_checksum_ops crc32c_csum_ops = {
 	.combine = sctp_csum_combine,
 };
 
-int __init sctp_offload_init(void)
+static int __init sctp_offload_init(void)
 {
 	int ret;
 
@@ -127,3 +127,5 @@ int __init sctp_offload_init(void)
 out:
 	return ret;
 }
+
+fs_initcall(sctp_offload_init);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index a24cde2..46d2b63 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1479,9 +1479,6 @@ static __init int sctp_init(void)
 	if (status)
 		goto err_v6_add_protocol;
 
-	if (sctp_offload_init() < 0)
-		pr_crit("%s: Cannot add SCTP protocol offload\n", __func__);
-
 out:
 	return status;
 err_v6_add_protocol:
-- 
2.9.5

^ permalink raw reply related

* Re: Suggestions on iterating eBPF maps
From: Lorenzo Colitti @ 2018-05-02  2:05 UTC (permalink / raw)
  To: Alexei Starovoitov; +Cc: Chenbo Feng, netdev, Daniel Borkmann, Joel Fernandes
In-Reply-To: <20180428010439.qryq3ejdyyhtb25u@ast-mbp>

On Sat, Apr 28, 2018 at 10:04 AM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> Another approach could be to use map-in-map and have almost atomic
> replace of the whole map with new potentially empty map. The prog
> can continue using the new map, while user space walks no longer
> accessed old map.

That sounds like a promising approach. I assume this would be
functionally equivalent to an approach where there is a map containing
a boolean that says whether to write to map A or map B? We'd then do
the following:

0. Kernel program is writing to map A.
1. Userspace pushes config that says to write to map B.
2. Kernel program starts to write to map B.
3. Userspace scans map A, collecting stats and deleting everything it finds.

One problem with this is: if the effects of #1 are not immediately
visible to the programs running on all cores, the program could still
be writing to map A and the deletes in #3 would result in loss of
data. Are there any guarantees around this? I know that hash map
writes are atomic, but I'm not aware of any other guarantees here. Are
there memory barriers around map writes and reads?

In the absence of guarantees, userspace could put a sleep between #1
and #3 and things would be correct Most Of The Time(TM), but if the
kernel is busy doing other things that might not be sufficient.
Thoughts?

^ permalink raw reply

* [PATCH] net/xfrm: Fix lookups for states with spi == 0
From: Dmitry Safonov @ 2018-05-02  2:02 UTC (permalink / raw)
  To: linux-kernel
  Cc: 0x7f454c46, Dmitry Safonov, Steffen Klassert, Herbert Xu,
	David S. Miller, netdev

It seems to be a valid use case to add xfrm state without
Security Parameter Indexes (SPI) value associated:
ip xfrm state add src $src dst $dst proto $proto mode $mode sel src $src dst $dst $algo

The bad thing is that it's currently impossible to get/delete the state
without SPI: __xfrm_state_insert() obviously doesn't add hash for zero
SPI in xfrm.state_byspi, and xfrm_user_state_lookup() will fail as
xfrm_state_lookup() does lookups by hash.

It also isn't possible to work around from userspace as
xfrm_id_proto_match() will be always true for ah/esp/comp protos.

So, don't try looking up by hash if SPI == 0.

Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: netdev@vger.kernel.org
Signed-off-by: Dmitry Safonov <dima@arista.com>
---
 net/xfrm/xfrm_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 080035f056d9..6b38503255c8 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -681,7 +681,7 @@ static struct xfrm_state *xfrm_user_state_lookup(struct net *net,
 	int err;
 	u32 mark = xfrm_mark_get(attrs, &m);
 
-	if (xfrm_id_proto_match(p->proto, IPSEC_PROTO_ANY)) {
+	if (xfrm_id_proto_match(p->proto, IPSEC_PROTO_ANY) && p->spi) {
 		err = -ESRCH;
 		x = xfrm_state_lookup(net, mark, &p->daddr, p->spi, p->proto, p->family);
 	} else {
-- 
2.13.6

^ permalink raw reply related

* linux-next: manual merge of the net-next tree with the net tree
From: Stephen Rothwell @ 2018-05-02  1:52 UTC (permalink / raw)
  To: David Miller, Networking
  Cc: Linux-Next Mailing List, Linux Kernel Mailing List, Thomas Winter,
	David Ahern

[-- Attachment #1: Type: text/plain, Size: 1445 bytes --]

Hi all,

Today's linux-next merge of the net-next tree got a conflict in:

  include/net/ip6_route.h

between commit:

  edd7ceb78296 ("ipv6: Allow non-gateway ECMP for IPv6")

from the net tree and commit:

  93c2fb253d17 ("net/ipv6: Rename fib6_info struct elements")

from the net-next tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc include/net/ip6_route.h
index abceb5864d99,8df4ff798b04..000000000000
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@@ -66,9 -66,10 +66,9 @@@ static inline bool rt6_need_strict(cons
  		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
  }
  
- static inline bool rt6_qualify_for_ecmp(const struct rt6_info *rt)
+ static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
  {
- 	return (rt->rt6i_flags & (RTF_ADDRCONF | RTF_DYNAMIC)) == 0;
 -	return (f6i->fib6_flags & (RTF_GATEWAY|RTF_ADDRCONF|RTF_DYNAMIC)) ==
 -	       RTF_GATEWAY;
++	return (f6i->fib6_flags & (RTF_ADDRCONF | RTF_DYNAMIC)) == 0;
  }
  
  void ip6_route_input(struct sk_buff *skb);

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [PATCH net] tcp_bbr: fix to zero idle_restart only upon S/ACKed data
From: Neal Cardwell @ 2018-05-02  1:45 UTC (permalink / raw)
  To: davem, netdev, ncardwell, ycheng, soheil, priyarjha, ysseung
  Cc: netdev, Neal Cardwell, Yuchung Cheng, Soheil Hassas Yeganeh,
	Priyaranjan Jha, Yousuk Seung

Previously the bbr->idle_restart tracking was zeroing out the
bbr->idle_restart bit upon ACKs that did not SACK or ACK anything,
e.g. receiving incoming data or receiver window updates. In such
situations BBR would forget that this was a restart-from-idle
situation, and if the min_rtt had expired it would unnecessarily enter
PROBE_RTT (even though we were actually restarting from idle but had
merely forgotten that fact).

The fix is simple: we need to remember we are restarting from idle
until we receive a S/ACK for some data (a S/ACK for the first flight
of data we send as we are restarting).

This commit is a stable candidate for kernels back as far as 4.9.

Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control")
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Priyaranjan Jha <priyarjha@google.com>
Signed-off-by: Yousuk Seung <ysseung@google.com>
---
 net/ipv4/tcp_bbr.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 158d105e76da1..58e2f479ffb4d 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -806,7 +806,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs)
 			}
 		}
 	}
-	bbr->idle_restart = 0;
+	/* Restart after idle ends only once we process a new S/ACK for data */
+	if (rs->delivered > 0)
+		bbr->idle_restart = 0;
 }

 static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
-- 
2.17.0.441.gb46fe60e1d-goog

^ permalink raw reply related

* [v2 PATCH 1/1] tg3: fix meaningless hw_stats reading after tg3_halt memset 0 hw_stats
From: Zumeng Chen @ 2018-05-02  0:42 UTC (permalink / raw)
  To: netdev, linux-kernel
  Cc: michael.chan, siva.kallam, prashant, davem, Zumeng Chen,
	Zumeng Chen

From: Zumeng Chen <zumeng.chen@windriver.com>

Reading hw_stats will get the actual data after a successful tg3_reset_hw,
which is actually after tg3_timer_start, so TG3_FLAG_HALT is introduced to
tell tg3_get_stats64 when hw_stats is ready to read. It will be set after
having done memset(tp->hw_stats, 0) in tg3_halt and be cleared when hw_init
is done. Both tg3_get_stats64 and tg3_halt are protected by tp->lock in all
scope.

Meanwhile, this patch is also to fix a kernel BUG_ON(in_interrupt) crash
when tg3_free_consistent is stuck in tp->lock, which might get a lot of
in_softirq counts(512 or so), and BUG_ON when vunmap to unmap hw->stats.

------------[ cut here ]------------
kernel BUG at /kernel-source//mm/vmalloc.c:1621!
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
task: ffffffc874310000 task.stack: ffffffc8742bc000
PC is at vunmap+0x48/0x50
LR is at __dma_free+0x98/0xa0
pc : [<ffffff80081eb420>] lr : [<ffffff8008097fb8>] pstate: 00000145
sp : ffffffc8742bfad0
x29: ffffffc8742bfad0 x28: ffffffc874310000
x27: ffffffc878931200 x26: ffffffc874530000
x25: 0000000000000003 x24: ffffff800b3aa000
x23: 00000000700bb000 x22: 0000000000000000
x21: 0000000000000000 x20: ffffffc87aafd0a0
x19: ffffff800b3aa000 x18: 0000000000000020
x17: 0000007f9e191e10 x16: ffffff8008eb0d28
x15: 000000000000000a x14: 0000000000070cc8
x13: ffffff8008c65000 x12: 00000000ffffffff
x11: 000000000000000a x10: ffffffbf21d0e220
x9 : 0000000000000004 x8 : ffffff8008c65000
x7 : 0000000000003ff0 x6 : 0000000000000000
x5 : ffffff8008097f20 x4 : 0000000000000000
x3 : ffffff8008fd4fff x2 : ffffffc87b361788
x1 : ffffff800b3aafff x0 : 0000000000000201
Process connmand (pid: 785, stack limit = 0xffffffc8742bc000)
Stack: (0xffffffc8742bfad0 to 0xffffffc8742c0000)
fac0:                                   ffffffc8742bfaf0 ffffff8008097fb8
fae0: 0000000000001000 ffffff80ffffffff ffffffc8742bfb30 ffffff8000b717d4
fb00: ffffffc87aafd0a0 ffffff8008a38000 ffffff800b3aa000 ffffffc874530904
fb20: ffffffc874530900 00000000700bb000 ffffffc8742bfb80 ffffff8000b80324
fb40: 0000000000000001 ffffffc874530900 0000000000000100 0000000000000200
fb60: 0000000000009003 ffffffc874530000 0000000000000003 ffffffc874530000
fb80: ffffffc8742bfbd0 ffffff8000b8aa5c ffffffc874530900 ffffffc874530000
fba0: 00000000ffff0001 0000000000000000 0000000000009003 ffffffc878931210
fbc0: 0000000000009002 ffffffc874530000 ffffffc8742bfc00 ffffff80088bf44c
fbe0: ffffffc874530000 ffffffc8742bfc50 00000000ffff0001 ffffffc874310000
fc00: ffffffc8742bfc30 ffffff80088bf5e4 ffffffc874530000 00000000ffff9002
fc20: ffffffc8742bfc40 ffffffc874530000 ffffffc8742bfc60 ffffff80088c9d58
fc40: ffffffc874530000 ffffff80088c9d34 ffffffc874530080 ffffffc874530080
fc60: ffffffc8742bfca0 ffffff80088c9e4c ffffffc874530000 0000000000009003
fc80: 0000000000008914 0000000000000000 0000007ffd94ba10 ffffffc8742bfd38
fca0: ffffffc8742bfcd0 ffffff80089509f8 0000000000000000 00000000ffffff9d
fcc0: 0000000000008914 0000000000000000 ffffffc8742bfd60 ffffff8008953088
fce0: 0000000000008914 ffffffc874b49b80 0000007ffd94ba10 ffffff8008e9b400
fd00: 0000000000000004 0000007ffd94ba10 0000000000000124 000000000000001d
fd20: ffffff8008a32000 ffffff8008e9b400 0000000000000004 0000000034687465
fd40: 0000000000000000 0000000000009002 0000000000000000 0000000000000000
fd60: ffffffc8742bfd90 ffffff80088a1720 ffffffc874b49b80 0000000000008914
fd80: 0000007ffd94ba10 0000000000000000 ffffffc8742bfdc0 ffffff80088a2648
fda0: 0000000000008914 0000007ffd94ba10 ffffff8008e9b400 ffffffc878a73c00
fdc0: ffffffc8742bfe00 ffffff800822e9e0 0000000000008914 0000007ffd94ba10
fde0: ffffffc874b49bb0 ffffffc8747e5800 ffffffc8742bfe50 ffffff800823cd58
fe00: ffffffc8742bfe80 ffffff800822f0ec 0000000000000000 ffffffc878a73c00
fe20: ffffffc878a73c00 0000000000000004 0000000000008914 0000000000080000
fe40: ffffffc8742bfe80 ffffff800822f0b0 0000000000000000 ffffffc878a73c00
fe60: ffffffc878a73c00 0000000000000004 0000000000008914 ffffff8008083730
fe80: 0000000000000000 ffffff8008083730 0000000000000000 00000048771fb000
fea0: ffffffffffffffff 0000007f9e191e1c 0000000000000000 0000000000000015
fec0: 0000000000000004 0000000000008914 0000007ffd94ba10 0000000000000000
fee0: 000000000000002f 0000000000000004 0000000000000010 0000000000000000
ff00: 000000000000001d 0000000fffffffff 0101010101010101 0000000000000000
ff20: 6532336338646634 00656c6261635f38 0000007f9e46a220 0000007f9e45f318
ff40: 00000000004c1a58 0000007f9e191e10 00000000000006df 0000000000000000
ff60: 0000000000000004 00000000004c6470 00000000004c3c40 0000000000512d20
ff80: 0000000000000001 0000000000000000 0000000000000000 0000000000000000
ffa0: 0000000000000000 0000007ffd94b9f0 0000000000463dec 0000007ffd94b9f0
ffc0: 0000007f9e191e1c 0000000000000000 0000000000000004 000000000000001d
ffe0: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
Call trace:
Exception stack(0xffffffc8742bf900 to 0xffffffc8742bfa30)
f900: ffffff800b3aa000 0000008000000000 ffffffc8742bfad0 ffffff80081eb420
f920: ffffff8000000000 ffffff80081a58ec ffffffc8742bf940 ffffff80081c3ea8
f940: ffffffc8742bf990 ffffff80081a591c ffffffc8742bf970 ffffff8008a1d89c
f960: ffffff8008eb1780 ffffff8008eb1780 ffffffc8742bf990 ffffff80081a59dc
f980: ffffffbf21c4ae00 ffffffbf00000000 ffffffc8742bfa20 ffffff80081a5db4
f9a0: 0000000000000201 ffffff800b3aafff ffffffc87b361788 ffffff8008fd4fff
f9c0: 0000000000000000 ffffff8008097f20 0000000000000000 0000000000003ff0
f9e0: ffffff8008c65000 0000000000000004 ffffffbf21d0e220 000000000000000a
fa00: 00000000ffffffff ffffff8008c65000 0000000000070cc8 000000000000000a
fa20: ffffff8008eb0d28 0000007f9e191e10
[<ffffff80081eb420>] vunmap+0x48/0x50
[<ffffff8008097fb8>] __dma_free+0x98/0xa0
[<ffffff8000b717d4>] tg3_free_consistent+0x14c/0x190 [tg3]
[<ffffff8000b80324>] tg3_stop+0x204/0x238 [tg3]
[<ffffff8000b8aa5c>] tg3_close+0x34/0x98 [tg3]
[<ffffff80088bf44c>] __dev_close_many+0x94/0xe8
[<ffffff80088bf5e4>] __dev_close+0x34/0x50
[<ffffff80088c9d58>] __dev_change_flags+0xa0/0x160
[<ffffff80088c9e4c>] dev_change_flags+0x34/0x70
[<ffffff80089509f8>] devinet_ioctl+0x740/0x808
[<ffffff8008953088>] inet_ioctl+0x140/0x158
[<ffffff80088a1720>] sock_do_ioctl+0x40/0x88
[<ffffff80088a2648>] sock_ioctl+0x238/0x368
[<ffffff800822e9e0>] do_vfs_ioctl+0xb0/0x730
[<ffffff800822f0ec>] SyS_ioctl+0x8c/0xa8
[<ffffff8008083730>] el0_svc_naked+0x24/0x28
Code: f9400bf3 a8c27bfd d65f03c0 d503201f (d4210000)
---[ end trace e214990b7cc445ce ]---
Kernel panic - not syncing: Fatal exception in interrupt

Signed-off-by: Zumeng Chen <zumeng.chen@gmail.com>
---

V2 changes:

As Michael mentioned, use TG3_FLAG_HALT to flag the interface down.
Sanity tests include building and runtime, passed.

Cheers,
Zumeng 

 drivers/net/ethernet/broadcom/tg3.c | 13 ++++++++-----
 drivers/net/ethernet/broadcom/tg3.h |  1 +
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 537d571..690cdc6 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -8723,14 +8723,11 @@ static void tg3_free_consistent(struct tg3 *tp)
 	tg3_mem_rx_release(tp);
 	tg3_mem_tx_release(tp);
 
-	/* Protect tg3_get_stats64() from reading freed tp->hw_stats. */
-	tg3_full_lock(tp, 0);
 	if (tp->hw_stats) {
 		dma_free_coherent(&tp->pdev->dev, sizeof(struct tg3_hw_stats),
 				  tp->hw_stats, tp->stats_mapping);
 		tp->hw_stats = NULL;
 	}
-	tg3_full_unlock(tp);
 }
 
 /*
@@ -9334,6 +9331,7 @@ static int tg3_halt(struct tg3 *tp, int kind, bool silent)
 
 		/* And make sure the next sample is new data */
 		memset(tp->hw_stats, 0, sizeof(struct tg3_hw_stats));
+		tg3_flag_set(tp, HALT);
 	}
 
 	return err;
@@ -10732,6 +10730,7 @@ static int tg3_reset_hw(struct tg3 *tp, bool reset_phy)
  */
 static int tg3_init_hw(struct tg3 *tp, bool reset_phy)
 {
+	int retval;
 	/* Chip may have been just powered on. If so, the boot code may still
 	 * be running initialization. Wait for it to finish to avoid races in
 	 * accessing the hardware.
@@ -10743,7 +10742,11 @@ static int tg3_init_hw(struct tg3 *tp, bool reset_phy)
 
 	tw32(TG3PCI_MEM_WIN_BASE_ADDR, 0);
 
-	return tg3_reset_hw(tp, reset_phy);
+	retval = tg3_reset_hw(tp, reset_phy);
+	if (retval == 0)
+		tg3_flag_clear(tp, HALT);
+
+	return retval;
 }
 
 #ifdef CONFIG_TIGON3_HWMON
@@ -14155,7 +14158,7 @@ static void tg3_get_stats64(struct net_device *dev,
 	struct tg3 *tp = netdev_priv(dev);
 
 	spin_lock_bh(&tp->lock);
-	if (!tp->hw_stats) {
+	if (!tp->hw_stats || tg3_flag(tp, HALT)) {
 		*stats = tp->net_stats_prev;
 		spin_unlock_bh(&tp->lock);
 		return;
diff --git a/drivers/net/ethernet/broadcom/tg3.h b/drivers/net/ethernet/broadcom/tg3.h
index 3b5e98e..c61d83c 100644
--- a/drivers/net/ethernet/broadcom/tg3.h
+++ b/drivers/net/ethernet/broadcom/tg3.h
@@ -3102,6 +3102,7 @@ enum TG3_FLAGS {
 	TG3_FLAG_ROBOSWITCH,
 	TG3_FLAG_ONE_DMA_AT_ONCE,
 	TG3_FLAG_RGMII_MODE,
+	TG3_FLAG_HALT,
 
 	/* Add new flags before this comment and TG3_FLAG_NUMBER_OF_FLAGS */
 	TG3_FLAG_NUMBER_OF_FLAGS,	/* Last entry in enum TG3_FLAGS */
-- 
2.9.3

^ permalink raw reply related

* Re: [PATCH] kvmalloc: always use vmalloc if CONFIG_DEBUG_VM
From: Andrew Morton @ 2018-05-02  0:36 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: dm-devel, eric.dumazet, mst, netdev, linux-kernel, Matthew Wilcox,
	Michal Hocko, linux-mm, edumazet, virtualization, David Miller,
	Vlastimil Babka
In-Reply-To: <alpine.LRH.2.02.1804241229410.23702@file01.intranet.prod.int.rdu2.redhat.com>

On Tue, 24 Apr 2018 12:33:01 -0400 (EDT) Mikulas Patocka <mpatocka@redhat.com> wrote:

> 
> 
> On Tue, 24 Apr 2018, Michal Hocko wrote:
> 
> > On Tue 24-04-18 11:30:40, Mikulas Patocka wrote:
> > > 
> > > 
> > > On Tue, 24 Apr 2018, Michal Hocko wrote:
> > > 
> > > > On Mon 23-04-18 20:25:15, Mikulas Patocka wrote:
> > > > 
> > > > > Fixing __vmalloc code 
> > > > > is easy and it doesn't require cooperation with maintainers.
> > > > 
> > > > But it is a hack against the intention of the scope api.
> > > 
> > > It is not!
> > 
> > This discussion simply doesn't make much sense it seems. The scope API
> > is to document the scope of the reclaim recursion critical section. That
> > certainly is not a utility function like vmalloc.
> 
> That 15-line __vmalloc bugfix doesn't prevent you (or any other kernel 
> developer) from converting the code to the scope API. You make nonsensical 
> excuses.
> 

Fun thread!

Winding back to the original problem, I'd state it as

- Caller uses kvmalloc() but passes the address into vmalloc-naive
  DMA API and

- Caller uses kvmalloc() but passes the address into kfree()

Yes?

If so, then...

Is there a way in which, in the kvmalloc-called-kmalloc path, we can
tag the slab-allocated memory with a "this memory was allocated with
kvmalloc()" flag?  I *think* there's extra per-object storage available
with suitable slab/slub debugging options?  Perhaps we could steal one
bit from the redzone, dunno.

If so then we can

a) set that flag in kvmalloc() if the kmalloc() call succeeded

b) check for that flag in the DMA code, WARN if it is set.

c) in kvfree(), clear that flag before calling kfree()

d) in kfree(), check for that flag and go WARN() if set.

So both potential bugs are detected all the time, dependent upon
CONFIG_SLUB_DEBUG (and perhaps other slub config options).

^ permalink raw reply

* Re: [PATCH net-next v9 3/4] virtio_net: Extend virtio to use VF datapath when available
From: Samudrala, Sridhar @ 2018-05-02  0:20 UTC (permalink / raw)
  To: Jiri Pirko
  Cc: mst, stephen, davem, netdev, virtualization, virtio-dev,
	jesse.brandeburg, alexander.h.duyck, kubakici, jasowang,
	loseweigh, aaron.f.brown
In-Reply-To: <20180430072034.GH23854@nanopsycho.orion>

On 4/30/2018 12:20 AM, Jiri Pirko wrote:
>
>>> Now I try to change mac of the failover master:
>>> [root@test1 ~]# ip link set ens3 addr 52:54:00:b2:a7:f3
>>> RTNETLINK answers: Operation not supported
>>>
>>> That I did expect to work. I would expect this would change the mac of
>>> the master and both standby and primary slaves.
>> If a VF is untrusted, a VM will not able to change its MAC and moreover
> Note that at this point, I have no VF. So I'm not sure why you mention
> that.
>
>
>> in this mode we are assuming that the hypervisor has assigned the MAC and
>> guest is not expected to change the MAC.
> Wait, for ordinary old-fashioned virtio_net, as a VM user, I can change
> mac and all works fine. How is this different? Change mac on "failover
> instance" should work and should propagate the mac down to its slaves.
>
>
>> For the initial implementation, i would propose not allowing the guest to
>> change the MAC of failover or standby dev.
> I see no reason for such restriction.
>

It is true that a VM user can change mac address of a normal virtio-net interface,
however when it is in STANDBY mode i think we should not allow this change specifically
because we are creating a failover instance based on a MAC that is assigned by the
hypervisor.

Moreover,  in a cloud environment i would think that PF/hypervisor assigns a MAC to
the VF and it cannot be changed by the guest.

So for the initial implementation, do you see any issues with having this restriction
in STANDBY mode?

^ permalink raw reply

* [PATCH bpf-next 2/2] bpf: add selftest for stackmap with build_id in NMI context
From: Song Liu @ 2018-05-02  0:02 UTC (permalink / raw)
  To: netdev; +Cc: Song Liu, kernel-team
In-Reply-To: <20180502000220.2585320-1-songliubraving@fb.com>

This new test captures stackmap with build_id with hardware event
PERF_COUNT_HW_CPU_CYCLES.

Because we only support one ips-to-build_id lookup per cpu in NMI
context, stack_amap will not be able to do the lookup in this test.
Therefore, we didn't do compare_stack_ips(), as it will always fail.

urandom_read.c is extended to run configurable cycles so that it can be
caught by the perf event.

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 tools/testing/selftests/bpf/test_progs.c   | 137 +++++++++++++++++++++++++++++
 tools/testing/selftests/bpf/urandom_read.c |  10 ++-
 2 files changed, 145 insertions(+), 2 deletions(-)

diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c
index aa336f0..00bb08c 100644
--- a/tools/testing/selftests/bpf/test_progs.c
+++ b/tools/testing/selftests/bpf/test_progs.c
@@ -1272,6 +1272,142 @@ static void test_stacktrace_build_id(void)
 	return;
 }
 
+static void test_stacktrace_build_id_nmi(void)
+{
+	int control_map_fd, stackid_hmap_fd, stackmap_fd, stack_amap_fd;
+	const char *file = "./test_stacktrace_build_id.o";
+	int err, pmu_fd, prog_fd;
+	struct perf_event_attr attr = {
+		.sample_freq = 5000,
+		.freq = 1,
+		.type = PERF_TYPE_HARDWARE,
+		.config = PERF_COUNT_HW_CPU_CYCLES,
+	};
+	__u32 key, previous_key, val, duration = 0;
+	struct bpf_object *obj;
+	char buf[256];
+	int i, j;
+	struct bpf_stack_build_id id_offs[PERF_MAX_STACK_DEPTH];
+	int build_id_matches = 0;
+
+	err = bpf_prog_load(file, BPF_PROG_TYPE_PERF_EVENT, &obj, &prog_fd);
+	if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno))
+		goto out;
+
+	pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
+			 0 /* cpu 0 */, -1 /* group id */,
+			 0 /* flags */);
+	if (CHECK(pmu_fd < 0, "perf_event_open",
+		  "err %d errno %d. Does the test host support PERF_COUNT_HW_CPU_CYCLES?\n",
+		  pmu_fd, errno))
+		goto close_prog;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0);
+	if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n",
+		  err, errno))
+		goto close_pmu;
+
+	err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd);
+	if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	/* find map fds */
+	control_map_fd = bpf_find_map(__func__, obj, "control_map");
+	if (CHECK(control_map_fd < 0, "bpf_find_map control_map",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap");
+	if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	stackmap_fd = bpf_find_map(__func__, obj, "stackmap");
+	if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n",
+		  err, errno))
+		goto disable_pmu;
+
+	stack_amap_fd = bpf_find_map(__func__, obj, "stack_amap");
+	if (CHECK(stack_amap_fd < 0, "bpf_find_map stack_amap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	assert(system("dd if=/dev/urandom of=/dev/zero count=4 2> /dev/null")
+	       == 0);
+	assert(system("taskset 0x1 ./urandom_read 100000") == 0);
+	/* disable stack trace collection */
+	key = 0;
+	val = 1;
+	bpf_map_update_elem(control_map_fd, &key, &val, 0);
+
+	/* for every element in stackid_hmap, we can find a corresponding one
+	 * in stackmap, and vise versa.
+	 */
+	err = compare_map_keys(stackid_hmap_fd, stackmap_fd);
+	if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = compare_map_keys(stackmap_fd, stackid_hmap_fd);
+	if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = extract_build_id(buf, 256);
+
+	if (CHECK(err, "get build_id with readelf",
+		  "err %d errno %d\n", err, errno))
+		goto disable_pmu;
+
+	err = bpf_map_get_next_key(stackmap_fd, NULL, &key);
+	if (CHECK(err, "get_next_key from stackmap",
+		  "err %d, errno %d\n", err, errno))
+		goto disable_pmu;
+
+	do {
+		char build_id[64];
+
+		err = bpf_map_lookup_elem(stackmap_fd, &key, id_offs);
+		if (CHECK(err, "lookup_elem from stackmap",
+			  "err %d, errno %d\n", err, errno))
+			goto disable_pmu;
+		for (i = 0; i < PERF_MAX_STACK_DEPTH; ++i)
+			if (id_offs[i].status == BPF_STACK_BUILD_ID_VALID &&
+			    id_offs[i].offset != 0) {
+				for (j = 0; j < 20; ++j)
+					sprintf(build_id + 2 * j, "%02x",
+						id_offs[i].build_id[j] & 0xff);
+				if (strstr(buf, build_id) != NULL)
+					build_id_matches = 1;
+			}
+		previous_key = key;
+	} while (bpf_map_get_next_key(stackmap_fd, &previous_key, &key) == 0);
+
+	if (CHECK(build_id_matches < 1, "build id match",
+		  "Didn't find expected build ID from the map\n"))
+		goto disable_pmu;
+
+	/*
+	 * We intentionally skip compare_stack_ips(). This is because we
+	 * only support one in_nmi() ips-to-build_id translation per cpu
+	 * at any time, thus stack_amap here will always fallback to
+	 * BPF_STACK_BUILD_ID_IP;
+	 */
+
+disable_pmu:
+	ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE);
+
+close_pmu:
+	close(pmu_fd);
+
+close_prog:
+	bpf_object__close(obj);
+
+out:
+	return;
+}
+
 #define MAX_CNT_RAWTP	10ull
 #define MAX_STACK_RAWTP	100
 struct get_stack_trace_t {
@@ -1425,6 +1561,7 @@ int main(void)
 	test_tp_attach_query();
 	test_stacktrace_map();
 	test_stacktrace_build_id();
+	test_stacktrace_build_id_nmi();
 	test_stacktrace_map_raw_tp();
 	test_get_stack_raw_tp();
 
diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c
index 4acfdeb..9de8b7c 100644
--- a/tools/testing/selftests/bpf/urandom_read.c
+++ b/tools/testing/selftests/bpf/urandom_read.c
@@ -6,15 +6,21 @@
 #include <stdlib.h>
 
 #define BUF_SIZE 256
-int main(void)
+
+int main(int argc, char *argv[])
 {
 	int fd = open("/dev/urandom", O_RDONLY);
 	int i;
 	char buf[BUF_SIZE];
+	int count = 4;
 
 	if (fd < 0)
 		return 1;
-	for (i = 0; i < 4; ++i)
+
+	if (argc == 2)
+		count = atoi(argv[1]);
+
+	for (i = 0; i < count; ++i)
 		read(fd, buf, BUF_SIZE);
 
 	close(fd);
-- 
2.9.5

^ permalink raw reply related

* [PATCH bpf-next 1/2] bpf: enable stackmap with build_id in nmi context
From: Song Liu @ 2018-05-02  0:02 UTC (permalink / raw)
  To: netdev
  Cc: Song Liu, kernel-team, Alexei Starovoitov, Daniel Borkmann,
	Peter Zijlstra

Currently, we cannot parse build_id in nmi context because of
up_read(&current->mm->mmap_sem), this makes stackmap with build_id
less useful. This patch enables parsing build_id in nmi by putting
the up_read() call in irq_work. To avoid memory allocation in nmi
context, we use per cpu variable for the irq_work. As a result, only
one irq_work per cpu is allowed. If the irq_work is in use, we
fall back to reporting only ips.

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Song Liu <songliubraving@fb.com>
---
 kernel/bpf/stackmap.c | 60 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 6 deletions(-)

diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 3ba102b..c33fec1 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,7 @@
 #include <linux/perf_event.h>
 #include <linux/elf.h>
 #include <linux/pagemap.h>
+#include <linux/irq_work.h>
 #include "percpu_freelist.h"
 
 #define STACK_CREATE_FLAG_MASK					\
@@ -32,6 +33,23 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
+/* irq_work to run up_read() for build_id lookup in nmi context */
+struct stack_map_irq_work {
+	struct irq_work work;
+	struct rw_semaphore *sem;
+};
+
+static void up_read_work(struct irq_work *entry)
+{
+	struct stack_map_irq_work *work = container_of(entry,
+			struct stack_map_irq_work, work);
+
+	up_read(work->sem);
+	work->sem = NULL;
+}
+
+DEFINE_PER_CPU(struct stack_map_irq_work, irq_work);
+
 static inline bool stack_map_use_build_id(struct bpf_map *map)
 {
 	return (map->map_flags & BPF_F_STACK_BUILD_ID);
@@ -267,17 +285,27 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 {
 	int i;
 	struct vm_area_struct *vma;
+	bool in_nmi_ctx = in_nmi();
+	bool irq_work_busy = false;
+	struct stack_map_irq_work *work;
+
+	if (in_nmi_ctx) {
+		work = this_cpu_ptr(&irq_work);
+		if (work->sem)
+			/* cannot queue more up_read, fallback */
+			irq_work_busy = true;
+	}
 
 	/*
-	 * We cannot do up_read() in nmi context, so build_id lookup is
-	 * only supported for non-nmi events. If at some point, it is
-	 * possible to run find_vma() without taking the semaphore, we
-	 * would like to allow build_id lookup in nmi context.
+	 * We cannot do up_read() in nmi context. To do build_id lookup
+	 * in nmi context, we need to run up_read() in irq_work. We use
+	 * a percpu variable to do the irq_work. If the irq_work is
+	 * already used by another lookup, we fall back to report ips.
 	 *
 	 * Same fallback is used for kernel stack (!user) on a stackmap
 	 * with build_id.
 	 */
-	if (!user || !current || !current->mm || in_nmi() ||
+	if (!user || !current || !current->mm || irq_work_busy ||
 	    down_read_trylock(&current->mm->mmap_sem) == 0) {
 		/* cannot access current->mm, fall back to ips */
 		for (i = 0; i < trace_nr; i++) {
@@ -299,7 +327,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
 			- vma->vm_start;
 		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
 	}
-	up_read(&current->mm->mmap_sem);
+
+	if (!in_nmi_ctx)
+		up_read(&current->mm->mmap_sem);
+	else {
+		work->sem = &current->mm->mmap_sem;
+		irq_work_queue(&work->work);
+	}
 }
 
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
@@ -575,3 +609,17 @@ const struct bpf_map_ops stack_map_ops = {
 	.map_update_elem = stack_map_update_elem,
 	.map_delete_elem = stack_map_delete_elem,
 };
+
+static int __init stack_map_init(void)
+{
+	int cpu;
+	struct stack_map_irq_work *work;
+
+	for_each_possible_cpu(cpu) {
+		work = per_cpu_ptr(&irq_work, cpu);
+		init_irq_work(&work->work, up_read_work);
+	}
+	pr_info("%s: done\n", __func__);
+	return 0;
+}
+subsys_initcall(stack_map_init);
-- 
2.9.5

^ permalink raw reply related

* Re: [PATCH net-next v7 1/3] vmcore: add API to collect hardware dump in second kernel
From: Stephen Hemminger @ 2018-05-01 23:28 UTC (permalink / raw)
  To: Rahul Lakkireddy
  Cc: netdev, kexec, linux-fsdevel, linux-kernel, davem, viro, ebiederm,
	akpm, torvalds, ganeshgr, nirranjan, indranil
In-Reply-To: <b6065b53c5446d98ee55e09119f6821f979418d4.1525197408.git.rahul.lakkireddy@chelsio.com>

On Tue,  1 May 2018 23:57:45 +0530
Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> wrote:

> +
> +int vmcore_add_device_dump(struct vmcoredd_data *data)
> +{
> +	return __vmcore_add_device_dump(data);
> +}
> +EXPORT_SYMBOL(vmcore_add_device_dump);
> +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP

Why the stub wrapper function? 
it does nothing.

^ permalink raw reply

* Re: [PATCH net-next 0/2] sctp: unify sctp_make_op_error_fixed and sctp_make_op_error_space
From: Marcelo Ricardo Leitner @ 2018-05-01 23:28 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-sctp, vyasevich, nhorman, lucien.xin
In-Reply-To: <20180501.121353.1918662858023133319.davem@davemloft.net>

On Tue, May 01, 2018 at 12:13:53PM -0400, David Miller wrote:
> From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> Date: Sun, 29 Apr 2018 12:56:30 -0300
>
> > These two variants are very close to each other and can be merged
> > to avoid code duplication. That's what this patchset does.
> >
> > First, we allow sctp_init_cause to return errors, which then allow us to
> > add sctp_make_op_error_limited that handles both situations.
>
> Series applied.

Thanks.

>
> But generally, there are a lot of sctp_init_cause() call sites with non-zero
> payload length that should start checking the return value now.

They are safe as is, because they follow the pattern:
- sctp_make_abort(...., somesize)
- sctp_init_cause(size)
where size is considered in somesize, so sctp_init_cause cannot fail
in there.

This new usage in sctp_make_op_error_limited is the only one where it
allocates a buffer without knowing how much data will actually be
pushed into it.

  Marcelo

^ permalink raw reply

* Re: [PATCH bpf-next] bpf/verifier: enable ctx + const + 0.
From: Alexei Starovoitov @ 2018-05-01 23:16 UTC (permalink / raw)
  To: William Tu; +Cc: netdev, Yonghong Song, Yifeng Sun
In-Reply-To: <1525108505-21175-1-git-send-email-u9012063@gmail.com>

On Mon, Apr 30, 2018 at 10:15:05AM -0700, William Tu wrote:
> Existing verifier does not allow 'ctx + const + const'.  However, due to
> compiler optimization, there is a case where the BPF compiler generates
> 'ctx + const + 0', as shown below:
> 
>   599: (1d) if r2 == r4 goto pc+2
>    R0=inv(id=0) R1=ctx(id=0,off=40,imm=0)
>    R2=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff))
>    R3=inv(id=0,umax_value=65535,var_off=(0x0; 0xffff)) R4=inv0
>    R6=ctx(id=0,off=0,imm=0) R7=inv2
>   600: (bf) r1 = r6			// r1 is ctx
>   601: (07) r1 += 36			// r1 has offset 36
>   602: (61) r4 = *(u32 *)(r1 +0)	// r1 + 0
>   dereference of modified ctx ptr R1 off=36+0, ctx+const is allowed,
>   ctx+const+const is not
> 
> The reason the BPF backend generates this code is an optimization
> like this, as explained by Yonghong:
>     if (...)
>         *(ctx + 60)
>     else
>         *(ctx + 56)
> 
> The compiler translates it to
>     if (...)
>        ptr = ctx + 60
>     else
>        ptr = ctx + 56
>     *(ptr + 0)
> 
> So the load from ptr becomes an example of 'ctx + const + 0'.  This patch
> enables support for this case.
> 
> Fixes: f8ddadc4db6c7 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net")
> Cc: Yonghong Song <yhs@fb.com>
> Signed-off-by: Yifeng Sun <pkusunyifeng@gmail.com>
> Signed-off-by: William Tu <u9012063@gmail.com>
> ---
>  kernel/bpf/verifier.c                       |  2 +-
>  tools/testing/selftests/bpf/test_verifier.c | 13 +++++++++++++
>  2 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 712d8655e916..c9a791b9cf2a 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1638,7 +1638,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
>  		/* ctx accesses must be at a fixed offset, so that we can
>  		 * determine what type of data were returned.
>  		 */
> -		if (reg->off) {
> +		if (reg->off && off != reg->off) {
>  			verbose(env,
>  				"dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
>  				regno, reg->off, off - reg->off);
> diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
> index 1acafe26498b..95ad5d5723ae 100644
> --- a/tools/testing/selftests/bpf/test_verifier.c
> +++ b/tools/testing/selftests/bpf/test_verifier.c
> @@ -8452,6 +8452,19 @@ static struct bpf_test tests[] = {
>  		.prog_type = BPF_PROG_TYPE_SCHED_CLS,
>  	},
>  	{
> +		"arithmetic ops make PTR_TO_CTX + const + 0 valid",
> +		.insns = {
> +			BPF_ALU64_IMM(BPF_ADD, BPF_REG_1,
> +				      offsetof(struct __sk_buff, data) -
> +				      offsetof(struct __sk_buff, mark)),
> +			BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, 0),

How does the rewritten code look here?

The patch is allowing check_ctx_access() to proceed with sort-of
correct 'off' and remember ctx_field_size,
but in convert_ctx_accesses() it's using insn->off to do conversion.
Which is zero in this case, so it will convert
struct __sk_buff {
        __u32 len; // offset 0

into access of 'struct sk_buff'->len
and then will add __sk_buff's &data - &mark delta to in-kernel len field.
Which will point to some random field further down in struct sk_buff.
Doesn't look correct at all.

How did you test this patch?

^ permalink raw reply

* Re: [PATCH net] nfp: flower: set tunnel ttl value to net default
From: David Miller @ 2018-05-01 23:04 UTC (permalink / raw)
  To: jakub.kicinski; +Cc: netdev, oss-drivers, john.hurley
In-Reply-To: <20180501224949.23240-1-jakub.kicinski@netronome.com>

From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Tue,  1 May 2018 15:49:49 -0700

> From: John Hurley <john.hurley@netronome.com>
> 
> Firmware requires that the ttl value for an encapsulating ipv4 tunnel
> header be included as an action field. Prior to the support of Geneve
> tunnel encap (when ttl set was removed completely), ttl value was
> extracted from the tunnel key. However, tests have shown that this can
> still produce a ttl of 0.
> 
> Fix the issue by setting the namespace default value for each new tunnel.
> Follow up patch for net-next will do a full route lookup.
> 
> Fixes: 3ca3059dc3a9 ("nfp: flower: compile Geneve encap actions")
> Fixes: b27d6a95a70d ("nfp: compile flower vxlan tunnel set actions")
> Signed-off-by: John Hurley <john.hurley@netronome.com>
> Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>

Applied and queued up for -stable.

^ permalink raw reply

* Re: [PATCH net] net/tls: Don't recursively call push_record during tls_write_space callbacks
From: David Miller @ 2018-05-01 23:02 UTC (permalink / raw)
  To: davejwatson; +Cc: andre, netdev, borisp, aviadye
In-Reply-To: <20180501200539.GA69207@davejwatson-mba.dhcp.thefacebook.com>

From: Dave Watson <davejwatson@fb.com>
Date: Tue, 1 May 2018 13:05:39 -0700

> It is reported that in some cases, write_space may be called in
> do_tcp_sendpages, such that we recursively invoke do_tcp_sendpages again:
> 
> [  660.468802]  ? do_tcp_sendpages+0x8d/0x580
> [  660.468826]  ? tls_push_sg+0x74/0x130 [tls]
> [  660.468852]  ? tls_push_record+0x24a/0x390 [tls]
> [  660.468880]  ? tls_write_space+0x6a/0x80 [tls]
> ...
> 
> tls_push_sg already does a loop over all sending sg's, so ignore
> any tls_write_space notifications until we are done sending.
> We then have to call the previous write_space to wake up
> poll() waiters after we are done with the send loop.
> 
> Reported-by: Andre Tomt <andre@tomt.net>
> Signed-off-by: Dave Watson <davejwatson@fb.com>

Applied.

^ permalink raw reply

* Re: [PATCH V4 net-next 2/2] selftest: add test for TCP_INQ
From: David Miller @ 2018-05-01 23:02 UTC (permalink / raw)
  To: soheil.kdev; +Cc: netdev, ycheng, ncardwell, edumazet, willemb, soheil
In-Reply-To: <20180501193916.113642-2-soheil.kdev@gmail.com>

From: Soheil Hassas Yeganeh <soheil.kdev@gmail.com>
Date: Tue,  1 May 2018 15:39:16 -0400

> From: Soheil Hassas Yeganeh <soheil@google.com>
> 
> Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
> Signed-off-by: Yuchung Cheng <ycheng@google.com>
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> Reviewed-by: Eric Dumazet <edumazet@google.com>
> Reviewed-by: Neal Cardwell <ncardwell@google.com>

Applied.

^ permalink raw reply

* Re: [PATCH V4 net-next 1/2] tcp: send in-queue bytes in cmsg upon read
From: David Miller @ 2018-05-01 23:02 UTC (permalink / raw)
  To: soheil.kdev; +Cc: netdev, ycheng, ncardwell, edumazet, willemb, soheil
In-Reply-To: <20180501193916.113642-1-soheil.kdev@gmail.com>

From: Soheil Hassas Yeganeh <soheil.kdev@gmail.com>
Date: Tue,  1 May 2018 15:39:15 -0400

> Add the TCP_INQ socket option to TCP. When this socket
> option is set, recvmsg() relays the number of bytes available
> on the socket for reading to the application via the
> TCP_CM_INQ control message.

Applied.

^ permalink raw reply

* [PATCH net] nfp: flower: set tunnel ttl value to net default
From: Jakub Kicinski @ 2018-05-01 22:49 UTC (permalink / raw)
  To: davem; +Cc: netdev, oss-drivers, John Hurley

From: John Hurley <john.hurley@netronome.com>

Firmware requires that the ttl value for an encapsulating ipv4 tunnel
header be included as an action field. Prior to the support of Geneve
tunnel encap (when ttl set was removed completely), ttl value was
extracted from the tunnel key. However, tests have shown that this can
still produce a ttl of 0.

Fix the issue by setting the namespace default value for each new tunnel.
Follow up patch for net-next will do a full route lookup.

Fixes: 3ca3059dc3a9 ("nfp: flower: compile Geneve encap actions")
Fixes: b27d6a95a70d ("nfp: compile flower vxlan tunnel set actions")
Signed-off-by: John Hurley <john.hurley@netronome.com>
Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
 drivers/net/ethernet/netronome/nfp/flower/action.c | 10 ++++++++--
 drivers/net/ethernet/netronome/nfp/flower/cmsg.h   |  5 ++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/netronome/nfp/flower/action.c b/drivers/net/ethernet/netronome/nfp/flower/action.c
index b3567a596fc1..80df9a5d4217 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/action.c
+++ b/drivers/net/ethernet/netronome/nfp/flower/action.c
@@ -183,17 +183,21 @@ static int
 nfp_fl_set_ipv4_udp_tun(struct nfp_fl_set_ipv4_udp_tun *set_tun,
 			const struct tc_action *action,
 			struct nfp_fl_pre_tunnel *pre_tun,
-			enum nfp_flower_tun_type tun_type)
+			enum nfp_flower_tun_type tun_type,
+			struct net_device *netdev)
 {
 	size_t act_size = sizeof(struct nfp_fl_set_ipv4_udp_tun);
 	struct ip_tunnel_info *ip_tun = tcf_tunnel_info(action);
 	u32 tmp_set_ip_tun_type_index = 0;
 	/* Currently support one pre-tunnel so index is always 0. */
 	int pretun_idx = 0;
+	struct net *net;
 
 	if (ip_tun->options_len)
 		return -EOPNOTSUPP;
 
+	net = dev_net(netdev);
+
 	set_tun->head.jump_id = NFP_FL_ACTION_OPCODE_SET_IPV4_TUNNEL;
 	set_tun->head.len_lw = act_size >> NFP_FL_LW_SIZ;
 
@@ -204,6 +208,7 @@ nfp_fl_set_ipv4_udp_tun(struct nfp_fl_set_ipv4_udp_tun *set_tun,
 
 	set_tun->tun_type_index = cpu_to_be32(tmp_set_ip_tun_type_index);
 	set_tun->tun_id = ip_tun->key.tun_id;
+	set_tun->ttl = net->ipv4.sysctl_ip_default_ttl;
 
 	/* Complete pre_tunnel action. */
 	pre_tun->ipv4_dst = ip_tun->key.u.ipv4.dst;
@@ -511,7 +516,8 @@ nfp_flower_loop_action(const struct tc_action *a,
 		*a_len += sizeof(struct nfp_fl_pre_tunnel);
 
 		set_tun = (void *)&nfp_fl->action_data[*a_len];
-		err = nfp_fl_set_ipv4_udp_tun(set_tun, a, pre_tun, *tun_type);
+		err = nfp_fl_set_ipv4_udp_tun(set_tun, a, pre_tun, *tun_type,
+					      netdev);
 		if (err)
 			return err;
 		*a_len += sizeof(struct nfp_fl_set_ipv4_udp_tun);
diff --git a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
index b6c0fd053a50..bee4367a2c38 100644
--- a/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
+++ b/drivers/net/ethernet/netronome/nfp/flower/cmsg.h
@@ -190,7 +190,10 @@ struct nfp_fl_set_ipv4_udp_tun {
 	__be16 reserved;
 	__be64 tun_id __packed;
 	__be32 tun_type_index;
-	__be32 extra[3];
+	__be16 reserved2;
+	u8 ttl;
+	u8 reserved3;
+	__be32 extra[2];
 };
 
 /* Metadata with L2 (1W/4B)
-- 
2.17.0

^ permalink raw reply related

* [RFC PATCH 3/3] rtlwifi: btcoex: Always use 2ant-functions for RTL8723BE
From: João Paulo Rechi Vita @ 2018-05-01 22:46 UTC (permalink / raw)
  To: Larry Finger
  Cc: Steve deRosier, Yan-Hsuan Chuang, Ping-Ke Shih, Birming Chiu,
	Shaofu, Steven Ting, Chaoming Li, Kalle Valo, linux-wireless,
	Network Development, LKML, Daniel Drake,
	João Paulo Rechi Vita, linux
In-Reply-To: <20180501224613.32460-1-jprvita@endlessm.com>

This partially reverts commit 7937f02d1953952a6eaf626b175ea9db5541e699,
which not only hooked external functions for newer ICs, as described in
its commit message, but also changed the behavior for RTL8723BE
depending on the value of board_info.btdm_ant_num.

When board_info.btdm_ant_num == 1, 7937f02d19 changes the codepath to
use a whole different set of functions ex_btc8723b1ant_*, instead of the
ex_btc8723b2ant_* that were used before it.

This drastically affects the upload performance and signal strength on
the HP 240 G4 laptop, as shown by the results below:

Without this change:

 $ sudo iw dev wlp2s0 scan | grep -B3 JJ | grep signal
        signal: -42.00 dBm
 $ iperf3 -c 192.168.1.254
 Connecting to host 192.168.1.254, port 5201
 [  4] local 192.168.1.253 port 39678 connected to 192.168.1.254 port 5201
 [ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
 [  4]   0.00-1.00   sec   735 KBytes  6.02 Mbits/sec    1   1.41 KBytes
 [  4]   1.00-2.00   sec   274 KBytes  2.25 Mbits/sec    1   1.41 KBytes
 [  4]   2.00-3.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 [  4]   3.00-4.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 [  4]   4.00-5.00   sec  0.00 Bytes  0.00 bits/sec    1   28.3 KBytes
 [  4]   5.00-6.00   sec   423 KBytes  3.47 Mbits/sec    3   41.0 KBytes
 [  4]   6.00-7.00   sec   840 KBytes  6.88 Mbits/sec    0   58.0 KBytes
 [  4]   7.00-8.00   sec   830 KBytes  6.79 Mbits/sec    1   1.41 KBytes
 [  4]   8.00-9.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 [  4]   9.00-10.00  sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 - - - - - - - - - - - - - - - - - - - - - - - - -
 [ ID] Interval           Transfer     Bandwidth       Retr
 [  4]   0.00-10.00  sec  3.03 MBytes  2.54 Mbits/sec    7             sender
 [  4]   0.00-10.00  sec  2.88 MBytes  2.41 Mbits/sec                  receiver

 iperf Done.

With this change:

 $ sudo iw dev wlp2s0 scan | grep -B3 JJ | grep signal
        signal: -14.00 dBm
 $ iperf3 -c 192.168.1.254
 Connecting to host 192.168.1.254, port 5201
 [  4] local 192.168.1.253 port 59380 connected to 192.168.1.254 port 5201
 [ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
 [  4]   0.00-1.00   sec  4.63 MBytes  38.8 Mbits/sec    0    194 KBytes
 [  4]   1.00-2.00   sec  4.58 MBytes  38.4 Mbits/sec    0    273 KBytes
 [  4]   2.00-3.00   sec  5.05 MBytes  42.3 Mbits/sec    0    332 KBytes
 [  4]   3.00-4.00   sec  4.98 MBytes  41.8 Mbits/sec    0    393 KBytes
 [  4]   4.00-5.00   sec  4.76 MBytes  39.9 Mbits/sec    0    434 KBytes
 [  4]   5.00-6.00   sec  4.85 MBytes  40.7 Mbits/sec    0    434 KBytes
 [  4]   6.00-7.00   sec  3.96 MBytes  33.2 Mbits/sec    0    464 KBytes
 [  4]   7.00-8.00   sec  4.74 MBytes  39.8 Mbits/sec    0    481 KBytes
 [  4]   8.00-9.00   sec  4.22 MBytes  35.4 Mbits/sec    0    508 KBytes
 [  4]   9.00-10.00  sec  4.09 MBytes  34.3 Mbits/sec    0    564 KBytes
 - - - - - - - - - - - - - - - - - - - - - - - - -
 [ ID] Interval           Transfer     Bandwidth       Retr
 [  4]   0.00-10.00  sec  45.9 MBytes  38.5 Mbits/sec    0             sender
 [  4]   0.00-10.00  sec  45.0 MBytes  37.7 Mbits/sec                  receiver

 iperf Done.

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
---
 .../realtek/rtlwifi/btcoexist/halbtcoutsrc.c  | 62 ++++---------------
 1 file changed, 12 insertions(+), 50 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c
index 86182b917c92..a862b5efdf55 100644
--- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c
+++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c
@@ -1452,10 +1452,7 @@ void exhalbtc_init_hw_config(struct btc_coexist *btcoexist, bool wifi_only)
 		else if (btcoexist->board_info.btdm_ant_num == 1)
 			ex_btc8821a1ant_init_hwconfig(btcoexist, wifi_only);
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 2)
-			ex_btc8723b2ant_init_hwconfig(btcoexist);
-		else if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_init_hwconfig(btcoexist, wifi_only);
+		ex_btc8723b2ant_init_hwconfig(btcoexist);
 	} else if (IS_HARDWARE_TYPE_8723A(btcoexist->adapter)) {
 		/* 8723A has no this function */
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
@@ -1481,10 +1478,7 @@ void exhalbtc_init_coex_dm(struct btc_coexist *btcoexist)
 		else if (btcoexist->board_info.btdm_ant_num == 1)
 			ex_btc8821a1ant_init_coex_dm(btcoexist);
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 2)
-			ex_btc8723b2ant_init_coex_dm(btcoexist);
-		else if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_init_coex_dm(btcoexist);
+		ex_btc8723b2ant_init_coex_dm(btcoexist);
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
 		if (btcoexist->board_info.btdm_ant_num == 2)
 			ex_btc8192e2ant_init_coex_dm(btcoexist);
@@ -1516,10 +1510,7 @@ void exhalbtc_ips_notify(struct btc_coexist *btcoexist, u8 type)
 		else if (btcoexist->board_info.btdm_ant_num == 1)
 			ex_btc8821a1ant_ips_notify(btcoexist, ips_type);
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 2)
-			ex_btc8723b2ant_ips_notify(btcoexist, ips_type);
-		else if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_ips_notify(btcoexist, ips_type);
+		ex_btc8723b2ant_ips_notify(btcoexist, ips_type);
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
 		if (btcoexist->board_info.btdm_ant_num == 2)
 			ex_btc8192e2ant_ips_notify(btcoexist, ips_type);
@@ -1549,10 +1540,7 @@ void exhalbtc_lps_notify(struct btc_coexist *btcoexist, u8 type)
 		else if (btcoexist->board_info.btdm_ant_num == 1)
 			ex_btc8821a1ant_lps_notify(btcoexist, lps_type);
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 2)
-			ex_btc8723b2ant_lps_notify(btcoexist, lps_type);
-		else if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_lps_notify(btcoexist, lps_type);
+		ex_btc8723b2ant_lps_notify(btcoexist, lps_type);
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
 		if (btcoexist->board_info.btdm_ant_num == 2)
 			ex_btc8192e2ant_lps_notify(btcoexist, lps_type);
@@ -1582,10 +1570,7 @@ void exhalbtc_scan_notify(struct btc_coexist *btcoexist, u8 type)
 		else if (btcoexist->board_info.btdm_ant_num == 1)
 			ex_btc8821a1ant_scan_notify(btcoexist, scan_type);
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 2)
-			ex_btc8723b2ant_scan_notify(btcoexist, scan_type);
-		else if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_scan_notify(btcoexist, scan_type);
+		ex_btc8723b2ant_scan_notify(btcoexist, scan_type);
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
 		if (btcoexist->board_info.btdm_ant_num == 2)
 			ex_btc8192e2ant_scan_notify(btcoexist, scan_type);
@@ -1630,10 +1615,7 @@ void exhalbtc_connect_notify(struct btc_coexist *btcoexist, u8 action)
 		else if (btcoexist->board_info.btdm_ant_num == 1)
 			ex_btc8821a1ant_connect_notify(btcoexist, asso_type);
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 2)
-			ex_btc8723b2ant_connect_notify(btcoexist, asso_type);
-		else if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_connect_notify(btcoexist, asso_type);
+		ex_btc8723b2ant_connect_notify(btcoexist, asso_type);
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
 		if (btcoexist->board_info.btdm_ant_num == 2)
 			ex_btc8192e2ant_connect_notify(btcoexist, asso_type);
@@ -1666,10 +1648,7 @@ void exhalbtc_mediastatus_notify(struct btc_coexist *btcoexist,
 		else if (btcoexist->board_info.btdm_ant_num == 1)
 			ex_btc8821a1ant_media_status_notify(btcoexist, status);
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 2)
-			ex_btc8723b2ant_media_status_notify(btcoexist, status);
-		else if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_media_status_notify(btcoexist, status);
+		ex_btc8723b2ant_media_status_notify(btcoexist, status);
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
 		if (btcoexist->board_info.btdm_ant_num == 2)
 			ex_btc8192e2ant_media_status_notify(btcoexist, status);
@@ -1709,12 +1688,8 @@ void exhalbtc_special_packet_notify(struct btc_coexist *btcoexist, u8 pkt_type)
 			ex_btc8821a1ant_special_packet_notify(btcoexist,
 							      packet_type);
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 2)
-			ex_btc8723b2ant_special_packet_notify(btcoexist,
-							      packet_type);
-		else if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_special_packet_notify(btcoexist,
-							      packet_type);
+		ex_btc8723b2ant_special_packet_notify(btcoexist,
+						      packet_type);
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
 		if (btcoexist->board_info.btdm_ant_num == 2)
 			ex_btc8192e2ant_special_packet_notify(btcoexist,
@@ -1741,12 +1716,7 @@ void exhalbtc_bt_info_notify(struct btc_coexist *btcoexist,
 			ex_btc8821a1ant_bt_info_notify(btcoexist, tmp_buf,
 						       length);
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 2)
-			ex_btc8723b2ant_bt_info_notify(btcoexist, tmp_buf,
-						       length);
-		else if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_bt_info_notify(btcoexist, tmp_buf,
-						       length);
+		ex_btc8723b2ant_bt_info_notify(btcoexist, tmp_buf, length);
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
 		if (btcoexist->board_info.btdm_ant_num == 2)
 			ex_btc8192e2ant_bt_info_notify(btcoexist, tmp_buf,
@@ -1763,8 +1733,6 @@ void exhalbtc_rf_status_notify(struct btc_coexist *btcoexist, u8 type)
 
 	if (IS_HARDWARE_TYPE_8821(btcoexist->adapter)) {
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_rf_status_notify(btcoexist, type);
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
 	}
 }
@@ -1804,10 +1772,7 @@ void exhalbtc_halt_notify(struct btc_coexist *btcoexist)
 		else if (btcoexist->board_info.btdm_ant_num == 1)
 			ex_btc8821a1ant_halt_notify(btcoexist);
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 2)
-			ex_btc8723b2ant_halt_notify(btcoexist);
-		else if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_halt_notify(btcoexist);
+		ex_btc8723b2ant_halt_notify(btcoexist);
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
 		if (btcoexist->board_info.btdm_ant_num == 2)
 			ex_btc8192e2ant_halt_notify(btcoexist);
@@ -1880,10 +1845,7 @@ void exhalbtc_periodical(struct btc_coexist *btcoexist)
 			if (!halbtc_under_ips(btcoexist))
 				ex_btc8821a1ant_periodical(btcoexist);
 	} else if (IS_HARDWARE_TYPE_8723B(btcoexist->adapter)) {
-		if (btcoexist->board_info.btdm_ant_num == 2)
-			ex_btc8723b2ant_periodical(btcoexist);
-		else if (btcoexist->board_info.btdm_ant_num == 1)
-			ex_btc8723b1ant_periodical(btcoexist);
+		ex_btc8723b2ant_periodical(btcoexist);
 	} else if (IS_HARDWARE_TYPE_8192E(btcoexist->adapter)) {
 		if (btcoexist->board_info.btdm_ant_num == 2)
 			ex_btc8192e2ant_periodical(btcoexist);
-- 
2.17.0

^ permalink raw reply related

* [RFC PATCH 2/3] Revert "rtlwifi: fill FW version and subversion"
From: João Paulo Rechi Vita @ 2018-05-01 22:46 UTC (permalink / raw)
  To: Larry Finger
  Cc: Steve deRosier, Yan-Hsuan Chuang, Ping-Ke Shih, Birming Chiu,
	Shaofu, Steven Ting, Chaoming Li, Kalle Valo, linux-wireless,
	Network Development, LKML, Daniel Drake,
	João Paulo Rechi Vita, linux
In-Reply-To: <20180501224613.32460-1-jprvita@endlessm.com>

This reverts commit 874e837d67d0db179c9771f38fd21df07c703e93.

This drastically affects the upload performance and signal strength on
the HP 240 G4 laptop, as shown by the results below:

Without this change:

 $ sudo iw dev wlp2s0 scan | grep -B3 JJ | grep signal
        signal: -42.00 dBm
 $ iperf3 -c 192.168.1.254
 Connecting to host 192.168.1.254, port 5201
 [  4] local 192.168.1.253 port 39678 connected to 192.168.1.254 port 5201
 [ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
 [  4]   0.00-1.00   sec   735 KBytes  6.02 Mbits/sec    1   1.41 KBytes
 [  4]   1.00-2.00   sec   274 KBytes  2.25 Mbits/sec    1   1.41 KBytes
 [  4]   2.00-3.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 [  4]   3.00-4.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 [  4]   4.00-5.00   sec  0.00 Bytes  0.00 bits/sec    1   28.3 KBytes
 [  4]   5.00-6.00   sec   423 KBytes  3.47 Mbits/sec    3   41.0 KBytes
 [  4]   6.00-7.00   sec   840 KBytes  6.88 Mbits/sec    0   58.0 KBytes
 [  4]   7.00-8.00   sec   830 KBytes  6.79 Mbits/sec    1   1.41 KBytes
 [  4]   8.00-9.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 [  4]   9.00-10.00  sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 - - - - - - - - - - - - - - - - - - - - - - - - -
 [ ID] Interval           Transfer     Bandwidth       Retr
 [  4]   0.00-10.00  sec  3.03 MBytes  2.54 Mbits/sec    7             sender
 [  4]   0.00-10.00  sec  2.88 MBytes  2.41 Mbits/sec                  receiver

 iperf Done.

With this change:

 $ sudo iw dev wlp2s0 scan | grep -B3 JJ | grep signal
        signal: -14.00 dBm
 $ iperf3 -c 192.168.1.254
 Connecting to host 192.168.1.254, port 5201
 [  4] local 192.168.1.253 port 59380 connected to 192.168.1.254 port 5201
 [ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
 [  4]   0.00-1.00   sec  4.63 MBytes  38.8 Mbits/sec    0    194 KBytes
 [  4]   1.00-2.00   sec  4.58 MBytes  38.4 Mbits/sec    0    273 KBytes
 [  4]   2.00-3.00   sec  5.05 MBytes  42.3 Mbits/sec    0    332 KBytes
 [  4]   3.00-4.00   sec  4.98 MBytes  41.8 Mbits/sec    0    393 KBytes
 [  4]   4.00-5.00   sec  4.76 MBytes  39.9 Mbits/sec    0    434 KBytes
 [  4]   5.00-6.00   sec  4.85 MBytes  40.7 Mbits/sec    0    434 KBytes
 [  4]   6.00-7.00   sec  3.96 MBytes  33.2 Mbits/sec    0    464 KBytes
 [  4]   7.00-8.00   sec  4.74 MBytes  39.8 Mbits/sec    0    481 KBytes
 [  4]   8.00-9.00   sec  4.22 MBytes  35.4 Mbits/sec    0    508 KBytes
 [  4]   9.00-10.00  sec  4.09 MBytes  34.3 Mbits/sec    0    564 KBytes
 - - - - - - - - - - - - - - - - - - - - - - - - -
 [ ID] Interval           Transfer     Bandwidth       Retr
 [  4]   0.00-10.00  sec  45.9 MBytes  38.5 Mbits/sec    0             sender
 [  4]   0.00-10.00  sec  45.0 MBytes  37.7 Mbits/sec                  receiver

 iperf Done.

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
---
 drivers/net/wireless/realtek/rtlwifi/rtl8188ee/fw.c         | 2 --
 drivers/net/wireless/realtek/rtlwifi/rtl8723com/fw_common.c | 2 --
 2 files changed, 4 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/fw.c b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/fw.c
index 63874512598b..a2eca669873b 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/fw.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8188ee/fw.c
@@ -141,8 +141,6 @@ int rtl88e_download_fw(struct ieee80211_hw *hw,
 		return 1;
 
 	pfwheader = (struct rtlwifi_firmware_header *)rtlhal->pfirmware;
-	rtlhal->fw_version = le16_to_cpu(pfwheader->version);
-	rtlhal->fw_subversion = pfwheader->subversion;
 	pfwdata = rtlhal->pfirmware;
 	fwsize = rtlhal->fwsize;
 	RT_TRACE(rtlpriv, COMP_FW, DBG_DMESG,
diff --git a/drivers/net/wireless/realtek/rtlwifi/rtl8723com/fw_common.c b/drivers/net/wireless/realtek/rtlwifi/rtl8723com/fw_common.c
index 521039c5d4ce..0d1ebc861720 100644
--- a/drivers/net/wireless/realtek/rtlwifi/rtl8723com/fw_common.c
+++ b/drivers/net/wireless/realtek/rtlwifi/rtl8723com/fw_common.c
@@ -200,8 +200,6 @@ int rtl8723_download_fw(struct ieee80211_hw *hw,
 		return 1;
 
 	pfwheader = (struct rtlwifi_firmware_header *)rtlhal->pfirmware;
-	rtlhal->fw_version = le16_to_cpu(pfwheader->version);
-	rtlhal->fw_subversion = pfwheader->subversion;
 	pfwdata = rtlhal->pfirmware;
 	fwsize = rtlhal->fwsize;
 
-- 
2.17.0

^ permalink raw reply related

* [RFC PATCH 1/3] rtlwifi: btcoex: Don't init antenna position to main port
From: João Paulo Rechi Vita @ 2018-05-01 22:46 UTC (permalink / raw)
  To: Larry Finger
  Cc: Steve deRosier, Yan-Hsuan Chuang, Ping-Ke Shih, Birming Chiu,
	Shaofu, Steven Ting, Chaoming Li, Kalle Valo, linux-wireless,
	Network Development, LKML, Daniel Drake,
	João Paulo Rechi Vita, linux
In-Reply-To: <CA+A7VXUUsZHD2gr9TBcV6jZBOPGZv7_vK-wWZ0MvGgiCnAiUgQ@mail.gmail.com>

This partially reverts commit 40d9dd4f1c5dcd0d4a2a1f0efcd225c9c4b36d6f,
which does not only remove global variables from btcoex, as described on
its commit message, but also does a few other things -- in particular,
setting the default antenna position to BTC_ANTENNA_AT_MAIN_PORT.

This drastically affects the upload performance and signal strength on
the HP 240 G4 laptop, as shown by the results below:

Without this change:

 $ sudo iw dev wlp2s0 scan | grep -B3 JJ | grep signal
 	signal: -42.00 dBm
 $ iperf3 -c 192.168.1.254
 Connecting to host 192.168.1.254, port 5201
 [  4] local 192.168.1.253 port 39678 connected to 192.168.1.254 port 5201
 [ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
 [  4]   0.00-1.00   sec   735 KBytes  6.02 Mbits/sec    1   1.41 KBytes
 [  4]   1.00-2.00   sec   274 KBytes  2.25 Mbits/sec    1   1.41 KBytes
 [  4]   2.00-3.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 [  4]   3.00-4.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 [  4]   4.00-5.00   sec  0.00 Bytes  0.00 bits/sec    1   28.3 KBytes
 [  4]   5.00-6.00   sec   423 KBytes  3.47 Mbits/sec    3   41.0 KBytes
 [  4]   6.00-7.00   sec   840 KBytes  6.88 Mbits/sec    0   58.0 KBytes
 [  4]   7.00-8.00   sec   830 KBytes  6.79 Mbits/sec    1   1.41 KBytes
 [  4]   8.00-9.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 [  4]   9.00-10.00  sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
 - - - - - - - - - - - - - - - - - - - - - - - - -
 [ ID] Interval           Transfer     Bandwidth       Retr
 [  4]   0.00-10.00  sec  3.03 MBytes  2.54 Mbits/sec    7             sender
 [  4]   0.00-10.00  sec  2.88 MBytes  2.41 Mbits/sec                  receiver

 iperf Done.

With this change:

 $ sudo iw dev wlp2s0 scan | grep -B3 JJ | grep signal
 	signal: -14.00 dBm
 $ iperf3 -c 192.168.1.254
 Connecting to host 192.168.1.254, port 5201
 [  4] local 192.168.1.253 port 59380 connected to 192.168.1.254 port 5201
 [ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
 [  4]   0.00-1.00   sec  4.63 MBytes  38.8 Mbits/sec    0    194 KBytes
 [  4]   1.00-2.00   sec  4.58 MBytes  38.4 Mbits/sec    0    273 KBytes
 [  4]   2.00-3.00   sec  5.05 MBytes  42.3 Mbits/sec    0    332 KBytes
 [  4]   3.00-4.00   sec  4.98 MBytes  41.8 Mbits/sec    0    393 KBytes
 [  4]   4.00-5.00   sec  4.76 MBytes  39.9 Mbits/sec    0    434 KBytes
 [  4]   5.00-6.00   sec  4.85 MBytes  40.7 Mbits/sec    0    434 KBytes
 [  4]   6.00-7.00   sec  3.96 MBytes  33.2 Mbits/sec    0    464 KBytes
 [  4]   7.00-8.00   sec  4.74 MBytes  39.8 Mbits/sec    0    481 KBytes
 [  4]   8.00-9.00   sec  4.22 MBytes  35.4 Mbits/sec    0    508 KBytes
 [  4]   9.00-10.00  sec  4.09 MBytes  34.3 Mbits/sec    0    564 KBytes
 - - - - - - - - - - - - - - - - - - - - - - - - -
 [ ID] Interval           Transfer     Bandwidth       Retr
 [  4]   0.00-10.00  sec  45.9 MBytes  38.5 Mbits/sec    0             sender
 [  4]   0.00-10.00  sec  45.0 MBytes  37.7 Mbits/sec                  receiver

 iperf Done.

Signed-off-by: João Paulo Rechi Vita <jprvita@endlessm.com>
---
 drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c
index b026e80940a4..86182b917c92 100644
--- a/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c
+++ b/drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c
@@ -1388,9 +1388,6 @@ bool exhalbtc_bind_bt_coex_withadapter(void *adapter)
 	ant_num = rtl_get_hwpg_ant_num(rtlpriv);
 	exhalbtc_set_ant_num(rtlpriv, BT_COEX_ANT_TYPE_PG, ant_num);
 
-	/* set default antenna position to main  port */
-	btcoexist->board_info.btdm_ant_pos = BTC_ANTENNA_AT_MAIN_PORT;
-
 	single_ant_path = rtl_get_hwpg_single_ant_path(rtlpriv);
 	exhalbtc_set_single_ant_path(btcoexist, single_ant_path);
 
-- 
2.17.0

^ permalink raw reply related

* Re: Silently dropped UDP packets on kernel 4.14
From: Kristian Evensen @ 2018-05-01 22:42 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Netfilter Development Mailing list, Network Development
In-Reply-To: <20180501222406.fy5rph2t4dzndhu2@breakpoint.cc>

Hi,

Thanks for your quick and detailed reply!

On Wed, May 2, 2018 at 12:24 AM, Florian Westphal <fw@strlen.de> wrote:
> I'm not sure what the best way to solve this is, we either need
> to insert earlier in nfqueue case, or do conflict resolution in nfqueue
> case (and perhaps also nat undo? not sure).

My knowledge of the conntrack/nat subsystem is not that great, and I
don't know the implications of what I am about to suggest. However,
considering that the two packets represent the same flow, wouldn't it
be possible to apply the existing nat-mapping to the second packet,
and then let the second packet pass?

BR,
Kristian

^ permalink raw reply

* Re: RTL8723BE performance regression
From: João Paulo Rechi Vita @ 2018-05-01 22:41 UTC (permalink / raw)
  To: Larry Finger
  Cc: Steve deRosier, Yan-Hsuan Chuang, Ping-Ke Shih, Birming Chiu,
	Shaofu, Steven Ting, Chaoming Li, Kalle Valo, linux-wireless,
	Network Development, LKML, Daniel Drake,
	João Paulo Rechi Vita, linux
In-Reply-To: <059b40f0-b8e2-b55f-92d5-a859ba4204a4@lwfinger.net>

On Tue, Apr 3, 2018 at 7:51 PM, Larry Finger <Larry.Finger@lwfinger.net> wrote:
> On 04/03/2018 09:37 PM, João Paulo Rechi Vita wrote:
>>
>> On Tue, Apr 3, 2018 at 7:28 PM, Larry Finger <Larry.Finger@lwfinger.net>
>> wrote:
>>
>> (...)
>>
>>> As the antenna selection code changes affected your first bisection, do
>>> you
>>> have one of those HP laptops with only one antenna and the incorrect
>>> coding
>>> in the FUSE?
>>
>>
>> Yes, that is why I've been passing ant_sel=1 during my tests -- this
>> was needed to achieve a good performance in the past, before this
>> regression. I've also opened the laptop chassis and confirmed the
>> antenna cable is plugged to the connector labeled with "1" on the
>> card.
>>
>>> If so, please make sure that you still have the same signal
>>> strength for good and bad cases. I have tried to keep the driver and the
>>> btcoex code in sync, but there may be some combinations of antenna
>>> configuration and FUSE contents that cause the code to fail.
>>>
>>
>> What is the recommended way to monitor the signal strength?
>
>
> The btcoex code is developed for multiple platforms by a different group
> than the Linux driver. I think they made a change that caused ant_sel to
> switch from 1 to 2. At least numerous comments at
> github.com/lwfinger/rtlwifi_new claimed they needed to make that change.
>
> My recommended method is to verify the wifi device name with "iw dev". Then
> using that device
>
> sudo iw dev <dev_name> scan | egrep "SSID|signal"
>

I have confirmed that the performance regression is indeed tied to
signal strength: in the good cases signal was between -16 and -8 dBm,
whereas in bad cases signal was always between -50 and -40 dBm. I've
also switched to testing bandwidth in controlled LAN environment using
iperf3, as suggested by Steve deRosier, with the DUT being the only
machine connected to the 2.4 GHz radio and the machine running the
iperf3 server connected via ethernet.

Using those two tests (iperf3 and signal strength) I've dug deeper
into the culprit I had found previously, commit 7937f02d1953,
reverting it partially and testing the resulting driver, to isolate
which change was causing the problem. Besides "hooking up external
functions for newer ICs", as described by the commit message, that
commit also added code to decide whether ex_btc8723b1ant_*() or
ex_btc8723b2ant_*() functions should be used in halbtcoutsrc.c,
depending on the value of board_info.btdm_ant_num, whereas before that
commit ex_btc8723b2ant_*() were always used. Reverting to always using
ex_btc8723b2ant_*() functions fixes the regression on v4.15.

I've also tried to bisect between v4.15..v4.16 to find what else was
causing problems there, as the changes mentioned above on top of v4.16
did not solve the problem. The bisect pointed to "874e837d67d0
rtlwifi: fill FW version and subversion", but reverting only that plus
the changes mentioned above also didn't yield good results. That's
when I decided to get a bit creative: starting on v4.16 I first
applied the changes to have ex_btc8723b2ant_*() always being used, as
mentioned above, then reverted every commit between v4.15..v4.16
affecting drivers/net/wireless/realtek/rtlwifi/, and verified the
resulting kernel had a good performance. Then I started trimming down
the history and testing along the way, to reduce to the minimum set of
changes that had to be reverted in order to restore the good
performance. In addition to the ex_btc8723b2ant_*() changes and
reverting "874e837d67d0 rtlwifi: fill FW version and subversion", I've
also had to remove the following lines from
drivers/net/wireless/realtek/rtlwifi/btcoexist/halbtcoutsrc.c, which
were introduced by "40d9dd4f1c5d rtlwifi: btcoex: Remove global
variables from btcoex", in order to restore the upload performance and
signal strength.

        /* set default antenna position to main  port */
        btcoexist->board_info.btdm_ant_pos = BTC_ANTENNA_AT_MAIN_PORT;

These are the results I've got on v4.16 (similarly on
wireless-drivers-next-for-davem-2018-03-29 or v4.15):

     $ sudo iw dev wlp2s0 scan | grep -B3 JJ | grep signal
            signal: -42.00 dBm
     $ iperf3 -c 192.168.1.254
     Connecting to host 192.168.1.254, port 5201
     [  4] local 192.168.1.253 port 39678 connected to 192.168.1.254 port 5201
     [ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
     [  4]   0.00-1.00   sec   735 KBytes  6.02 Mbits/sec    1   1.41 KBytes
     [  4]   1.00-2.00   sec   274 KBytes  2.25 Mbits/sec    1   1.41 KBytes
     [  4]   2.00-3.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
     [  4]   3.00-4.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
     [  4]   4.00-5.00   sec  0.00 Bytes  0.00 bits/sec    1   28.3 KBytes
     [  4]   5.00-6.00   sec   423 KBytes  3.47 Mbits/sec    3   41.0 KBytes
     [  4]   6.00-7.00   sec   840 KBytes  6.88 Mbits/sec    0   58.0 KBytes
     [  4]   7.00-8.00   sec   830 KBytes  6.79 Mbits/sec    1   1.41 KBytes
     [  4]   8.00-9.00   sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
     [  4]   9.00-10.00  sec  0.00 Bytes  0.00 bits/sec    0   1.41 KBytes
     - - - - - - - - - - - - - - - - - - - - - - - - -
     [ ID] Interval           Transfer     Bandwidth       Retr
     [  4]   0.00-10.00  sec  3.03 MBytes  2.54 Mbits/sec    7
    sender
     [  4]   0.00-10.00  sec  2.88 MBytes  2.41 Mbits/sec
    receiver

     iperf Done.

And these are the results on v4.16 with all the changes mentioned on
the previous paragraph, and similar results were obtained when
applying the same changes on the current wireless-drivers-next master
branch (f56324baf329):

     $ sudo iw dev wlp2s0 scan | grep -B3 JJ | grep signal
            signal: -14.00 dBm
     $ iperf3 -c 192.168.1.254
     Connecting to host 192.168.1.254, port 5201
     [  4] local 192.168.1.253 port 59380 connected to 192.168.1.254 port 5201
     [ ID] Interval           Transfer     Bandwidth       Retr  Cwnd
     [  4]   0.00-1.00   sec  4.63 MBytes  38.8 Mbits/sec    0    194 KBytes
     [  4]   1.00-2.00   sec  4.58 MBytes  38.4 Mbits/sec    0    273 KBytes
     [  4]   2.00-3.00   sec  5.05 MBytes  42.3 Mbits/sec    0    332 KBytes
     [  4]   3.00-4.00   sec  4.98 MBytes  41.8 Mbits/sec    0    393 KBytes
     [  4]   4.00-5.00   sec  4.76 MBytes  39.9 Mbits/sec    0    434 KBytes
     [  4]   5.00-6.00   sec  4.85 MBytes  40.7 Mbits/sec    0    434 KBytes
     [  4]   6.00-7.00   sec  3.96 MBytes  33.2 Mbits/sec    0    464 KBytes
     [  4]   7.00-8.00   sec  4.74 MBytes  39.8 Mbits/sec    0    481 KBytes
     [  4]   8.00-9.00   sec  4.22 MBytes  35.4 Mbits/sec    0    508 KBytes
     [  4]   9.00-10.00  sec  4.09 MBytes  34.3 Mbits/sec    0    564 KBytes
     - - - - - - - - - - - - - - - - - - - - - - - - -
     [ ID] Interval           Transfer     Bandwidth       Retr
     [  4]   0.00-10.00  sec  45.9 MBytes  38.5 Mbits/sec    0
    sender
     [  4]   0.00-10.00  sec  45.0 MBytes  37.7 Mbits/sec
    receiver

     iperf Done.

I'll send an RFC patchset with the changes that I mentioned above.
Please advise whether those changes themselves should be merged, or if
you suggest a different approach.

^ permalink raw reply

* Re: [PATCH net-next v7 1/3] vmcore: add API to collect hardware dump in second kernel
From: Eric W. Biederman @ 2018-05-01 22:41 UTC (permalink / raw)
  To: Rahul Lakkireddy
  Cc: netdev, kexec, linux-fsdevel, linux-kernel, davem, viro, stephen,
	akpm, torvalds, ganeshgr, nirranjan, indranil
In-Reply-To: <b6065b53c5446d98ee55e09119f6821f979418d4.1525197408.git.rahul.lakkireddy@chelsio.com>

Rahul Lakkireddy <rahul.lakkireddy@chelsio.com> writes:

> The sequence of actions done by device drivers to append their device
> specific hardware/firmware logs to /proc/vmcore are as follows:

Except for the missing include that the kbuild test robot caught,
I am not seeing any problems.

Acked-by: "Eric W. Biederman" <ebiederm@xmission.com>

> 1. During probe (before hardware is initialized), device drivers
> register to the vmcore module (via vmcore_add_device_dump()), with
> callback function, along with buffer size and log name needed for
> firmware/hardware log collection.
>
> 2. vmcore module allocates the buffer with requested size. It adds
> an Elf note and invokes the device driver's registered callback
> function.
>
> 3. Device driver collects all hardware/firmware logs into the buffer
> and returns control back to vmcore module.
>
> Ensure that the device dump buffer size is always aligned to page size
> so that it can be mmaped.
>
> Also, rename alloc_elfnotes_buf() to vmcore_alloc_buf() to make it more
> generic and reserve NT_VMCOREDD note type to indicate vmcore device
> dump.
>
> Suggested-by: Eric Biederman <ebiederm@xmission.com>.
> Signed-off-by: Rahul Lakkireddy <rahul.lakkireddy@chelsio.com>
> Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
> ---
> v7:
> - Removed "CHELSIO" vendor identifier in Elf Note name. Instead,
>   writing "LINUX".
> - Moved vmcoredd_header to new file include/uapi/linux/vmcore.h
> - Reworked vmcoredd_header to include Elf Note as part of the header
>   itself.
> - Removed vmcoredd_get_note_size().
> - Renamed vmcoredd_write_note() to vmcoredd_write_header().
> - Replaced all "unsigned long" with "unsigned int" for device dump
>   size since max size of Elf Word is u32.
>
> v6:
> - Reworked device dump elf note name to contain vendor identifier.
> - Added vmcoredd_header that precedes actual dump in the Elf Note.
> - Device dump's name is moved inside vmcoredd_header.
>
> v5:
> - Removed enabling CONFIG_PROC_VMCORE_DEVICE_DUMP by default and
>   updated help message to indicate that the driver must be present
>   in second kernel's initramfs to collect the underlying device
>   snapshot.
>
> v4:
> - Made __vmcore_add_device_dump() static.
> - Moved compile check to define vmcore_add_device_dump() to
>   crash_dump.h to fix compilation when vmcore.c is not compiled in.
> - Convert ---help--- to help in Kconfig as indicated by checkpatch.
> - Rebased to tip.
>
> v3:
> - Dropped sysfs crashdd module.
> - Added CONFIG_PROC_VMCORE_DEVICE_DUMP to allow configuring device
>   dump support.
> - Moved logic related to adding dumps from crashdd to vmcore module.
> - Rename all crashdd* to vmcoredd*.
>
> v2:
> - Added ABI Documentation for crashdd.
> - Directly use octal permission instead of macro.
>
> Changes since rfc v2:
> - Moved exporting crashdd from procfs to sysfs.  Suggested by
>   Stephen Hemminger <stephen@networkplumber.org>
> - Moved code from fs/proc/crashdd.c to fs/crashdd/ directory.
> - Replaced all proc API with sysfs API and updated comments.
> - Calling driver callback before creating the binary file under
>   crashdd sysfs.
> - Changed binary dump file permission from S_IRUSR to S_IRUGO.
> - Changed module name from CRASH_DRIVER_DUMP to CRASH_DEVICE_DUMP.
>
> rfc v2:
> - Collecting logs in 2nd kernel instead of during kernel panic.
>   Suggested by Eric Biederman <ebiederm@xmission.com>.
> - Patch added in this series.
>
>  fs/proc/Kconfig             |  15 +++++
>  fs/proc/vmcore.c            | 140 +++++++++++++++++++++++++++++++++++++++++---
>  include/linux/crash_dump.h  |  18 ++++++
>  include/linux/kcore.h       |   6 ++
>  include/uapi/linux/elf.h    |   1 +
>  include/uapi/linux/vmcore.h |  16 +++++
>  6 files changed, 187 insertions(+), 9 deletions(-)
>  create mode 100644 include/uapi/linux/vmcore.h
>
> diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
> index 1ade1206bb89..0eaeb41453f5 100644
> --- a/fs/proc/Kconfig
> +++ b/fs/proc/Kconfig
> @@ -43,6 +43,21 @@ config PROC_VMCORE
>          help
>          Exports the dump image of crashed kernel in ELF format.
>  
> +config PROC_VMCORE_DEVICE_DUMP
> +	bool "Device Hardware/Firmware Log Collection"
> +	depends on PROC_VMCORE
> +	default n
> +	help
> +	  After kernel panic, device drivers can collect the device
> +	  specific snapshot of their hardware or firmware before the
> +	  underlying devices are initialized in crash recovery kernel.
> +	  Note that the device driver must be present in the crash
> +	  recovery kernel's initramfs to collect its underlying device
> +	  snapshot.
> +
> +	  If you say Y here, the collected device dumps will be added
> +	  as ELF notes to /proc/vmcore.
> +
>  config PROC_SYSCTL
>  	bool "Sysctl support (/proc/sys)" if EXPERT
>  	depends on PROC_FS
> diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
> index a45f0af22a60..60fce8dfe4e3 100644
> --- a/fs/proc/vmcore.c
> +++ b/fs/proc/vmcore.c
> @@ -20,6 +20,7 @@
>  #include <linux/init.h>
>  #include <linux/crash_dump.h>
>  #include <linux/list.h>
> +#include <linux/mutex.h>
>  #include <linux/vmalloc.h>
>  #include <linux/pagemap.h>
>  #include <linux/uaccess.h>
> @@ -44,6 +45,12 @@ static u64 vmcore_size;
>  
>  static struct proc_dir_entry *proc_vmcore;
>  
> +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
> +/* Device Dump list and mutex to synchronize access to list */
> +static LIST_HEAD(vmcoredd_list);
> +static DEFINE_MUTEX(vmcoredd_mutex);
> +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
> +
>  /*
>   * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error
>   * The called function has to take care of module refcounting.
> @@ -302,10 +309,8 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
>  };
>  
>  /**
> - * alloc_elfnotes_buf - allocate buffer for ELF note segment in
> - *                      vmalloc memory
> - *
> - * @notes_sz: size of buffer
> + * vmcore_alloc_buf - allocate buffer in vmalloc memory
> + * @size: size of buffer
>   *
>   * If CONFIG_MMU is defined, use vmalloc_user() to allow users to mmap
>   * the buffer to user-space by means of remap_vmalloc_range().
> @@ -313,12 +318,12 @@ static const struct vm_operations_struct vmcore_mmap_ops = {
>   * If CONFIG_MMU is not defined, use vzalloc() since mmap_vmcore() is
>   * disabled and there's no need to allow users to mmap the buffer.
>   */
> -static inline char *alloc_elfnotes_buf(size_t notes_sz)
> +static inline char *vmcore_alloc_buf(size_t size)
>  {
>  #ifdef CONFIG_MMU
> -	return vmalloc_user(notes_sz);
> +	return vmalloc_user(size);
>  #else
> -	return vzalloc(notes_sz);
> +	return vzalloc(size);
>  #endif
>  }
>  
> @@ -665,7 +670,7 @@ static int __init merge_note_headers_elf64(char *elfptr, size_t *elfsz,
>  		return rc;
>  
>  	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
> -	*notes_buf = alloc_elfnotes_buf(*notes_sz);
> +	*notes_buf = vmcore_alloc_buf(*notes_sz);
>  	if (!*notes_buf)
>  		return -ENOMEM;
>  
> @@ -851,7 +856,7 @@ static int __init merge_note_headers_elf32(char *elfptr, size_t *elfsz,
>  		return rc;
>  
>  	*notes_sz = roundup(phdr_sz, PAGE_SIZE);
> -	*notes_buf = alloc_elfnotes_buf(*notes_sz);
> +	*notes_buf = vmcore_alloc_buf(*notes_sz);
>  	if (!*notes_buf)
>  		return -ENOMEM;
>  
> @@ -1145,6 +1150,120 @@ static int __init parse_crash_elf_headers(void)
>  	return 0;
>  }
>  
> +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
> +/**
> + * vmcoredd_write_header - Write vmcore device dump header at the
> + * beginning of the dump's buffer.
> + * @buf: Output buffer where the note is written
> + * @data: Dump info
> + * @size: Size of the dump
> + *
> + * Fills beginning of the dump's buffer with vmcore device dump header.
> + */
> +static void vmcoredd_write_header(void *buf, struct vmcoredd_data *data,
> +				  u32 size)
> +{
> +	struct vmcoredd_header *vdd_hdr = (struct vmcoredd_header *)buf;
> +
> +	vdd_hdr->n_namesz = sizeof(vdd_hdr->name);
> +	vdd_hdr->n_descsz = size + sizeof(vdd_hdr->dump_name);
> +	vdd_hdr->n_type = NT_VMCOREDD;
> +
> +	strncpy((char *)vdd_hdr->name, VMCOREDD_NOTE_NAME,
> +		sizeof(vdd_hdr->name));
> +	memcpy(vdd_hdr->dump_name, data->dump_name, sizeof(vdd_hdr->dump_name));
> +}
> +
> +/**
> + * vmcore_add_device_dump - Add a buffer containing device dump to vmcore
> + * @data: dump info.
> + *
> + * Allocate a buffer and invoke the calling driver's dump collect routine.
> + * Write Elf note at the beginning of the buffer to indicate vmcore device
> + * dump and add the dump to global list.
> + */
> +static int __vmcore_add_device_dump(struct vmcoredd_data *data)
> +{
> +	struct vmcoredd_node *dump;
> +	void *buf = NULL;
> +	size_t data_size;
> +	int ret;
> +
> +	if (!data || !strlen(data->dump_name) ||
> +	    !data->vmcoredd_callback || !data->size)
> +		return -EINVAL;
> +
> +	dump = vzalloc(sizeof(*dump));
> +	if (!dump) {
> +		ret = -ENOMEM;
> +		goto out_err;
> +	}
> +
> +	/* Keep size of the buffer page aligned so that it can be mmaped */
> +	data_size = roundup(sizeof(struct vmcoredd_header) + data->size,
> +			    PAGE_SIZE);
> +
> +	/* Allocate buffer for drivers to write their dumps */
> +	buf = vmcore_alloc_buf(data_size);
> +	if (!buf) {
> +		ret = -ENOMEM;
> +		goto out_err;
> +	}
> +
> +	vmcoredd_write_header(buf, data, data_size -
> +			      sizeof(struct vmcoredd_header));
> +
> +	/* Invoke the driver's dump collection routine */
> +	ret = data->vmcoredd_callback(data, buf +
> +				      sizeof(struct vmcoredd_header));
> +	if (ret)
> +		goto out_err;
> +
> +	dump->buf = buf;
> +	dump->size = data_size;
> +
> +	/* Add the dump to the vmcore device dump list */
> +	mutex_lock(&vmcoredd_mutex);
> +	list_add_tail(&dump->list, &vmcoredd_list);
> +	mutex_unlock(&vmcoredd_mutex);
> +
> +	return 0;
> +
> +out_err:
> +	if (buf)
> +		vfree(buf);
> +
> +	if (dump)
> +		vfree(dump);
> +
> +	return ret;
> +}
> +
> +int vmcore_add_device_dump(struct vmcoredd_data *data)
> +{
> +	return __vmcore_add_device_dump(data);
> +}
> +EXPORT_SYMBOL(vmcore_add_device_dump);
> +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
> +
> +/* Free all dumps in vmcore device dump list */
> +static void vmcore_free_device_dumps(void)
> +{
> +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
> +	mutex_lock(&vmcoredd_mutex);
> +	while (!list_empty(&vmcoredd_list)) {
> +		struct vmcoredd_node *dump;
> +
> +		dump = list_first_entry(&vmcoredd_list, struct vmcoredd_node,
> +					list);
> +		list_del(&dump->list);
> +		vfree(dump->buf);
> +		vfree(dump);
> +	}
> +	mutex_unlock(&vmcoredd_mutex);
> +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
> +}
> +
>  /* Init function for vmcore module. */
>  static int __init vmcore_init(void)
>  {
> @@ -1192,4 +1311,7 @@ void vmcore_cleanup(void)
>  		kfree(m);
>  	}
>  	free_elfcorebuf();
> +
> +	/* clear vmcore device dump list */
> +	vmcore_free_device_dumps();
>  }
> diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h
> index f7ac2aa93269..3e4ba9d753c8 100644
> --- a/include/linux/crash_dump.h
> +++ b/include/linux/crash_dump.h
> @@ -5,6 +5,7 @@
>  #include <linux/kexec.h>
>  #include <linux/proc_fs.h>
>  #include <linux/elf.h>
> +#include <uapi/linux/vmcore.h>
>  
>  #include <asm/pgtable.h> /* for pgprot_t */
>  
> @@ -93,4 +94,21 @@ static inline bool is_kdump_kernel(void) { return 0; }
>  #endif /* CONFIG_CRASH_DUMP */
>  
>  extern unsigned long saved_max_pfn;
> +
> +/* Device Dump information to be filled by drivers */
> +struct vmcoredd_data {
> +	char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */
> +	unsigned int size;                       /* Size of the dump */
> +	/* Driver's registered callback to be invoked to collect dump */
> +	int (*vmcoredd_callback)(struct vmcoredd_data *data, void *buf);
> +};
> +
> +#ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP
> +int vmcore_add_device_dump(struct vmcoredd_data *data);
> +#else
> +static inline int vmcore_add_device_dump(struct vmcoredd_data *data)
> +{
> +	return -EOPNOTSUPP;
> +}
> +#endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */
>  #endif /* LINUX_CRASHDUMP_H */
> diff --git a/include/linux/kcore.h b/include/linux/kcore.h
> index 80db19d3a505..8de55e4b5ee9 100644
> --- a/include/linux/kcore.h
> +++ b/include/linux/kcore.h
> @@ -28,6 +28,12 @@ struct vmcore {
>  	loff_t offset;
>  };
>  
> +struct vmcoredd_node {
> +	struct list_head list;	/* List of dumps */
> +	void *buf;		/* Buffer containing device's dump */
> +	unsigned int size;	/* Size of the buffer */
> +};
> +
>  #ifdef CONFIG_PROC_KCORE
>  extern void kclist_add(struct kcore_list *, void *, size_t, int type);
>  #else
> diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
> index e2535d6dcec7..4e12c423b9fe 100644
> --- a/include/uapi/linux/elf.h
> +++ b/include/uapi/linux/elf.h
> @@ -421,6 +421,7 @@ typedef struct elf64_shdr {
>  #define NT_ARM_SYSTEM_CALL	0x404	/* ARM system call number */
>  #define NT_ARM_SVE	0x405		/* ARM Scalable Vector Extension registers */
>  #define NT_ARC_V2	0x600		/* ARCv2 accumulator/extra registers */
> +#define NT_VMCOREDD	0x700		/* Vmcore Device Dump Note */
>  
>  /* Note header in a PT_NOTE section */
>  typedef struct elf32_note {
> diff --git a/include/uapi/linux/vmcore.h b/include/uapi/linux/vmcore.h
> new file mode 100644
> index 000000000000..f9eab9a37370
> --- /dev/null
> +++ b/include/uapi/linux/vmcore.h
> @@ -0,0 +1,16 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _UAPI_VMCORE_H
> +#define _UAPI_VMCORE_H
> +
> +#define VMCOREDD_NOTE_NAME "LINUX"
> +#define VMCOREDD_MAX_NAME_BYTES 44
> +
> +struct vmcoredd_header {
> +	__u32 n_namesz; /* Name size */
> +	__u32 n_descsz; /* Content size */
> +	__u32 n_type;   /* NT_VMCOREDD */
> +	__u8 name[8];   /* LINUX\0\0\0 */
> +	__u8 dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Device dump's name */
> +};
> +
> +#endif /* _UAPI_VMCORE_H */

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox