Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH net-next] qed*: Utilize FW 8.37.7.0
From: David Miller @ 2018-09-06 14:44 UTC (permalink / raw)
  To: denis.bolotin; +Cc: netdev, ariel.elior
In-Reply-To: <20180905153555.2661-1-denis.bolotin@cavium.com>

From: Denis Bolotin <denis.bolotin@cavium.com>
Date: Wed, 5 Sep 2018 18:35:55 +0300

> This patch adds a new qed firmware with fixes and support for new features.
> 
> Fixes:
> - Fix a rare case of device crash with iWARP, iSCSI or FCoE offload.
> - Fix GRE tunneled traffic when iWARP offload is enabled.
> - Fix RoCE failure in ib_send_bw when using inline data.
> - Fix latency optimization flow for inline WQEs.
> - BigBear 100G fix
> 
> RDMA:
> - Reduce task context size.
> - Application page sizes above 2GB support.
> - Performance improvements.
> 
> ETH:
> - Tenant DCB support.
> - Replace RSS indirection table update interface.
> 
> Misc:
> - Debug Tools changes.
> 
> Signed-off-by: Denis Bolotin <denis.bolotin@cavium.com>
> Signed-off-by: Ariel Elior <ariel.elior@cavium.com>

Applied, thanks.

^ permalink raw reply

* [PATCH] net/sock: move memory_allocated over to percpu_counter variables
From: Olof Johansson @ 2018-09-06 19:20 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller
  Cc: Neil Horman, Marcelo Ricardo Leitner, Vlad Yasevich, Herbert Xu,
	Alexey Kuznetsov, Hideaki YOSHIFUJI, linux-crypto, linux-kernel,
	linux-sctp, netdev, linux-decnet-user, kernel-team,
	Olof Johansson

Today these are all global shared variables per protocol, and in
particular tcp_memory_allocated can get hot on a system with
a large number of CPUs and a substantial number of connections.

Moving it over to a per-cpu variable makes it significantly cheaper,
and the added overhead when summing up the percpu copies is still smaller
than the cost of having a hot cacheline bouncing around.

Signed-off-by: Olof Johansson <olof@lixom.net>
---
 crypto/af_alg.c         | 10 ++++++++--
 include/net/sctp/sctp.h |  3 ++-
 include/net/sock.h      | 12 ++++++------
 include/net/tcp.h       |  2 +-
 include/net/udp.h       |  2 +-
 net/core/sock.c         |  5 ++++-
 net/decnet/af_decnet.c  |  3 ++-
 net/ipv4/tcp.c          |  3 ++-
 net/ipv4/udp.c          |  4 +++-
 net/sctp/protocol.c     |  6 ++++++
 net/sctp/socket.c       |  2 +-
 11 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/crypto/af_alg.c b/crypto/af_alg.c
index b053179e0bc5..1fd75a709d7b 100644
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -29,7 +29,7 @@ struct alg_type_list {
 	struct list_head list;
 };
 
-static atomic_long_t alg_memory_allocated;
+static struct percpu_counter alg_memory_allocated;
 
 static struct proto alg_proto = {
 	.name			= "ALG",
@@ -1183,13 +1183,19 @@ static int __init af_alg_init(void)
 	if (err)
 		goto out;
 
-	err = sock_register(&alg_family);
+	err = percpu_counter_init(&alg_memory_allocated, 0, GFP_KERNEL);
 	if (err != 0)
 		goto out_unregister_proto;
 
+	err = sock_register(&alg_family);
+	if (err != 0)
+		goto out_free_percpu;
+
 out:
 	return err;
 
+out_free_percpu:
+	percpu_counter_destroy(&alg_memory_allocated);
 out_unregister_proto:
 	proto_unregister(&alg_proto);
 	goto out;
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 8c2caa370e0f..270579cf310b 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -36,7 +36,7 @@
  *    Sridhar Samudrala     <sri@us.ibm.com>
  *    Ardelle Fan           <ardelle.fan@intel.com>
  *    Ryan Layer            <rmlayer@us.ibm.com>
- *    Kevin Gao             <kevin.gao@intel.com> 
+ *    Kevin Gao             <kevin.gao@intel.com>
  */
 
 #ifndef __net_sctp_h__
@@ -114,6 +114,7 @@ __poll_t sctp_poll(struct file *file, struct socket *sock,
 void sctp_sock_rfree(struct sk_buff *skb);
 void sctp_copy_sock(struct sock *newsk, struct sock *sk,
 		    struct sctp_association *asoc);
+extern struct percpu_counter sctp_memory_allocated;
 extern struct percpu_counter sctp_sockets_allocated;
 int sctp_asconf_mgmt(struct sctp_sock *, struct sctp_sockaddr_entry *);
 struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *);
diff --git a/include/net/sock.h b/include/net/sock.h
index 433f45fc2d68..45aed5e84b5d 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1114,7 +1114,7 @@ struct proto {
 	/* Memory pressure */
 	void			(*enter_memory_pressure)(struct sock *sk);
 	void			(*leave_memory_pressure)(struct sock *sk);
-	atomic_long_t		*memory_allocated;	/* Current allocated memory. */
+	struct percpu_counter	*memory_allocated;	/* Current allocated memory. */
 	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 	/*
 	 * Pressure flag: try to collapse.
@@ -1237,19 +1237,19 @@ static inline bool sk_under_memory_pressure(const struct sock *sk)
 static inline long
 sk_memory_allocated(const struct sock *sk)
 {
-	return atomic_long_read(sk->sk_prot->memory_allocated);
+	return percpu_counter_sum_positive(sk->sk_prot->memory_allocated);
 }
 
-static inline long
+static inline void
 sk_memory_allocated_add(struct sock *sk, int amt)
 {
-	return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
+	percpu_counter_add(sk->sk_prot->memory_allocated, amt);
 }
 
 static inline void
 sk_memory_allocated_sub(struct sock *sk, int amt)
 {
-	atomic_long_sub(amt, sk->sk_prot->memory_allocated);
+	percpu_counter_sub(sk->sk_prot->memory_allocated, amt);
 }
 
 static inline void sk_sockets_allocated_dec(struct sock *sk)
@@ -1277,7 +1277,7 @@ proto_sockets_allocated_sum_positive(struct proto *prot)
 static inline long
 proto_memory_allocated(struct proto *prot)
 {
-	return atomic_long_read(prot->memory_allocated);
+	return percpu_counter_sum_positive(prot->memory_allocated);
 }
 
 static inline bool
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 770917d0caa7..2df1754cf3ab 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -248,7 +248,7 @@ extern long sysctl_tcp_mem[3];
 #define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
 #define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */
 
-extern atomic_long_t tcp_memory_allocated;
+extern struct percpu_counter tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
 extern unsigned long tcp_memory_pressure;
 
diff --git a/include/net/udp.h b/include/net/udp.h
index 8482a990b0bb..9e0d9f7091a0 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -97,7 +97,7 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 
 extern struct proto udp_prot;
 
-extern atomic_long_t udp_memory_allocated;
+extern struct percpu_counter udp_memory_allocated;
 
 /* sysctl variables for udp */
 extern long sysctl_udp_mem[3];
diff --git a/net/core/sock.c b/net/core/sock.c
index 3730eb855095..0a755f6c8942 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2394,9 +2394,12 @@ EXPORT_SYMBOL(sk_wait_data);
 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
 {
 	struct proto *prot = sk->sk_prot;
-	long allocated = sk_memory_allocated_add(sk, amt);
+	long allocated;
 	bool charged = true;
 
+	sk_memory_allocated_add(sk, amt);
+	allocated = sk_memory_allocated(sk);
+
 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
 		goto suppress_allocation;
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 7d6ff983ba2c..f88af9ae4474 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -156,7 +156,7 @@ static const struct proto_ops dn_proto_ops;
 static DEFINE_RWLOCK(dn_hash_lock);
 static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE];
 static struct hlist_head dn_wild_sk;
-static atomic_long_t decnet_memory_allocated;
+static struct percpu_counter decnet_memory_allocated;
 
 static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen, int flags);
 static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags);
@@ -2356,6 +2356,7 @@ static int __init decnet_init(void)
 	int rc;
 
 	printk(banner);
+	percpu_counter_init(&decnet_memory_allocated, 0, GFP_KERNEL);
 
 	rc = proto_register(&dn_proto, 1);
 	if (rc != 0)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 8c4235c098fd..eb6531ba6bd3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(tcp_orphan_count);
 long sysctl_tcp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_tcp_mem);
 
-atomic_long_t tcp_memory_allocated;	/* Current allocated memory. */
+struct percpu_counter tcp_memory_allocated;	/* Current allocated memory. */
 EXPORT_SYMBOL(tcp_memory_allocated);
 
 #if IS_ENABLED(CONFIG_SMC)
@@ -3834,6 +3834,7 @@ void __init tcp_init(void)
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) >
 		     FIELD_SIZEOF(struct sk_buff, cb));
 
+	percpu_counter_init(&tcp_memory_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
 	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
 	inet_hashinfo_init(&tcp_hashinfo);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index f4e35b2ff8b8..6ec5d2f68ae7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -122,7 +122,7 @@ EXPORT_SYMBOL(udp_table);
 long sysctl_udp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_mem);
 
-atomic_long_t udp_memory_allocated;
+struct percpu_counter udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
 #define MAX_UDP_PORTS 65536
@@ -2923,6 +2923,8 @@ void __init udp_init(void)
 
 	__udp_sysctl_init(&init_net);
 
+	percpu_counter_init(&udp_memory_allocated, 0, GFP_KERNEL);
+
 	/* 16 spinlocks per cpu */
 	udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
 	udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index e948db29ab53..ca59ca0dc740 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1391,6 +1391,10 @@ static __init int sctp_init(void)
 	if (!sctp_chunk_cachep)
 		goto err_chunk_cachep;
 
+	status = percpu_counter_init(&sctp_memory_allocated, 0, GFP_KERNEL);
+	if (status)
+		goto err_percpu_memory_init;
+
 	status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL);
 	if (status)
 		goto err_percpu_counter_init;
@@ -1559,6 +1563,8 @@ static __init int sctp_init(void)
 err_ehash_alloc:
 	percpu_counter_destroy(&sctp_sockets_allocated);
 err_percpu_counter_init:
+	percpu_counter_destroy(&sctp_memory_allocated);
+err_percpu_memory_init:
 	kmem_cache_destroy(sctp_chunk_cachep);
 err_chunk_cachep:
 	kmem_cache_destroy(sctp_bucket_cachep);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index f73e9d38d5ba..60d55573baa5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -107,7 +107,7 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
 			      enum sctp_socket_type type);
 
 static unsigned long sctp_memory_pressure;
-static atomic_long_t sctp_memory_allocated;
+struct percpu_counter sctp_memory_allocated;
 struct percpu_counter sctp_sockets_allocated;
 
 static void sctp_enter_memory_pressure(struct sock *sk)
-- 
2.11.0

^ permalink raw reply related

* Re: KASAN: slab-out-of-bounds Read in _decode_session6
From: Dmitry Vyukov @ 2018-09-06 19:17 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric Dumazet, syzbot, Alexei Starovoitov, Daniel Borkmann,
	David Miller, Herbert Xu, Alexey Kuznetsov, LKML, netdev,
	Steffen Klassert, syzkaller-bugs, Hideaki YOSHIFUJI
In-Reply-To: <20180906172713.cxjoazoo7asqggb3@ast-mbp.dhcp.thefacebook.com>

On Thu, Sep 6, 2018 at 7:27 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Thu, Sep 06, 2018 at 12:00:26AM -0700, Eric Dumazet wrote:
>>
>>
>> On 09/05/2018 08:17 PM, syzbot wrote:
>> > syzbot has found a reproducer for the following crash on:
>> >
>> > HEAD commit:    b36fdc6853a3 Merge tag 'gpio-v4.19-2' of git://git.kernel...
>> > git tree:       upstream
>> > console output: https://syzkaller.appspot.com/x/log.txt?x=164938d1400000
>> > kernel config:  https://syzkaller.appspot.com/x/.config?x=4c7e83258d6e0156
>> > dashboard link: https://syzkaller.appspot.com/bug?extid=acffccec848dc13fe459
>> > compiler:       gcc (GCC) 8.0.1 20180413 (experimental)
>> > syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=115f172e400000
>> > C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16399be1400000
>> >
>> > IMPORTANT: if you fix the bug, please add the following tag to the commit:
>> > Reported-by: syzbot+acffccec848dc13fe459@syzkaller.appspotmail.com
>> >
>> > IPv6: ADDRCONF(NETDEV_UP): veth1: link is not ready
>> > IPv6: ADDRCONF(NETDEV_CHANGE): veth1: link becomes ready
>> > IPv6: ADDRCONF(NETDEV_CHANGE): veth0: link becomes ready
>> > 8021q: adding VLAN 0 to HW filter on device team0
>> > ==================================================================
>> > BUG: KASAN: slab-out-of-bounds in _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161
>> > Read of size 1 at addr ffff8801d4a67f07 by task syz-executor092/4673
>> >
>> > CPU: 1 PID: 4673 Comm: syz-executor092 Not tainted 4.19.0-rc2+ #223
>> > Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
>> > Call Trace:
>> >  __dump_stack lib/dump_stack.c:77 [inline]
>> >  dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
>> >  print_address_description+0x6c/0x20b mm/kasan/report.c:256
>> >  kasan_report_error mm/kasan/report.c:354 [inline]
>> >  kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412
>> >  __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430
>> >  _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161
>> >  __xfrm_decode_session+0x71/0x140 net/xfrm/xfrm_policy.c:2299
>> >  xfrm_decode_session include/net/xfrm.h:1232 [inline]
>> >  vti6_tnl_xmit+0x3fc/0x1bb1 net/ipv6/ip6_vti.c:542
>> >  __netdev_start_xmit include/linux/netdevice.h:4287 [inline]
>> >  netdev_start_xmit include/linux/netdevice.h:4296 [inline]
>> >  xmit_one net/core/dev.c:3216 [inline]
>> >  dev_hard_start_xmit+0x272/0xc10 net/core/dev.c:3232
>> >  __dev_queue_xmit+0x2ab2/0x3870 net/core/dev.c:3802
>> >  dev_queue_xmit+0x17/0x20 net/core/dev.c:3835
>> >  __bpf_tx_skb net/core/filter.c:2012 [inline]
>> >  __bpf_redirect_common net/core/filter.c:2050 [inline]
>> >  __bpf_redirect+0x5b7/0xae0 net/core/filter.c:2057
>> >  ____bpf_clone_redirect net/core/filter.c:2090 [inline]
>> >  bpf_clone_redirect+0x2f6/0x490 net/core/filter.c:2062
>> >  bpf_prog_c39d1ba309a769f7+0xe9e/0x1000
>> >
>> > Allocated by task 4673:
>> >  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>> >  set_track mm/kasan/kasan.c:460 [inline]
>> >  kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
>> >  __do_kmalloc_node mm/slab.c:3682 [inline]
>> >  __kmalloc_node_track_caller+0x47/0x70 mm/slab.c:3696
>> >  __kmalloc_reserve.isra.41+0x3a/0xe0 net/core/skbuff.c:137
>> >  pskb_expand_head+0x230/0x10e0 net/core/skbuff.c:1463
>> >  skb_ensure_writable+0x3dd/0x640 net/core/skbuff.c:5129
>> >  __bpf_try_make_writable net/core/filter.c:1633 [inline]
>> >  bpf_try_make_writable net/core/filter.c:1639 [inline]
>> >  bpf_try_make_head_writable net/core/filter.c:1647 [inline]
>> >  ____bpf_clone_redirect net/core/filter.c:2084 [inline]
>> >  bpf_clone_redirect+0x14a/0x490 net/core/filter.c:2062
>> >  bpf_prog_c39d1ba309a769f7+0xe9e/0x1000
>> >
>> > Freed by task 3286:
>> >  save_stack+0x43/0xd0 mm/kasan/kasan.c:448
>> >  set_track mm/kasan/kasan.c:460 [inline]
>> >  __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
>> >  kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
>> >  __cache_free mm/slab.c:3498 [inline]
>> >  kfree+0xd9/0x210 mm/slab.c:3813
>> >  load_elf_binary+0x2569/0x5610 fs/binfmt_elf.c:1118
>> >  search_binary_handler+0x17d/0x570 fs/exec.c:1653
>> >  exec_binprm fs/exec.c:1695 [inline]
>> >  __do_execve_file.isra.35+0x15ff/0x2460 fs/exec.c:1819
>> >  do_execveat_common fs/exec.c:1866 [inline]
>> >  do_execve fs/exec.c:1883 [inline]
>> >  __do_sys_execve fs/exec.c:1964 [inline]
>> >  __se_sys_execve fs/exec.c:1959 [inline]
>> >  __x64_sys_execve+0x8f/0xc0 fs/exec.c:1959
>> >  do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
>> >  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>> >
>> > The buggy address belongs to the object at ffff8801d4a67d00
>> >  which belongs to the cache kmalloc-512 of size 512
>> > The buggy address is located 7 bytes to the right of
>> >  512-byte region [ffff8801d4a67d00, ffff8801d4a67f00)
>> > The buggy address belongs to the page:
>> > page:ffffea00075299c0 count:1 mapcount:0 mapping:ffff8801dac00940 index:0x0
>> > flags: 0x2fffc0000000100(slab)
>> > raw: 02fffc0000000100 ffffea0007529988 ffffea0007529a48 ffff8801dac00940
>> > raw: 0000000000000000 ffff8801d4a67080 0000000100000006 0000000000000000
>> > page dumped because: kasan: bad access detected
>> >
>> > Memory state around the buggy address:
>> >  ffff8801d4a67e00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> >  ffff8801d4a67e80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> >> ffff8801d4a67f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>> >                    ^
>> >  ffff8801d4a67f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>> >  ffff8801d4a68000: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
>> > ==================================================================
>> >
>>
>>
>> What about :
>>
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index aecdeba052d3f0ff3d4f0a33ec36891f9738052c..a662f59786bd0677850c1c60a2c92faa6fb6c5bb 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -2081,7 +2081,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
>>          * here, we need to free the just generated clone to unclone once
>>          * again.
>>          */
>> -       ret = bpf_try_make_head_writable(skb);
>> +       ret = bpf_try_make_head_writable(clone);
>
> This part is fine. I think the bug is in _decode_session6,

Eric, you arrived at roughly the same conclusion, right?

> but I have a hard time reproducing the issue, so will appreciate
> if somebody can test the following patch:

syzbot can:
https://github.com/google/syzkaller/blob/master/docs/syzbot.md#testing-patches


> From 291f80f212461670d1e0140d06eee3071cf3e1ee Mon Sep 17 00:00:00 2001
> From: Alexei Starovoitov <ast@kernel.org>
> Date: Thu, 6 Sep 2018 10:23:29 -0700
> Subject: [PATCH] net/xfrm: fix out-of-bounds packet access
>
> BUG: KASAN: slab-out-of-bounds in _decode_session6+0x1331/0x14e0
> net/ipv6/xfrm6_policy.c:161
> Read of size 1 at addr ffff8801d882eec7 by task syz-executor1/6667
> Call Trace:
>   __dump_stack lib/dump_stack.c:77 [inline]
>   dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
>   print_address_description+0x6c/0x20b mm/kasan/report.c:256
>   kasan_report_error mm/kasan/report.c:354 [inline]
>   kasan_report.cold.7+0x242/0x30d mm/kasan/report.c:412
>   __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:430
>   _decode_session6+0x1331/0x14e0 net/ipv6/xfrm6_policy.c:161
>   __xfrm_decode_session+0x71/0x140 net/xfrm/xfrm_policy.c:2299
>   xfrm_decode_session include/net/xfrm.h:1232 [inline]
>   vti6_tnl_xmit+0x3c3/0x1bc1 net/ipv6/ip6_vti.c:542
>   __netdev_start_xmit include/linux/netdevice.h:4313 [inline]
>   netdev_start_xmit include/linux/netdevice.h:4322 [inline]
>   xmit_one net/core/dev.c:3217 [inline]
>   dev_hard_start_xmit+0x272/0xc10 net/core/dev.c:3233
>   __dev_queue_xmit+0x2ab2/0x3870 net/core/dev.c:3803
>   dev_queue_xmit+0x17/0x20 net/core/dev.c:3836
>
> Reported-by: syzbot+acffccec848dc13fe459@syzkaller.appspotmail.com
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>  net/ipv6/xfrm6_policy.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
> index ef3defaf43b9..d35bcf92969c 100644
> --- a/net/ipv6/xfrm6_policy.c
> +++ b/net/ipv6/xfrm6_policy.c
> @@ -146,8 +146,8 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl, int reverse)
>         fl6->daddr = reverse ? hdr->saddr : hdr->daddr;
>         fl6->saddr = reverse ? hdr->daddr : hdr->saddr;
>
> -       while (nh + offset + 1 < skb->data ||
> -              pskb_may_pull(skb, nh + offset + 1 - skb->data)) {
> +       while (nh + offset + sizeof(*exthdr) < skb->data ||
> +              pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) {
>                 nh = skb_network_header(skb);
>                 exthdr = (struct ipv6_opt_hdr *)(nh + offset);
>
> --
> 2.17.1
>
> --
> You received this message because you are subscribed to the Google Groups "syzkaller-bugs" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to syzkaller-bugs+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/syzkaller-bugs/20180906172713.cxjoazoo7asqggb3%40ast-mbp.dhcp.thefacebook.com.
> For more options, visit https://groups.google.com/d/optout.

^ permalink raw reply

* Re: [PATCH net] net/ipv6: fix incorrect fib6 gateway info after do redirect
From: David Ahern @ 2018-09-06 14:35 UTC (permalink / raw)
  To: Hangbin Liu, netdev; +Cc: David S. Miller
In-Reply-To: <1536238666-5307-1-git-send-email-liuhangbin@gmail.com>

On 9/6/18 6:57 AM, Hangbin Liu wrote:
> When we receive a redirect message and call rt6_do_redirect(), we allocate
> a new rt6_info and set new flags and gateway info, but do not propagate
> this info to fib6_info.
> 
> Then if a user tries to get the route info via `ip route get`, he will still
> get the old default gateway, because inet6_rtm_getroute() gets gateway info
> from fib6_info.
> 
> Fixes: 23fb93a4d3f11 ("net/ipv6: Cleanup exception and cache route handling")
> Reported-by: Jianlin Shi <jishi@redhat.com>
> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
> ---
>  net/ipv6/route.c | 4 ++++
>  1 file changed, 4 insertions(+)
> 
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 18e00ce..3d367c9 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -3446,6 +3446,10 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
>  		goto out;
>  	}
>  
> +	/* Update fib6_info from rt6_info */
> +	from->fib6_flags = rt->rt6i_flags;
> +	from->fib6_nh.nh_gw = rt->rt6i_gateway;
> +
>  	netevent.old = &rt->dst;
>  	netevent.new = &nrt->dst;
>  	netevent.daddr = &msg->dest;
> 

Only an exception should be inserted - and it is. The original route
should not be updated.

The code prior to the fib6_info did not update the actual FIB entry, and
the IPv4 code does not update the original route.

^ permalink raw reply

* [PATCH v2 net-next 4/4] net/core: handle GRO_NORMAL skbs as a list in napi_gro_receive_list
From: Edward Cree @ 2018-09-06 14:26 UTC (permalink / raw)
  To: davem; +Cc: linux-net-drivers, netdev
In-Reply-To: <c1e79c86-56ae-98c6-8dc0-c227f91ee9bc@solarflare.com>

Allows GRO-using drivers to get the benefits of batching for non-GROable
 traffic.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 net/core/dev.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 69e2819994e4..9a937d2ac83b 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5617,6 +5617,7 @@ EXPORT_SYMBOL(napi_gro_receive);
 int napi_gro_receive_list(struct napi_struct *napi, struct list_head *head)
 {
 	struct sk_buff *skb, *next;
+	struct list_head sublist;
 	gro_result_t result;
 	int kept = 0;
 
@@ -5626,14 +5627,26 @@ int napi_gro_receive_list(struct napi_struct *napi, struct list_head *head)
 		skb_gro_reset_offset(skb);
 	}
 
+	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
 		list_del(&skb->list);
 		skb->next = NULL;
 		result = dev_gro_receive(napi, skb);
-		result = napi_skb_finish(result, skb);
-		if (result != GRO_DROP)
-			kept++;
+		if (result == GRO_NORMAL) {
+			list_add_tail(&skb->list, &sublist);
+			continue;
+		} else {
+			if (!list_empty(&sublist)) {
+				/* Handle the GRO_NORMAL skbs to prevent OoO */
+				kept += netif_receive_skb_list_internal(&sublist);
+				INIT_LIST_HEAD(&sublist);
+			}
+			result = napi_skb_finish(result, skb);
+			if (result != GRO_DROP)
+				kept++;
+		}
 	}
+	kept += netif_receive_skb_list_internal(&sublist);
 	return kept;
 }
 EXPORT_SYMBOL(napi_gro_receive_list);

^ permalink raw reply related

* [PATCH v2 net-next 3/4] net: make listified RX functions return number of good packets
From: Edward Cree @ 2018-09-06 14:26 UTC (permalink / raw)
  To: davem; +Cc: linux-net-drivers, netdev
In-Reply-To: <c1e79c86-56ae-98c6-8dc0-c227f91ee9bc@solarflare.com>

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/linux/netdevice.h |  4 +--
 include/net/ip.h          |  4 +--
 include/net/ipv6.h        |  4 +--
 net/core/dev.c            | 63 +++++++++++++++++++++++++++++------------------
 net/ipv4/ip_input.c       | 39 ++++++++++++++++++-----------
 net/ipv6/ip6_input.c      | 37 +++++++++++++++++-----------
 6 files changed, 92 insertions(+), 59 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 2b53536b1d99..9b3fc5944ba5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2349,7 +2349,7 @@ struct packet_type {
 					 struct net_device *,
 					 struct packet_type *,
 					 struct net_device *);
-	void			(*list_func) (struct list_head *,
+	int			(*list_func) (struct list_head *,
 					      struct packet_type *,
 					      struct net_device *);
 	bool			(*id_match)(struct packet_type *ptype,
@@ -3546,7 +3546,7 @@ int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
 int netif_receive_skb_core(struct sk_buff *skb);
-void netif_receive_skb_list(struct list_head *head);
+int netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 int napi_gro_receive_list(struct napi_struct *napi, struct list_head *head);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
diff --git a/include/net/ip.h b/include/net/ip.h
index e44b1a44f67a..aab1f7eea1e1 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -152,8 +152,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
 			  struct ip_options_rcu *opt);
 int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 	   struct net_device *orig_dev);
-void ip_list_rcv(struct list_head *head, struct packet_type *pt,
-		 struct net_device *orig_dev);
+int ip_list_rcv(struct list_head *head, struct packet_type *pt,
+		struct net_device *orig_dev);
 int ip_local_deliver(struct sk_buff *skb);
 int ip_mr_input(struct sk_buff *skb);
 int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index ff33f498c137..f15651eabfe0 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -914,8 +914,8 @@ static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6)
 
 int ipv6_rcv(struct sk_buff *skb, struct net_device *dev,
 	     struct packet_type *pt, struct net_device *orig_dev);
-void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
-		   struct net_device *orig_dev);
+int ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
+		  struct net_device *orig_dev);
 
 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 8df39ded77bd..69e2819994e4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4922,24 +4922,27 @@ int netif_receive_skb_core(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb_core);
 
-static inline void __netif_receive_skb_list_ptype(struct list_head *head,
-						  struct packet_type *pt_prev,
-						  struct net_device *orig_dev)
+static inline int __netif_receive_skb_list_ptype(struct list_head *head,
+						 struct packet_type *pt_prev,
+						 struct net_device *orig_dev)
 {
 	struct sk_buff *skb, *next;
+	int kept = 0;
 
 	if (!pt_prev)
-		return;
+		return 0;
 	if (list_empty(head))
-		return;
+		return 0;
 	if (pt_prev->list_func != NULL)
-		pt_prev->list_func(head, pt_prev, orig_dev);
+		kept = pt_prev->list_func(head, pt_prev, orig_dev);
 	else
 		list_for_each_entry_safe(skb, next, head, list)
-			pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
+			if (pt_prev->func(skb, skb->dev, pt_prev, orig_dev) == NET_RX_SUCCESS)
+				kept++;
+	return kept;
 }
 
-static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
+static int __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
 {
 	/* Fast-path assumptions:
 	 * - There is no RX handler.
@@ -4956,6 +4959,7 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
 	struct net_device *od_curr = NULL;
 	struct list_head sublist;
 	struct sk_buff *skb, *next;
+	int kept = 0, ret;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
@@ -4963,12 +4967,15 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
 		struct packet_type *pt_prev = NULL;
 
 		list_del(&skb->list);
-		__netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
-		if (!pt_prev)
+		ret = __netif_receive_skb_core(skb, pfmemalloc, &pt_prev);
+		if (!pt_prev) {
+			if (ret == NET_RX_SUCCESS)
+				kept++;
 			continue;
+		}
 		if (pt_curr != pt_prev || od_curr != orig_dev) {
 			/* dispatch old sublist */
-			__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+			kept += __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
 			/* start new sublist */
 			INIT_LIST_HEAD(&sublist);
 			pt_curr = pt_prev;
@@ -4978,7 +4985,8 @@ static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemallo
 	}
 
 	/* dispatch final sublist */
-	__netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+	kept += __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
+	return kept;
 }
 
 static int __netif_receive_skb(struct sk_buff *skb)
@@ -5006,11 +5014,12 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	return ret;
 }
 
-static void __netif_receive_skb_list(struct list_head *head)
+static int __netif_receive_skb_list(struct list_head *head)
 {
 	unsigned long noreclaim_flag = 0;
 	struct sk_buff *skb, *next;
 	bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
+	int kept = 0;
 
 	list_for_each_entry_safe(skb, next, head, list) {
 		if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
@@ -5019,7 +5028,7 @@ static void __netif_receive_skb_list(struct list_head *head)
 			/* Handle the previous sublist */
 			list_cut_before(&sublist, head, &skb->list);
 			if (!list_empty(&sublist))
-				__netif_receive_skb_list_core(&sublist, pfmemalloc);
+				kept += __netif_receive_skb_list_core(&sublist, pfmemalloc);
 			pfmemalloc = !pfmemalloc;
 			/* See comments in __netif_receive_skb */
 			if (pfmemalloc)
@@ -5030,10 +5039,11 @@ static void __netif_receive_skb_list(struct list_head *head)
 	}
 	/* Handle the remaining sublist */
 	if (!list_empty(head))
-		__netif_receive_skb_list_core(head, pfmemalloc);
+		kept += __netif_receive_skb_list_core(head, pfmemalloc);
 	/* Restore pflags */
 	if (pfmemalloc)
 		memalloc_noreclaim_restore(noreclaim_flag);
+	return kept;
 }
 
 static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
@@ -5109,17 +5119,20 @@ static int netif_receive_skb_internal(struct sk_buff *skb)
 	return ret;
 }
 
-static void netif_receive_skb_list_internal(struct list_head *head)
+static int netif_receive_skb_list_internal(struct list_head *head)
 {
 	struct bpf_prog *xdp_prog = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
+	int kept = 0;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
 		net_timestamp_check(netdev_tstamp_prequeue, skb);
 		list_del(&skb->list);
-		if (!skb_defer_rx_timestamp(skb))
+		if (skb_defer_rx_timestamp(skb))
+			kept++;
+		else
 			list_add_tail(&skb->list, &sublist);
 	}
 	list_splice_init(&sublist, head);
@@ -5149,13 +5162,15 @@ static void netif_receive_skb_list_internal(struct list_head *head)
 			if (cpu >= 0) {
 				/* Will be handled, remove from list */
 				list_del(&skb->list);
-				enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
+				if (enqueue_to_backlog(skb, cpu, &rflow->last_qtail) == NET_RX_SUCCESS)
+					kept++;
 			}
 		}
 	}
 #endif
-	__netif_receive_skb_list(head);
+	kept += __netif_receive_skb_list(head);
 	rcu_read_unlock();
+	return kept;
 }
 
 /**
@@ -5185,21 +5200,21 @@ EXPORT_SYMBOL(netif_receive_skb);
  *	netif_receive_skb_list - process many receive buffers from network
  *	@head: list of skbs to process.
  *
- *	Since return value of netif_receive_skb() is normally ignored, and
- *	wouldn't be meaningful for a list, this function returns void.
+ *	Returns the number of skbs for which netif_receive_skb() would have
+ *	returned %NET_RX_SUCCESS.
  *
  *	This function may only be called from softirq context and interrupts
  *	should be enabled.
  */
-void netif_receive_skb_list(struct list_head *head)
+int netif_receive_skb_list(struct list_head *head)
 {
 	struct sk_buff *skb;
 
 	if (list_empty(head))
-		return;
+		return 0;
 	list_for_each_entry(skb, head, list)
 		trace_netif_receive_skb_list_entry(skb);
-	netif_receive_skb_list_internal(head);
+	return netif_receive_skb_list_internal(head);
 }
 EXPORT_SYMBOL(netif_receive_skb_list);
 
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 3196cf58f418..75cc5a6ef9b8 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -526,9 +526,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
 		       ip_rcv_finish);
 }
 
-static void ip_sublist_rcv_finish(struct list_head *head)
+static int ip_sublist_rcv_finish(struct list_head *head)
 {
 	struct sk_buff *skb, *next;
+	int kept = 0;
 
 	list_for_each_entry_safe(skb, next, head, list) {
 		list_del(&skb->list);
@@ -536,16 +537,19 @@ static void ip_sublist_rcv_finish(struct list_head *head)
 		 * another kind of SKB-list usage (see validate_xmit_skb_list)
 		 */
 		skb->next = NULL;
-		dst_input(skb);
+		if (dst_input(skb) == NET_RX_SUCCESS)
+			kept++;
 	}
+	return kept;
 }
 
-static void ip_list_rcv_finish(struct net *net, struct sock *sk,
-			       struct list_head *head)
+static int ip_list_rcv_finish(struct net *net, struct sock *sk,
+			      struct list_head *head)
 {
 	struct dst_entry *curr_dst = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
+	int kept = 0;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
@@ -556,8 +560,10 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		 * skb to its handler for processing
 		 */
 		skb = l3mdev_ip_rcv(skb);
-		if (!skb)
+		if (!skb) {
+			kept++;
 			continue;
+		}
 		if (ip_rcv_finish_core(net, sk, skb) == NET_RX_DROP)
 			continue;
 
@@ -565,7 +571,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		if (curr_dst != dst) {
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
-				ip_sublist_rcv_finish(&sublist);
+				kept += ip_sublist_rcv_finish(&sublist);
 			/* start new sublist */
 			INIT_LIST_HEAD(&sublist);
 			curr_dst = dst;
@@ -573,25 +579,27 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
 		list_add_tail(&skb->list, &sublist);
 	}
 	/* dispatch final sublist */
-	ip_sublist_rcv_finish(&sublist);
+	kept += ip_sublist_rcv_finish(&sublist);
+	return kept;
 }
 
-static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
-			   struct net *net)
+static int ip_sublist_rcv(struct list_head *head, struct net_device *dev,
+			  struct net *net)
 {
 	NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
 		     head, dev, NULL, ip_rcv_finish);
-	ip_list_rcv_finish(net, NULL, head);
+	return ip_list_rcv_finish(net, NULL, head);
 }
 
-/* Receive a list of IP packets */
-void ip_list_rcv(struct list_head *head, struct packet_type *pt,
-		 struct net_device *orig_dev)
+/* Receive a list of IP packets; return number of successful receives */
+int ip_list_rcv(struct list_head *head, struct packet_type *pt,
+		struct net_device *orig_dev)
 {
 	struct net_device *curr_dev = NULL;
 	struct net *curr_net = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
+	int kept = 0;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
@@ -606,7 +614,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
 		if (curr_dev != dev || curr_net != net) {
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
-				ip_sublist_rcv(&sublist, curr_dev, curr_net);
+				kept += ip_sublist_rcv(&sublist, curr_dev, curr_net);
 			/* start new sublist */
 			INIT_LIST_HEAD(&sublist);
 			curr_dev = dev;
@@ -615,5 +623,6 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt,
 		list_add_tail(&skb->list, &sublist);
 	}
 	/* dispatch final sublist */
-	ip_sublist_rcv(&sublist, curr_dev, curr_net);
+	kept += ip_sublist_rcv(&sublist, curr_dev, curr_net);
+	return kept;
 }
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 6242682be876..e64b830c9f0f 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -76,20 +76,24 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 	return dst_input(skb);
 }
 
-static void ip6_sublist_rcv_finish(struct list_head *head)
+static int ip6_sublist_rcv_finish(struct list_head *head)
 {
 	struct sk_buff *skb, *next;
+	int kept = 0;
 
 	list_for_each_entry_safe(skb, next, head, list)
-		dst_input(skb);
+		if (dst_input(skb) == NET_RX_SUCCESS)
+			kept++;
+	return kept;
 }
 
-static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
-				struct list_head *head)
+static int ip6_list_rcv_finish(struct net *net, struct sock *sk,
+			       struct list_head *head)
 {
 	struct dst_entry *curr_dst = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
+	int kept = 0;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
@@ -100,14 +104,16 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
 		 * skb to its handler for processing
 		 */
 		skb = l3mdev_ip6_rcv(skb);
-		if (!skb)
+		if (!skb) {
+			kept++;
 			continue;
+		}
 		ip6_rcv_finish_core(net, sk, skb);
 		dst = skb_dst(skb);
 		if (curr_dst != dst) {
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
-				ip6_sublist_rcv_finish(&sublist);
+				kept += ip6_sublist_rcv_finish(&sublist);
 			/* start new sublist */
 			INIT_LIST_HEAD(&sublist);
 			curr_dst = dst;
@@ -115,7 +121,8 @@ static void ip6_list_rcv_finish(struct net *net, struct sock *sk,
 		list_add_tail(&skb->list, &sublist);
 	}
 	/* dispatch final sublist */
-	ip6_sublist_rcv_finish(&sublist);
+	kept += ip6_sublist_rcv_finish(&sublist);
+	return kept;
 }
 
 static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
@@ -273,22 +280,23 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
 		       ip6_rcv_finish);
 }
 
-static void ip6_sublist_rcv(struct list_head *head, struct net_device *dev,
-			    struct net *net)
+static int ip6_sublist_rcv(struct list_head *head, struct net_device *dev,
+			   struct net *net)
 {
 	NF_HOOK_LIST(NFPROTO_IPV6, NF_INET_PRE_ROUTING, net, NULL,
 		     head, dev, NULL, ip6_rcv_finish);
-	ip6_list_rcv_finish(net, NULL, head);
+	return ip6_list_rcv_finish(net, NULL, head);
 }
 
 /* Receive a list of IPv6 packets */
-void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
-		   struct net_device *orig_dev)
+int ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
+		  struct net_device *orig_dev)
 {
 	struct net_device *curr_dev = NULL;
 	struct net *curr_net = NULL;
 	struct sk_buff *skb, *next;
 	struct list_head sublist;
+	int kept = 0;
 
 	INIT_LIST_HEAD(&sublist);
 	list_for_each_entry_safe(skb, next, head, list) {
@@ -303,7 +311,7 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
 		if (curr_dev != dev || curr_net != net) {
 			/* dispatch old sublist */
 			if (!list_empty(&sublist))
-				ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+				kept += ip6_sublist_rcv(&sublist, curr_dev, curr_net);
 			/* start new sublist */
 			INIT_LIST_HEAD(&sublist);
 			curr_dev = dev;
@@ -312,7 +320,8 @@ void ipv6_list_rcv(struct list_head *head, struct packet_type *pt,
 		list_add_tail(&skb->list, &sublist);
 	}
 	/* dispatch final sublist */
-	ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+	kept += ip6_sublist_rcv(&sublist, curr_dev, curr_net);
+	return kept;
 }
 
 /*

^ permalink raw reply related

* [PATCH v2 net-next 2/4] sfc: use batched receive for GRO
From: Edward Cree @ 2018-09-06 14:26 UTC (permalink / raw)
  To: davem; +Cc: linux-net-drivers, netdev
In-Reply-To: <c1e79c86-56ae-98c6-8dc0-c227f91ee9bc@solarflare.com>

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 drivers/net/ethernet/sfc/efx.c        | 11 +++++++++--
 drivers/net/ethernet/sfc/net_driver.h |  1 +
 drivers/net/ethernet/sfc/rx.c         | 16 +++++++++++++---
 3 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c
index 330233286e78..dba13a28014c 100644
--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -263,9 +263,9 @@ static int efx_check_disabled(struct efx_nic *efx)
  */
 static int efx_process_channel(struct efx_channel *channel, int budget)
 {
+	struct list_head rx_list, gro_list;
 	struct efx_tx_queue *tx_queue;
-	struct list_head rx_list;
-	int spent;
+	int spent, gro_count;
 
 	if (unlikely(!channel->enabled))
 		return 0;
@@ -275,6 +275,10 @@ static int efx_process_channel(struct efx_channel *channel, int budget)
 	INIT_LIST_HEAD(&rx_list);
 	channel->rx_list = &rx_list;
 
+	EFX_WARN_ON_PARANOID(channel->gro_list != NULL);
+	INIT_LIST_HEAD(&gro_list);
+	channel->gro_list = &gro_list;
+
 	efx_for_each_channel_tx_queue(tx_queue, channel) {
 		tx_queue->pkts_compl = 0;
 		tx_queue->bytes_compl = 0;
@@ -300,6 +304,9 @@ static int efx_process_channel(struct efx_channel *channel, int budget)
 	/* Receive any packets we queued up */
 	netif_receive_skb_list(channel->rx_list);
 	channel->rx_list = NULL;
+	gro_count = napi_gro_receive_list(&channel->napi_str, channel->gro_list);
+	channel->irq_mod_score += gro_count * 2;
+	channel->gro_list = NULL;
 
 	return spent;
 }
diff --git a/drivers/net/ethernet/sfc/net_driver.h b/drivers/net/ethernet/sfc/net_driver.h
index 961b92979640..72addac7a84a 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -502,6 +502,7 @@ struct efx_channel {
 	unsigned int rx_pkt_index;
 
 	struct list_head *rx_list;
+	struct list_head *gro_list;
 
 	struct efx_rx_queue rx_queue;
 	struct efx_tx_queue tx_queue[EFX_TXQ_TYPES];
diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c
index 396ff01298cd..0534a54048c6 100644
--- a/drivers/net/ethernet/sfc/rx.c
+++ b/drivers/net/ethernet/sfc/rx.c
@@ -453,9 +453,19 @@ efx_rx_packet_gro(struct efx_channel *channel, struct efx_rx_buffer *rx_buf,
 
 	skb_record_rx_queue(skb, channel->rx_queue.core_index);
 
-	gro_result = napi_gro_frags(napi);
-	if (gro_result != GRO_DROP)
-		channel->irq_mod_score += 2;
+	/* Pass the packet up */
+	if (channel->gro_list != NULL) {
+		/* Clear napi->skb and prepare skb for GRO */
+		skb = napi_frags_skb(napi);
+		if (skb)
+			/* Add to list, will pass up later */
+			list_add_tail(&skb->list, channel->gro_list);
+	} else {
+		/* No list, so pass it up now */
+		gro_result = napi_gro_frags(napi);
+		if (gro_result != GRO_DROP)
+			channel->irq_mod_score += 2;
+	}
 }
 
 /* Allocate and construct an SKB around page fragments */

^ permalink raw reply related

* [PATCH v2 net-next 1/4] net: introduce list entry point for GRO
From: Edward Cree @ 2018-09-06 14:26 UTC (permalink / raw)
  To: davem; +Cc: linux-net-drivers, netdev
In-Reply-To: <c1e79c86-56ae-98c6-8dc0-c227f91ee9bc@solarflare.com>

Also export napi_frags_skb() so that drivers using the napi_gro_frags()
 interface can prepare their SKBs properly for submitting on such a list.

Signed-off-by: Edward Cree <ecree@solarflare.com>
---
 include/linux/netdevice.h |  2 ++
 net/core/dev.c            | 28 +++++++++++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index e2b3bd750c98..2b53536b1d99 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3548,8 +3548,10 @@ int netif_receive_skb(struct sk_buff *skb);
 int netif_receive_skb_core(struct sk_buff *skb);
 void netif_receive_skb_list(struct list_head *head);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
+int napi_gro_receive_list(struct napi_struct *napi, struct list_head *head);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
 struct sk_buff *napi_get_frags(struct napi_struct *napi);
+struct sk_buff *napi_frags_skb(struct napi_struct *napi);
 gro_result_t napi_gro_frags(struct napi_struct *napi);
 struct packet_offload *gro_find_receive_by_type(__be16 type);
 struct packet_offload *gro_find_complete_by_type(__be16 type);
diff --git a/net/core/dev.c b/net/core/dev.c
index ca78dc5a79a3..8df39ded77bd 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5598,6 +5598,31 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(napi_gro_receive);
 
+/* Returns the number of SKBs on the list successfully received */
+int napi_gro_receive_list(struct napi_struct *napi, struct list_head *head)
+{
+	struct sk_buff *skb, *next;
+	gro_result_t result;
+	int kept = 0;
+
+	list_for_each_entry(skb, head, list) {
+		skb_mark_napi_id(skb, napi);
+		trace_napi_gro_receive_entry(skb);
+		skb_gro_reset_offset(skb);
+	}
+
+	list_for_each_entry_safe(skb, next, head, list) {
+		list_del(&skb->list);
+		skb->next = NULL;
+		result = dev_gro_receive(napi, skb);
+		result = napi_skb_finish(result, skb);
+		if (result != GRO_DROP)
+			kept++;
+	}
+	return kept;
+}
+EXPORT_SYMBOL(napi_gro_receive_list);
+
 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
 {
 	if (unlikely(skb->pfmemalloc)) {
@@ -5669,7 +5694,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
  * Drivers could call both napi_gro_frags() and napi_gro_receive()
  * We copy ethernet header into skb->data to have a common layout.
  */
-static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
+struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 {
 	struct sk_buff *skb = napi->skb;
 	const struct ethhdr *eth;
@@ -5705,6 +5730,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
 
 	return skb;
 }
+EXPORT_SYMBOL(napi_frags_skb);
 
 gro_result_t napi_gro_frags(struct napi_struct *napi)
 {

^ permalink raw reply related

* [PATCH v2 net-next 0/4] net: batched receive in GRO path
From: Edward Cree @ 2018-09-06 14:24 UTC (permalink / raw)
  To: davem; +Cc: linux-net-drivers, netdev

This series listifies part of GRO processing, in a manner which allows those
 packets which are not GROed (i.e. for which dev_gro_receive returns
 GRO_NORMAL) to be passed on to the listified regular receive path.
I have not listified dev_gro_receive() itself, or the per-protocol GRO
 callback, since GRO's need to hold packets on lists under napi->gro_hash
 makes keeping the packets on other lists awkward, and since the GRO control
 block state of held skbs can refer only to one 'new' skb at a time.
 Nonetheless the batching of the calling code yields some performance gains
 in the GRO case as well.

Herewith the performance figures obtained in a NetPerf TCP stream test (with
 four streams, and irqs bound to a single core):
net-next: 7.166 Gbit/s (sigma 0.435)
after #2: 7.715 Gbit/s (sigma 0.145) = datum + 7.7%
after #4: 7.890 Gbit/s (sigma 0.217) = datum + 10.1%
(Note that the 'net-next' results were distinctly bimodal, with two results
 of about 8 Gbit/s and the remaining ten around 7 Gbit/s.  I don't have a
 good explanation for this.)
And with GRO disabled through ethtool -K (thus simulating traffic which is
 not GRO-able but, being TCP, is still passed to the GRO entry point):
net-next: 4.756 Gbit/s (sigma 0.240)
after #4: 5.355 Gbit/s (sigma 0.232) = datum + 12.6%

v2: Rebased on latest net-next.  Removed RFC tags.  Otherwise unchanged
 owing to lack of comments on v1.

Edward Cree (4):
  net: introduce list entry point for GRO
  sfc: use batched receive for GRO
  net: make listified RX functions return number of good packets
  net/core: handle GRO_NORMAL skbs as a list in napi_gro_receive_list

 drivers/net/ethernet/sfc/efx.c        |  11 +++-
 drivers/net/ethernet/sfc/net_driver.h |   1 +
 drivers/net/ethernet/sfc/rx.c         |  16 +++++-
 include/linux/netdevice.h             |   6 +-
 include/net/ip.h                      |   4 +-
 include/net/ipv6.h                    |   4 +-
 net/core/dev.c                        | 104 ++++++++++++++++++++++++++--------
 net/ipv4/ip_input.c                   |  39 ++++++++-----
 net/ipv6/ip6_input.c                  |  37 +++++++-----
 9 files changed, 157 insertions(+), 65 deletions(-)

^ permalink raw reply

* [PATCH net-next] net: dsa: b53: Fix build with B53_SRAB enabled and not B53_SERDES
From: Florian Fainelli @ 2018-09-06 18:42 UTC (permalink / raw)
  To: netdev
  Cc: Florian Fainelli, Andrew Lunn, Vivien Didelot, David S. Miller,
	open list

In case B53_SRAB is enabled, but not B53_SERDES, we can get the
following linking error:

ERROR: "b53_serdes_init" [drivers/net/dsa/b53/b53_srab.ko] undefined!

We also need to ifdef the body of b53_srab_serdes_map_lane() since it
would not be used when B53_SERDES is disabled and that would produce a
warning.

Fixes: 0e01491de646 ("net: dsa: b53: Add SerDes support")
Reported-by: kbuild test robot <lkp@intel.com>
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
---
 drivers/net/dsa/b53/b53_serdes.h | 7 +++++++
 drivers/net/dsa/b53/b53_srab.c   | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/drivers/net/dsa/b53/b53_serdes.h b/drivers/net/dsa/b53/b53_serdes.h
index e0674aa0167f..eed7c9357091 100644
--- a/drivers/net/dsa/b53/b53_serdes.h
+++ b/drivers/net/dsa/b53/b53_serdes.h
@@ -118,4 +118,11 @@ void b53_serdes_link_set(struct b53_device *dev, int port, unsigned int mode,
 void b53_serdes_phylink_validate(struct b53_device *dev, int port,
 				unsigned long *supported,
 				struct phylink_link_state *state);
+#if IS_ENABLED(CONFIG_B53_SERDES)
 int b53_serdes_init(struct b53_device *dev, int port);
+#else
+static inline int b53_serdes_init(struct b53_device *dev, int port)
+{
+	return -ENODEV;
+}
+#endif
diff --git a/drivers/net/dsa/b53/b53_srab.c b/drivers/net/dsa/b53/b53_srab.c
index 149788697fd6..b0ed81876bae 100644
--- a/drivers/net/dsa/b53/b53_srab.c
+++ b/drivers/net/dsa/b53/b53_srab.c
@@ -390,6 +390,7 @@ static irqreturn_t b53_srab_port_isr(int irq, void *dev_id)
 	return IRQ_WAKE_THREAD;
 }
 
+#if IS_ENABLED(CONFIG_B53_SERDES)
 static u8 b53_srab_serdes_map_lane(struct b53_device *dev, int port)
 {
 	struct b53_srab_priv *priv = dev->priv;
@@ -407,6 +408,7 @@ static u8 b53_srab_serdes_map_lane(struct b53_device *dev, int port)
 		return B53_INVALID_LANE;
 	}
 }
+#endif
 
 static int b53_srab_irq_enable(struct b53_device *dev, int port)
 {
-- 
2.17.1

^ permalink raw reply related

* [PATCH] tcp: really ignore MSG_ZEROCOPY if no SO_ZEROCOPY
From: Vincent Whitchurch @ 2018-09-06 13:54 UTC (permalink / raw)
  To: davem; +Cc: netdev, willemb, Vincent Whitchurch

According to the documentation in msg_zerocopy.rst, the SO_ZEROCOPY
flag was introduced because send(2) ignores unknown message flags and
any legacy application which was accidentally passing the equivalent of
MSG_ZEROCOPY earlier should not see any new behaviour.

Before commit f214f915e7db ("tcp: enable MSG_ZEROCOPY"), a send(2) call
which passed the equivalent of MSG_ZEROCOPY without setting SO_ZEROCOPY
would succeed.  However, after that commit, it fails with -ENOBUFS.  So
it appears that the SO_ZEROCOPY flag fails to fulfill its intended
purpose.  Fix it.

Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY")
Signed-off-by: Vincent Whitchurch <vincent.whitchurch@axis.com>
---
 net/core/skbuff.c | 3 ---
 net/ipv4/tcp.c    | 2 +-
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c996c09d095f..b2c807f67aba 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -939,9 +939,6 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)

 	WARN_ON_ONCE(!in_task());

-	if (!sock_flag(sk, SOCK_ZEROCOPY))
-		return NULL;
-
 	skb = sock_omalloc(sk, 0, GFP_KERNEL);
 	if (!skb)
 		return NULL;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8af2fec5ad5..10c6246396cc 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1185,7 +1185,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)

 	flags = msg->msg_flags;

-	if (flags & MSG_ZEROCOPY && size) {
+	if (flags & MSG_ZEROCOPY && size && sock_flag(sk, SOCK_ZEROCOPY)) {
 		if (sk->sk_state != TCP_ESTABLISHED) {
 			err = -EINVAL;
 			goto out_err;
-- 
2.11.0

^ permalink raw reply related

* Re: [PATCH net-next v2 3/7] net: aquantia: implement WOL support
From: Andrew Lunn @ 2018-09-06 13:44 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: David S . Miller, netdev, Yana Esina, Nikita Danilov
In-Reply-To: <66632a61b9d43ffe804de8abf3a09ec825ff2754.1536233536.git.igor.russkikh@aquantia.com>

On Thu, Sep 06, 2018 at 04:05:58PM +0300, Igor Russkikh wrote:
> From: Yana Esina <yana.esina@aquantia.com>
> 
> Add WOL support. Currently only magic packet
> (ethtool -s <ethX> wol g) feature is implemented.
> 
> Remove hw_set_power and move that to FW_OPS set_power:
> because WOL configuration behaves differently on 1x and 2x
> firmwares
> 
> Signed-off-by: Yana Esina <yana.esina@aquantia.com>
> Signed-off-by: Nikita Danilov <nikita.danilov@aquantia.com>
> Tested-by: Nikita Danilov <nikita.danilov@aquantia.com>
> Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH net-next v2 6/7] net: aquantia: renaming for better visibility
From: Andrew Lunn @ 2018-09-06 13:44 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: David S . Miller, netdev, Nikita Danilov
In-Reply-To: <12f20049c9ca52bacef91a2d2e6f71f05a2fbbb2.1536233536.git.igor.russkikh@aquantia.com>

On Thu, Sep 06, 2018 at 04:06:01PM +0300, Igor Russkikh wrote:
> From: Nikita Danilov <nikita.danilov@aquantia.com>
> 
> Removed extra characters from the names of structures to unify prefixes
> used through the driver code (we normally use hw_atl for hw specifics).
> HW_ATL_B0_ and HW_ATL_A0_ are the same and useless copies.
> 
> Signed-off-by: Nikita Danilov <nikita.danilov@aquantia.com>
> Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH net-next v2 5/7] net: aquantia: whitespace changes
From: Andrew Lunn @ 2018-09-06 13:42 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: David S . Miller, netdev, Nikita Danilov
In-Reply-To: <5bf4e7aed7f6d93dcfd67e6aabc55561a28e0f70.1536233536.git.igor.russkikh@aquantia.com>

On Thu, Sep 06, 2018 at 04:06:00PM +0300, Igor Russkikh wrote:
> From: Nikita Danilov <nikita.danilov@aquantia.com>
> 
> Removed extra spaces, corrected alignment.
> 
> Signed-off-by: Nikita Danilov <nikita.danilov@aquantia.com>
> Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* RE: [PATCH v2 1/2] net: ethernet: i40e: fix build error
From: Wyborny, Carolyn @ 2018-09-06 18:15 UTC (permalink / raw)
  To: Andrew Lunn, Keller, Jacob E
  Cc: Wang, Dongsheng, Kirsher, Jeffrey T,
	sergei.shtylyov@cogentembedded.com, davem@davemloft.net,
	intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <20180906180316.GB26997@lunn.ch>

> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Andrew Lunn
> Sent: Thursday, September 06, 2018 11:03 AM
> To: Keller, Jacob E <jacob.e.keller@intel.com>
> Cc: Wang, Dongsheng <dongsheng.wang@hxt-semitech.com>; Kirsher,
> Jeffrey T <jeffrey.t.kirsher@intel.com>;
> sergei.shtylyov@cogentembedded.com; davem@davemloft.net; intel-
> wired-lan@lists.osuosl.org; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] net: ethernet: i40e: fix build error
> 
[..]
> You have a few options here.
> 
> 1) A library module, containing shared code. Use EXPORT_SYMBOL_GPL()
> in the library module, and the kernel runtime linker will link the
> calls into the library. Also, modprobe will ensure the library module
> is loaded first, before the driver module.
> 
> 2) Build time sharing of code. Place the shared code into a .o file,
> and link it to both modules.
> 
> There is nothing particularly difficult here; this is all done lots of
> times within the kernel. Just look around and see how others do it.

Thanks Andrew,

Yes, I agree and we do have a team working on doing this.

Carolyn

Carolyn Wyborny 
Linux Development 
Networking Division 
Intel Corporation 

^ permalink raw reply

* Re: [PATCH net-next v2 3/7] net: aquantia: implement WOL support
From: Andrew Lunn @ 2018-09-06 13:34 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: David S . Miller, netdev, Yana Esina, Nikita Danilov
In-Reply-To: <66632a61b9d43ffe804de8abf3a09ec825ff2754.1536233536.git.igor.russkikh@aquantia.com>

On Thu, Sep 06, 2018 at 04:05:58PM +0300, Igor Russkikh wrote:
> From: Yana Esina <yana.esina@aquantia.com>
> 
> Add WOL support. Currently only magic packet
> (ethtool -s <ethX> wol g) feature is implemented.
> 
> Remove hw_set_power and move that to FW_OPS set_power:
> because WOL configuration behaves differently on 1x and 2x
> firmwares

Hi Igor, Yana

It looks like the set_power refactor could have been a patch of its own.
Then add the WOL support as a second patch.

In the future, try to make a patch do one thing, and only one
thing. It makes them easier to review.

   Andrew

^ permalink raw reply

* [iproute PATCH v2] ip-route: Fix segfault with many nexthops
From: Phil Sutter @ 2018-09-06 13:31 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: netdev

It was possible to crash ip-route by adding an IPv6 route with 37
nexthop statements. A simple reproducer is:

| for i in `seq 37`; do
| 	nhs="nexthop via 1111::$i "$nhs
| done
| ip -6 route add 3333::/64 $nhs

The related code was broken in multiple ways:

* parse_one_nh() assumed that rta points to 4kB of storage, but the caller
  provided just 1kB. Fixed by passing a 'len' parameter with the correct
  value.

* Error checking of rta_addattr*() calls in parse_one_nh() and the functions
  it calls was completely absent, so with the above fix in place an output
  flood would occur due to the parser looping forever.

While at it, increase message buffer sizes to 4k. This allows for
at most 144 nexthops.

Signed-off-by: Phil Sutter <phil@nwl.cc>
---
Changes since v1:
- Remove accidentally added 'return 0' line from parse_nexthops().
- Increase buffer sizes.
---
 ip/iproute.c          |  43 ++++++++++-------
 ip/iproute_lwtunnel.c | 108 +++++++++++++++++++++++++-----------------
 2 files changed, 91 insertions(+), 60 deletions(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index 30833414a3f7f..398322fd1f4ff 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -941,7 +941,7 @@ int print_route(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg)
 }
 
 static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
-			struct rtattr *rta, struct rtnexthop *rtnh,
+			struct rtattr *rta, size_t len, struct rtnexthop *rtnh,
 			int *argcp, char ***argvp)
 {
 	int argc = *argcp;
@@ -962,11 +962,16 @@ static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
 			if (r->rtm_family == AF_UNSPEC)
 				r->rtm_family = addr.family;
 			if (addr.family == r->rtm_family) {
-				rta_addattr_l(rta, 4096, RTA_GATEWAY, &addr.data, addr.bytelen);
-				rtnh->rtnh_len += sizeof(struct rtattr) + addr.bytelen;
+				if (rta_addattr_l(rta, len, RTA_GATEWAY,
+						  &addr.data, addr.bytelen))
+					return -1;
+				rtnh->rtnh_len += sizeof(struct rtattr)
+						  + addr.bytelen;
 			} else {
-				rta_addattr_l(rta, 4096, RTA_VIA, &addr.family, addr.bytelen+2);
-				rtnh->rtnh_len += RTA_SPACE(addr.bytelen+2);
+				if (rta_addattr_l(rta, len, RTA_VIA,
+						  &addr.family, addr.bytelen + 2))
+					return -1;
+				rtnh->rtnh_len += RTA_SPACE(addr.bytelen + 2);
 			}
 		} else if (strcmp(*argv, "dev") == 0) {
 			NEXT_ARG();
@@ -988,13 +993,15 @@ static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
 			NEXT_ARG();
 			if (get_rt_realms_or_raw(&realm, *argv))
 				invarg("\"realm\" value is invalid\n", *argv);
-			rta_addattr32(rta, 4096, RTA_FLOW, realm);
+			if (rta_addattr32(rta, len, RTA_FLOW, realm))
+				return -1;
 			rtnh->rtnh_len += sizeof(struct rtattr) + 4;
 		} else if (strcmp(*argv, "encap") == 0) {
-			int len = rta->rta_len;
+			int old_len = rta->rta_len;
 
-			lwt_parse_encap(rta, 4096, &argc, &argv);
-			rtnh->rtnh_len += rta->rta_len - len;
+			if (lwt_parse_encap(rta, len, &argc, &argv))
+				return -1;
+			rtnh->rtnh_len += rta->rta_len - old_len;
 		} else if (strcmp(*argv, "as") == 0) {
 			inet_prefix addr;
 
@@ -1002,8 +1009,9 @@ static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
 			if (strcmp(*argv, "to") == 0)
 				NEXT_ARG();
 			get_addr(&addr, *argv, r->rtm_family);
-			rta_addattr_l(rta, 4096, RTA_NEWDST, &addr.data,
-				      addr.bytelen);
+			if (rta_addattr_l(rta, len, RTA_NEWDST,
+					  &addr.data, addr.bytelen))
+				return -1;
 			rtnh->rtnh_len += sizeof(struct rtattr) + addr.bytelen;
 		} else
 			break;
@@ -1016,7 +1024,7 @@ static int parse_one_nh(struct nlmsghdr *n, struct rtmsg *r,
 static int parse_nexthops(struct nlmsghdr *n, struct rtmsg *r,
 			  int argc, char **argv)
 {
-	char buf[1024];
+	char buf[4096];
 	struct rtattr *rta = (void *)buf;
 	struct rtnexthop *rtnh;
 
@@ -1036,7 +1044,7 @@ static int parse_nexthops(struct nlmsghdr *n, struct rtmsg *r,
 		memset(rtnh, 0, sizeof(*rtnh));
 		rtnh->rtnh_len = sizeof(*rtnh);
 		rta->rta_len += rtnh->rtnh_len;
-		if (parse_one_nh(n, r, rta, rtnh, &argc, &argv)) {
+		if (parse_one_nh(n, r, rta, 4096, rtnh, &argc, &argv)) {
 			fprintf(stderr, "Error: cannot parse nexthop\n");
 			exit(-1);
 		}
@@ -1044,7 +1052,8 @@ static int parse_nexthops(struct nlmsghdr *n, struct rtmsg *r,
 	}
 
 	if (rta->rta_len > RTA_LENGTH(0))
-		addattr_l(n, 1024, RTA_MULTIPATH, RTA_DATA(rta), RTA_PAYLOAD(rta));
+		return addattr_l(n, 4096, RTA_MULTIPATH,
+				 RTA_DATA(rta), RTA_PAYLOAD(rta));
 	return 0;
 }
 
@@ -1053,7 +1062,7 @@ static int iproute_modify(int cmd, unsigned int flags, int argc, char **argv)
 	struct {
 		struct nlmsghdr	n;
 		struct rtmsg		r;
-		char			buf[1024];
+		char			buf[4096];
 	} req = {
 		.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
 		.n.nlmsg_flags = NLM_F_REQUEST | flags,
@@ -1484,8 +1493,8 @@ static int iproute_modify(int cmd, unsigned int flags, int argc, char **argv)
 		addattr_l(&req.n, sizeof(req), RTA_METRICS, RTA_DATA(mxrta), RTA_PAYLOAD(mxrta));
 	}
 
-	if (nhs_ok)
-		parse_nexthops(&req.n, &req.r, argc, argv);
+	if (nhs_ok && parse_nexthops(&req.n, &req.r, argc, argv))
+		return -1;
 
 	if (req.r.rtm_family == AF_UNSPEC)
 		req.r.rtm_family = AF_INET;
diff --git a/ip/iproute_lwtunnel.c b/ip/iproute_lwtunnel.c
index e604481142ec1..969a4763df71d 100644
--- a/ip/iproute_lwtunnel.c
+++ b/ip/iproute_lwtunnel.c
@@ -538,8 +538,9 @@ static int parse_encap_seg6(struct rtattr *rta, size_t len, int *argcp,
 
 	memcpy(tuninfo->srh, srh, srhlen);
 
-	rta_addattr_l(rta, len, SEG6_IPTUNNEL_SRH, tuninfo,
-		      sizeof(*tuninfo) + srhlen);
+	if (rta_addattr_l(rta, len, SEG6_IPTUNNEL_SRH, tuninfo,
+			  sizeof(*tuninfo) + srhlen))
+		return -1;
 
 	free(tuninfo);
 	free(srh);
@@ -611,6 +612,7 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 	char segbuf[1024];
 	inet_prefix addr;
 	__u32 hmac = 0;
+	int ret = 0;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "action") == 0) {
@@ -620,27 +622,28 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 			action = read_action_type(*argv);
 			if (!action)
 				invarg("\"action\" value is invalid\n", *argv);
-			rta_addattr32(rta, len, SEG6_LOCAL_ACTION, action);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_ACTION,
+					    action);
 		} else if (strcmp(*argv, "table") == 0) {
 			NEXT_ARG();
 			if (table_ok++)
 				duparg2("table", *argv);
 			get_u32(&table, *argv, 0);
-			rta_addattr32(rta, len, SEG6_LOCAL_TABLE, table);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_TABLE, table);
 		} else if (strcmp(*argv, "nh4") == 0) {
 			NEXT_ARG();
 			if (nh4_ok++)
 				duparg2("nh4", *argv);
 			get_addr(&addr, *argv, AF_INET);
-			rta_addattr_l(rta, len, SEG6_LOCAL_NH4, &addr.data,
-				      addr.bytelen);
+			ret = rta_addattr_l(rta, len, SEG6_LOCAL_NH4,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "nh6") == 0) {
 			NEXT_ARG();
 			if (nh6_ok++)
 				duparg2("nh6", *argv);
 			get_addr(&addr, *argv, AF_INET6);
-			rta_addattr_l(rta, len, SEG6_LOCAL_NH6, &addr.data,
-				      addr.bytelen);
+			ret = rta_addattr_l(rta, len, SEG6_LOCAL_NH6,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "iif") == 0) {
 			NEXT_ARG();
 			if (iif_ok++)
@@ -648,7 +651,7 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 			iif = ll_name_to_index(*argv);
 			if (!iif)
 				exit(nodev(*argv));
-			rta_addattr32(rta, len, SEG6_LOCAL_IIF, iif);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_IIF, iif);
 		} else if (strcmp(*argv, "oif") == 0) {
 			NEXT_ARG();
 			if (oif_ok++)
@@ -656,7 +659,7 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 			oif = ll_name_to_index(*argv);
 			if (!oif)
 				exit(nodev(*argv));
-			rta_addattr32(rta, len, SEG6_LOCAL_OIF, oif);
+			ret = rta_addattr32(rta, len, SEG6_LOCAL_OIF, oif);
 		} else if (strcmp(*argv, "srh") == 0) {
 			NEXT_ARG();
 			if (srh_ok++)
@@ -691,6 +694,8 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 		} else {
 			break;
 		}
+		if (ret)
+			return ret;
 		argc--; argv++;
 	}
 
@@ -705,14 +710,14 @@ static int parse_encap_seg6local(struct rtattr *rta, size_t len, int *argcp,
 		srh = parse_srh(segbuf, hmac,
 				action == SEG6_LOCAL_ACTION_END_B6_ENCAP);
 		srhlen = (srh->hdrlen + 1) << 3;
-		rta_addattr_l(rta, len, SEG6_LOCAL_SRH, srh, srhlen);
+		ret = rta_addattr_l(rta, len, SEG6_LOCAL_SRH, srh, srhlen);
 		free(srh);
 	}
 
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static int parse_encap_mpls(struct rtattr *rta, size_t len,
@@ -730,8 +735,9 @@ static int parse_encap_mpls(struct rtattr *rta, size_t len,
 		exit(1);
 	}
 
-	rta_addattr_l(rta, len, MPLS_IPTUNNEL_DST, &addr.data,
-		      addr.bytelen);
+	if (rta_addattr_l(rta, len, MPLS_IPTUNNEL_DST,
+			  &addr.data, addr.bytelen))
+		return -1;
 
 	argc--;
 	argv++;
@@ -745,7 +751,8 @@ static int parse_encap_mpls(struct rtattr *rta, size_t len,
 				duparg2("ttl", *argv);
 			if (get_u8(&ttl, *argv, 0))
 				invarg("\"ttl\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, MPLS_IPTUNNEL_TTL, ttl);
+			if (rta_addattr8(rta, len, MPLS_IPTUNNEL_TTL, ttl))
+				return -1;
 		} else {
 			break;
 		}
@@ -768,6 +775,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 	int id_ok = 0, dst_ok = 0, tos_ok = 0, ttl_ok = 0;
 	char **argv = *argvp;
 	int argc = *argcp;
+	int ret = 0;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "id") == 0) {
@@ -778,7 +786,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 				duparg2("id", *argv);
 			if (get_be64(&id, *argv, 0))
 				invarg("\"id\" value is invalid\n", *argv);
-			rta_addattr64(rta, len, LWTUNNEL_IP_ID, id);
+			ret = rta_addattr64(rta, len, LWTUNNEL_IP_ID, id);
 		} else if (strcmp(*argv, "dst") == 0) {
 			inet_prefix addr;
 
@@ -786,8 +794,8 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 			if (dst_ok++)
 				duparg2("dst", *argv);
 			get_addr(&addr, *argv, AF_INET);
-			rta_addattr_l(rta, len, LWTUNNEL_IP_DST,
-				      &addr.data, addr.bytelen);
+			ret = rta_addattr_l(rta, len, LWTUNNEL_IP_DST,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "tos") == 0) {
 			__u32 tos;
 
@@ -796,7 +804,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 				duparg2("tos", *argv);
 			if (rtnl_dsfield_a2n(&tos, *argv))
 				invarg("\"tos\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP_TOS, tos);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP_TOS, tos);
 		} else if (strcmp(*argv, "ttl") == 0) {
 			__u8 ttl;
 
@@ -805,10 +813,12 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 				duparg2("ttl", *argv);
 			if (get_u8(&ttl, *argv, 0))
 				invarg("\"ttl\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP_TTL, ttl);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP_TTL, ttl);
 		} else {
 			break;
 		}
+		if (ret)
+			break;
 		argc--; argv++;
 	}
 
@@ -819,7 +829,7 @@ static int parse_encap_ip(struct rtattr *rta, size_t len,
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static int parse_encap_ila(struct rtattr *rta, size_t len,
@@ -828,6 +838,7 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 	__u64 locator;
 	int argc = *argcp;
 	char **argv = *argvp;
+	int ret = 0;
 
 	if (get_addr64(&locator, *argv) < 0) {
 		fprintf(stderr, "Bad locator: %s\n", *argv);
@@ -836,7 +847,8 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 
 	argc--; argv++;
 
-	rta_addattr64(rta, 1024, ILA_ATTR_LOCATOR, locator);
+	if (rta_addattr64(rta, 1024, ILA_ATTR_LOCATOR, locator))
+		return -1;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "csum-mode") == 0) {
@@ -849,8 +861,8 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 				invarg("\"csum-mode\" value is invalid\n",
 				       *argv);
 
-			rta_addattr8(rta, 1024, ILA_ATTR_CSUM_MODE,
-				     (__u8)csum_mode);
+			ret = rta_addattr8(rta, 1024, ILA_ATTR_CSUM_MODE,
+					   (__u8)csum_mode);
 
 			argc--; argv++;
 		} else if (strcmp(*argv, "ident-type") == 0) {
@@ -863,8 +875,8 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 				invarg("\"ident-type\" value is invalid\n",
 				       *argv);
 
-			rta_addattr8(rta, 1024, ILA_ATTR_IDENT_TYPE,
-				     (__u8)ident_type);
+			ret = rta_addattr8(rta, 1024, ILA_ATTR_IDENT_TYPE,
+					   (__u8)ident_type);
 
 			argc--; argv++;
 		} else if (strcmp(*argv, "hook-type") == 0) {
@@ -877,13 +889,15 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 				invarg("\"hook-type\" value is invalid\n",
 				       *argv);
 
-			rta_addattr8(rta, 1024, ILA_ATTR_HOOK_TYPE,
-				     (__u8)hook_type);
+			ret = rta_addattr8(rta, 1024, ILA_ATTR_HOOK_TYPE,
+					   (__u8)hook_type);
 
 			argc--; argv++;
 		} else {
 			break;
 		}
+		if (ret)
+			break;
 	}
 
 	/* argv is currently the first unparsed argument,
@@ -893,7 +907,7 @@ static int parse_encap_ila(struct rtattr *rta, size_t len,
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static int parse_encap_ip6(struct rtattr *rta, size_t len,
@@ -902,6 +916,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 	int id_ok = 0, dst_ok = 0, tos_ok = 0, ttl_ok = 0;
 	char **argv = *argvp;
 	int argc = *argcp;
+	int ret = 0;
 
 	while (argc > 0) {
 		if (strcmp(*argv, "id") == 0) {
@@ -912,7 +927,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 				duparg2("id", *argv);
 			if (get_be64(&id, *argv, 0))
 				invarg("\"id\" value is invalid\n", *argv);
-			rta_addattr64(rta, len, LWTUNNEL_IP6_ID, id);
+			ret = rta_addattr64(rta, len, LWTUNNEL_IP6_ID, id);
 		} else if (strcmp(*argv, "dst") == 0) {
 			inet_prefix addr;
 
@@ -920,8 +935,8 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 			if (dst_ok++)
 				duparg2("dst", *argv);
 			get_addr(&addr, *argv, AF_INET6);
-			rta_addattr_l(rta, len, LWTUNNEL_IP6_DST,
-				      &addr.data, addr.bytelen);
+			ret = rta_addattr_l(rta, len, LWTUNNEL_IP6_DST,
+					    &addr.data, addr.bytelen);
 		} else if (strcmp(*argv, "tc") == 0) {
 			__u32 tc;
 
@@ -930,7 +945,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 				duparg2("tc", *argv);
 			if (rtnl_dsfield_a2n(&tc, *argv))
 				invarg("\"tc\" value is invalid\n", *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP6_TC, tc);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP6_TC, tc);
 		} else if (strcmp(*argv, "hoplimit") == 0) {
 			__u8 hoplimit;
 
@@ -940,10 +955,13 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 			if (get_u8(&hoplimit, *argv, 0))
 				invarg("\"hoplimit\" value is invalid\n",
 				       *argv);
-			rta_addattr8(rta, len, LWTUNNEL_IP6_HOPLIMIT, hoplimit);
+			ret = rta_addattr8(rta, len, LWTUNNEL_IP6_HOPLIMIT,
+					   hoplimit);
 		} else {
 			break;
 		}
+		if (ret)
+			break;
 		argc--; argv++;
 	}
 
@@ -954,7 +972,7 @@ static int parse_encap_ip6(struct rtattr *rta, size_t len,
 	*argcp = argc + 1;
 	*argvp = argv - 1;
 
-	return 0;
+	return ret;
 }
 
 static void lwt_bpf_usage(void)
@@ -1021,6 +1039,7 @@ int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp)
 	int argc = *argcp;
 	char **argv = *argvp;
 	__u16 type;
+	int ret = 0;
 
 	NEXT_ARG();
 	type = read_encap_type(*argv);
@@ -1037,37 +1056,40 @@ int lwt_parse_encap(struct rtattr *rta, size_t len, int *argcp, char ***argvp)
 	nest = rta_nest(rta, 1024, RTA_ENCAP);
 	switch (type) {
 	case LWTUNNEL_ENCAP_MPLS:
-		parse_encap_mpls(rta, len, &argc, &argv);
+		ret = parse_encap_mpls(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_IP:
-		parse_encap_ip(rta, len, &argc, &argv);
+		ret = parse_encap_ip(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_ILA:
-		parse_encap_ila(rta, len, &argc, &argv);
+		ret = parse_encap_ila(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_IP6:
-		parse_encap_ip6(rta, len, &argc, &argv);
+		ret = parse_encap_ip6(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_BPF:
 		if (parse_encap_bpf(rta, len, &argc, &argv) < 0)
 			exit(-1);
 		break;
 	case LWTUNNEL_ENCAP_SEG6:
-		parse_encap_seg6(rta, len, &argc, &argv);
+		ret = parse_encap_seg6(rta, len, &argc, &argv);
 		break;
 	case LWTUNNEL_ENCAP_SEG6_LOCAL:
-		parse_encap_seg6local(rta, len, &argc, &argv);
+		ret = parse_encap_seg6local(rta, len, &argc, &argv);
 		break;
 	default:
 		fprintf(stderr, "Error: unsupported encap type\n");
 		break;
 	}
+	if (ret)
+		return ret;
+
 	rta_nest_end(rta, nest);
 
-	rta_addattr16(rta, 1024, RTA_ENCAP_TYPE, type);
+	ret = rta_addattr16(rta, 1024, RTA_ENCAP_TYPE, type);
 
 	*argcp = argc;
 	*argvp = argv;
 
-	return 0;
+	return ret;
 }
-- 
2.18.0

^ permalink raw reply related

* Re: [PATCH v2 1/2] net: ethernet: i40e: fix build error
From: Andrew Lunn @ 2018-09-06 18:03 UTC (permalink / raw)
  To: Keller, Jacob E
  Cc: Wang, Dongsheng, Kirsher, Jeffrey T,
	sergei.shtylyov@cogentembedded.com, davem@davemloft.net,
	intel-wired-lan@lists.osuosl.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <02874ECE860811409154E81DA85FBB5884C7AC01@ORSMSX115.amr.corp.intel.com>

> I'm more worried about how it interacts with modules. For example,
> we could have i40e and i40evf share some code, but then wouldn't one
> of them become dependent on the other? i.e. you'd have to load i40e
> in order to successfully load i40evf? Or you'd have to have some
> sort of common glue module which you load first, and then load i40e
> and i40evf after? This also creates some interactions with
> out-of-tree modules which make it difficult. It would be nice if we
> could share the code in some way that still resulted in allowing
> each module to be separate...

You have a few options here.

1) A library module, containing shared code. Use EXPORT_SYMBOL_GPL()
in the library module, and the kernel runtime linker will link the
calls into the library. Also, modprobe will ensure the library module
is loaded first, before the driver module.

2) Build time sharing of code. Place the shared code into a .o file,
and link it to both modules.

There is nothing particularly difficult here; this is all done lots of
times within the kernel. Just look around and see how others do it.

      Andrew

^ permalink raw reply

* Re: [PATCH net-next 10/11] tap: accept an array of XDP buffs through sendmsg()
From: Michael S. Tsirkin @ 2018-09-06 18:00 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-11-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:25PM +0800, Jason Wang wrote:
> This patch implements the TUN_MSG_PTR msg_control type. This type allows
> the caller to pass an array of XDP buffs to tuntap through ptr field
> of the tun_msg_control. Tap will build skb through those XDP buffers.
> 
> This will avoid lots of indirect calls thus improves the icache
> utilization and allows to do XDP batched flushing when doing XDP
> redirection.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/net/tap.c | 73 +++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 71 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/tap.c b/drivers/net/tap.c
> index 7996ed7cbf18..50eb7bf22225 100644
> --- a/drivers/net/tap.c
> +++ b/drivers/net/tap.c
> @@ -1146,14 +1146,83 @@ static const struct file_operations tap_fops = {
>  #endif
>  };
>  
> +static int tap_get_user_xdp(struct tap_queue *q, struct xdp_buff *xdp)
> +{
> +	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
> +	int buflen = *(int *)xdp->data_hard_start;
> +	int vnet_hdr_len = 0;
> +	struct tap_dev *tap;
> +	struct sk_buff *skb;
> +	int err, depth;
> +
> +	if (q->flags & IFF_VNET_HDR)
> +		vnet_hdr_len = READ_ONCE(q->vnet_hdr_sz);
> +
> +	skb = build_skb(xdp->data_hard_start, buflen);
> +	if (!skb) {
> +		err = -ENOMEM;
> +		goto err;
> +	}

So fundamentally why is it called XDP?
We just build an skb, don't we?

> +
> +	skb_reserve(skb, xdp->data - xdp->data_hard_start);
> +	skb_put(skb, xdp->data_end - xdp->data);
> +
> +	skb_set_network_header(skb, ETH_HLEN);
> +	skb_reset_mac_header(skb);
> +	skb->protocol = eth_hdr(skb)->h_proto;
> +
> +	if (vnet_hdr_len) {
> +		err = virtio_net_hdr_to_skb(skb, gso, tap_is_little_endian(q));
> +		if (err)
> +			goto err_kfree;
> +	}
> +
> +	skb_probe_transport_header(skb, ETH_HLEN);
> +
> +	/* Move network header to the right position for VLAN tagged packets */
> +	if ((skb->protocol == htons(ETH_P_8021Q) ||
> +	     skb->protocol == htons(ETH_P_8021AD)) &&
> +	    __vlan_get_protocol(skb, skb->protocol, &depth) != 0)
> +		skb_set_network_header(skb, depth);
> +
> +	rcu_read_lock();
> +	tap = rcu_dereference(q->tap);
> +	if (tap) {
> +		skb->dev = tap->dev;
> +		dev_queue_xmit(skb);
> +	} else {
> +		kfree_skb(skb);
> +	}
> +	rcu_read_unlock();
> +
> +	return 0;
> +
> +err_kfree:
> +	kfree_skb(skb);
> +err:
> +	rcu_read_lock();
> +		tap = rcu_dereference(q->tap);
> +	if (tap && tap->count_tx_dropped)
> +		tap->count_tx_dropped(tap);
> +	rcu_read_unlock();
> +	return err;
> +}
> +
>  static int tap_sendmsg(struct socket *sock, struct msghdr *m,
>  		       size_t total_len)
>  {
>  	struct tap_queue *q = container_of(sock, struct tap_queue, sock);
>  	struct tun_msg_ctl *ctl = m->msg_control;
> +	struct xdp_buff *xdp;
> +	int i;
>  
> -	if (ctl && ctl->type != TUN_MSG_UBUF)
> -		return -EINVAL;
> +	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
> +		for (i = 0; i < ctl->type >> 16; i++) {
> +			xdp = &((struct xdp_buff *)ctl->ptr)[i];
> +			tap_get_user_xdp(q, xdp);
> +		}
> +		return 0;
> +	}
>  
>  	return tap_get_user(q, ctl ? ctl->ptr : NULL, &m->msg_iter,
>  			    m->msg_flags & MSG_DONTWAIT);
> -- 
> 2.17.1

^ permalink raw reply

* Re: [PATCH bpf] selftests/bpf/test_progs: do not check errno == 0
From: Alexei Starovoitov @ 2018-09-06 18:00 UTC (permalink / raw)
  To: Mauricio Vasquez B
  Cc: Alexei Starovoitov, Daniel Borkmann, Shuah Khan, netdev,
	linux-kernel, linux-kselftest
In-Reply-To: <1535994119-4853-1-git-send-email-mauricio.vasquez@polito.it>

On Mon, Sep 03, 2018 at 07:01:59PM +0200, Mauricio Vasquez B wrote:
> The errno man page states: "The value in errno is significant only when
> the return value of the call indicated an error..." then it is not correct
> to check it, it could be different than zero even if the function
> succeeded.
> 
> It causes some false positives if errno is set by a previous function.
> 
> Signed-off-by: Mauricio Vasquez B <mauricio.vasquez@polito.it>

Applied, Thanks

^ permalink raw reply

* Re: [PATCH net-next v2 2/7] net: aquantia: definitions for WOL
From: Andrew Lunn @ 2018-09-06 13:23 UTC (permalink / raw)
  To: Igor Russkikh; +Cc: David S . Miller, netdev, Yana Esina, Nikita Danilov
In-Reply-To: <901717047deb6a872a8f3792d6337fca321a88ea.1536233536.git.igor.russkikh@aquantia.com>

On Thu, Sep 06, 2018 at 04:05:57PM +0300, Igor Russkikh wrote:
> From: Yana Esina <yana.esina@aquantia.com>

Hi Igor, Yana

> @@ -134,13 +132,36 @@ struct __packed hw_aq_atl_utils_fw_rpc {
>  					u32 pattern_offset;
>  					u32 pattern_size;
>  				} wol_bit_map_pattern;
> +
> +				struct {
> +					u8 mac_addr[6];

ETH_ALEN

> +struct __packed offload_info {
> +	u32 version;
> +	u32 len;
> +	u8 mac_addr[6];

ETH_ALEN

> +struct __packed fw2x_msg_wol {
> +	u32 msg_id;
> +	u8 hw_addr[6];

ETH_ALEN

^ permalink raw reply

* Re: [PATCH net-next 09/11] tuntap: accept an array of XDP buffs through sendmsg()
From: Michael S. Tsirkin @ 2018-09-06 17:51 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-10-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:24PM +0800, Jason Wang wrote:
> This patch implements the TUN_MSG_PTR msg_control type. This type allows
> the caller to pass an array of XDP buffs to tuntap through ptr field
> of the tun_msg_control. If an XDP program is attached, tuntap can run
> XDP program directly. If not, tuntap will build skb and do a fast
> receiving since part of the work has been done by vhost_net.
> 
> This will avoid lots of indirect calls thus improves the icache
> utilization and allows to do XDP batched flushing when doing XDP
> redirection.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Is most of the benefit in batched flushing or skipping
indirect calls? Because if it's flushing we can gain
most of it easily by adding an analog of xmit_more.

> ---
>  drivers/net/tun.c | 103 ++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 100 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index c839a4bdcbd9..069db2e5dd08 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -2424,22 +2424,119 @@ static void tun_sock_write_space(struct sock *sk)
>  	kill_fasync(&tfile->fasync, SIGIO, POLL_OUT);
>  }
>  
> +static int tun_xdp_one(struct tun_struct *tun,
> +		       struct tun_file *tfile,
> +		       struct xdp_buff *xdp, int *flush)
> +{
> +	struct virtio_net_hdr *gso = xdp->data_hard_start + sizeof(int);
> +	struct tun_pcpu_stats *stats;
> +	struct bpf_prog *xdp_prog;
> +	struct sk_buff *skb = NULL;
> +	u32 rxhash = 0, act;
> +	int buflen = *(int *)xdp->data_hard_start;
> +	int err = 0;
> +	bool skb_xdp = false;
> +
> +	xdp_prog = rcu_dereference(tun->xdp_prog);
> +	if (xdp_prog) {
> +		if (gso->gso_type) {
> +			skb_xdp = true;
> +			goto build;
> +		}
> +		xdp_set_data_meta_invalid(xdp);
> +		xdp->rxq = &tfile->xdp_rxq;
> +		act = tun_do_xdp(tun, tfile, xdp_prog, xdp, &err);
> +		if (err)
> +			goto out;
> +		if (act == XDP_REDIRECT)
> +			*flush = true;
> +		if (act != XDP_PASS)
> +			goto out;
> +	}
> +
> +build:
> +	skb = build_skb(xdp->data_hard_start, buflen);
> +	if (!skb) {
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +
> +	skb_reserve(skb, xdp->data - xdp->data_hard_start);
> +	skb_put(skb, xdp->data_end - xdp->data);
> +
> +	if (virtio_net_hdr_to_skb(skb, gso, tun_is_little_endian(tun))) {
> +		this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
> +		kfree_skb(skb);
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	skb->protocol = eth_type_trans(skb, tun->dev);
> +	skb_reset_network_header(skb);
> +	skb_probe_transport_header(skb, 0);
> +
> +	if (skb_xdp) {
> +		err = do_xdp_generic(xdp_prog, skb);
> +		if (err != XDP_PASS)
> +			goto out;
> +	}
> +
> +	if (!rcu_dereference(tun->steering_prog))
> +		rxhash = __skb_get_hash_symmetric(skb);
> +
> +	netif_receive_skb(skb);
> +
> +	stats = get_cpu_ptr(tun->pcpu_stats);
> +	u64_stats_update_begin(&stats->syncp);
> +	stats->rx_packets++;
> +	stats->rx_bytes += skb->len;
> +	u64_stats_update_end(&stats->syncp);
> +	put_cpu_ptr(stats);
> +
> +	if (rxhash)
> +		tun_flow_update(tun, rxhash, tfile);
> +
> +out:
> +	return err;
> +}
> +
>  static int tun_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
>  {
> -	int ret;
> +	int ret, i;
>  	struct tun_file *tfile = container_of(sock, struct tun_file, socket);
>  	struct tun_struct *tun = tun_get(tfile);
>  	struct tun_msg_ctl *ctl = m->msg_control;
> +	struct xdp_buff *xdp;
>  
>  	if (!tun)
>  		return -EBADFD;
>  
> -	if (ctl && ctl->type != TUN_MSG_UBUF)
> -		return -EINVAL;
> +	if (ctl && ((ctl->type & 0xF) == TUN_MSG_PTR)) {
> +		int n = ctl->type >> 16;
> +		int flush = 0;
> +
> +		local_bh_disable();
> +		rcu_read_lock();
> +
> +		for (i = 0; i < n; i++) {
> +			xdp = &((struct xdp_buff *)ctl->ptr)[i];
> +			tun_xdp_one(tun, tfile, xdp, &flush);
> +		}
> +
> +		if (flush)
> +			xdp_do_flush_map();
> +
> +		rcu_read_unlock();
> +		local_bh_enable();
> +
> +		ret = total_len;
> +		goto out;
> +	}
>  
>  	ret = tun_get_user(tun, tfile, ctl ? ctl->ptr : NULL, &m->msg_iter,
>  			   m->msg_flags & MSG_DONTWAIT,
>  			   m->msg_flags & MSG_MORE);
> +out:
>  	tun_put(tun);
>  	return ret;
>  }
> -- 
> 2.17.1

^ permalink raw reply

* Re: BUG: 4.14.11 unable to handle kernel NULL pointer dereference in xfrm_lookup
From: Tobias Hommel @ 2018-09-06 13:03 UTC (permalink / raw)
  To: Kristian Evensen
  Cc: Steffen Klassert, Markus Berner, Network Development,
	Florian Westphal, Wolfgang Walter, Wei Wang
In-Reply-To: <CAKfDRXjMYtAFKkv9+S-=_Rug3w1AiKeu0eRyEDBhXvau_91UaQ@mail.gmail.com>

Hey guys,

I finally got some time to do a bisect and we narrowed the problem down to:

b838d5e1c5b6e57b10ec8af2268824041e3ea911 is the first bad commit
commit b838d5e1c5b6e57b10ec8af2268824041e3ea911
Author: Wei Wang <weiwan@google.com>
Date:   Sat Jun 17 10:42:32 2017 -0700

    ipv4: mark DST_NOGC and remove the operation of dst_free()

    With the previous preparation patches, we are ready to get rid of the
    dst gc operation in ipv4 code and release dst based on refcnt only.
    So this patch adds DST_NOGC flag for all IPv4 dst and remove the calls
    to dst_free().
    At this point, all dst created in ipv4 code do not use the dst gc
    anymore and will be destroyed at the point when refcnt drops to 0.

    Signed-off-by: Wei Wang <weiwan@google.com>
    Acked-by: Martin KaFai Lau <kafai@fb.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

:040000 040000 9b7e7fb641de6531fc7887473ca47ef7cb6a11da 831a73b71d3df1755f3e24c0d3c86d7a93fd55e2 M      net


I also saw there was a new thread some days ago reporting a similar problem. So
I put you guys (Wolfgang, Wei) into Cc.

Tobi

On Thu, Jun 14, 2018 at 10:38:01AM +0200, Kristian Evensen wrote:
> Hello,
> 
> On Tue, Jun 12, 2018 at 10:29 AM, Kristian Evensen
> <kristian.evensen@gmail.com> wrote:
> > Thanks for spending time on this. I will see what I can manage in
> > terms of a bisect. Our last good kernel was 4.9, so at least it
> > narrows the scope down a bit compared to 4.4 or 4.1.
> 
> I hope we might have got somewhere. While looking more into ipsec and
> 4.14, we noticed large performance regressions (-~20%) on some
> low-powered devices we are also using. We quickly identified the
> removal of the flow cache as the "culprit", and the performance
> regression is discussed in the netdev-thread for the removal of the
> cache ("xfrm: remove flow cache"). For the time being and in order to
> restore the performance, we have reverted the patch series removing
> the flow cache. When running our tests (on the APU) after the revert,
> we no longer see the crash. Before the revert, the APU would always
> crash within some hours. After the revert, our tests have been running
> for 24 hours+. Our test is quite basic, we establish 1, 2, 3 ...,  50
> tunnels and then run iperf on all tunnels in parallel. The tunnels are
> torn down between each iteration.
> 
> We are still running the test and will keep doing so, but I thought I
> should share this finding in case it can help in fixing the error. I
> will report back in case we find out something more, and please let me
> know if you have any suggestions for things I can test. I don't for
> example know if it is safe to revert one and one commit of the flow
> cache, to try to pin the crash even more down.
> 
> BR,
> Kristian

^ permalink raw reply

* Re: [PATCH net-next 07/11] tuntap: move XDP flushing out of tun_do_xdp()
From: Michael S. Tsirkin @ 2018-09-06 17:48 UTC (permalink / raw)
  To: Jason Wang; +Cc: netdev, linux-kernel, kvm, virtualization
In-Reply-To: <20180906040526.22518-8-jasowang@redhat.com>

On Thu, Sep 06, 2018 at 12:05:22PM +0800, Jason Wang wrote:
> This will allow adding batch flushing on top.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---
>  drivers/net/tun.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 21b125020b3b..ff1cbf3ebd50 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -1646,7 +1646,6 @@ static u32 tun_do_xdp(struct tun_struct *tun,
>  	switch (act) {
>  	case XDP_REDIRECT:
>  		*err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
> -		xdp_do_flush_map();
>  		if (*err)
>  			break;
>  		goto out;
> @@ -1735,6 +1734,9 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
>  		act = tun_do_xdp(tun, tfile, xdp_prog, &xdp, &err);
>  		if (err)
>  			goto err_xdp;
> +
> +		if (act == XDP_REDIRECT)
> +			xdp_do_flush_map();
>  		if (act != XDP_PASS)
>  			goto out;

At this point the switch statement which used to contain all XDP things
seems to be gone completely. Just rewrite with a bunch of if statements
and all xdp handling spread out to where it makes sense?

> -- 
> 2.17.1

^ permalink raw reply

* [PATCH net-next v2 4/7] net: aquantia: implement EEE support
From: Igor Russkikh @ 2018-09-06 13:05 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Igor Russkikh, Yana Esina, Nikita Danilov
In-Reply-To: <cover.1536233536.git.igor.russkikh@aquantia.com>

From: Yana Esina <yana.esina@aquantia.com>

Add support for Energy-Efficient Ethernet on aQuantia NICs via ethtool
(according to the IEEE 802.3az specification).

Signed-off-by: Yana Esina <yana.esina@aquantia.com>
Signed-off-by: Nikita Danilov <nikita.danilov@aquantia.com>
Tested-by: Nikita Danilov <nikita.danilov@aquantia.com>
Signed-off-by: Igor Russkikh <igor.russkikh@aquantia.com>
---
 drivers/net/ethernet/aquantia/atlantic/aq_common.h |  5 ++
 .../net/ethernet/aquantia/atlantic/aq_ethtool.c    | 77 +++++++++++++++++++
 drivers/net/ethernet/aquantia/atlantic/aq_hw.h     |  5 ++
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h    |  1 +
 .../aquantia/atlantic/hw_atl/hw_atl_utils.c        |  2 +
 .../aquantia/atlantic/hw_atl/hw_atl_utils.h        | 13 ++++
 .../aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c   | 86 ++++++++++++++++++++++
 7 files changed, 189 insertions(+)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_common.h b/drivers/net/ethernet/aquantia/atlantic/aq_common.h
index d52b088ff8f0..becb578211ed 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_common.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_common.h
@@ -57,4 +57,9 @@
 #define AQ_NIC_RATE_1G         BIT(4)
 #define AQ_NIC_RATE_100M       BIT(5)
 
+#define AQ_NIC_RATE_EEE_10G	BIT(6)
+#define AQ_NIC_RATE_EEE_5G	BIT(7)
+#define AQ_NIC_RATE_EEE_2GS	BIT(8)
+#define AQ_NIC_RATE_EEE_1G	BIT(9)
+
 #endif /* AQ_COMMON_H */
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
index b88be5e5f0a2..22dd4fbd34d7 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_ethtool.c
@@ -315,6 +315,81 @@ static int aq_ethtool_set_wol(struct net_device *ndev,
 	return err;
 }
 
+static enum hw_atl_fw2x_rate eee_mask_to_ethtool_mask(u32 speed)
+{
+	u32 rate = 0;
+
+	if (speed & AQ_NIC_RATE_EEE_10G)
+		rate |= SUPPORTED_10000baseT_Full;
+
+	if (speed & AQ_NIC_RATE_EEE_2GS)
+		rate |= SUPPORTED_2500baseX_Full;
+
+	if (speed & AQ_NIC_RATE_EEE_1G)
+		rate |= SUPPORTED_1000baseT_Full;
+
+	return rate;
+}
+
+static int aq_ethtool_get_eee(struct net_device *ndev, struct ethtool_eee *eee)
+{
+	struct aq_nic_s *aq_nic = netdev_priv(ndev);
+	u32 rate, supported_rates;
+	int err = 0;
+
+	if (!aq_nic->aq_fw_ops->get_eee_rate)
+		return -EOPNOTSUPP;
+
+	err = aq_nic->aq_fw_ops->get_eee_rate(aq_nic->aq_hw, &rate,
+					      &supported_rates);
+	if (err < 0)
+		return err;
+
+	eee->supported = eee_mask_to_ethtool_mask(supported_rates);
+
+	if (aq_nic->aq_nic_cfg.eee_speeds)
+		eee->advertised = eee->supported;
+
+	eee->lp_advertised = eee_mask_to_ethtool_mask(rate);
+
+	eee->eee_enabled = !!eee->advertised;
+
+	eee->tx_lpi_enabled = eee->eee_enabled;
+	if (eee->advertised & eee->lp_advertised)
+		eee->eee_active = true;
+
+	return 0;
+}
+
+static int aq_ethtool_set_eee(struct net_device *ndev, struct ethtool_eee *eee)
+{
+	struct aq_nic_s *aq_nic = netdev_priv(ndev);
+	u32 rate, supported_rates;
+	struct aq_nic_cfg_s *cfg;
+	int err = 0;
+
+	cfg = aq_nic_get_cfg(aq_nic);
+
+	if (unlikely(!aq_nic->aq_fw_ops->get_eee_rate ||
+		     !aq_nic->aq_fw_ops->set_eee_rate))
+		return -EOPNOTSUPP;
+
+	err = aq_nic->aq_fw_ops->get_eee_rate(aq_nic->aq_hw, &rate,
+					      &supported_rates);
+	if (err < 0)
+		return err;
+
+	if (eee->eee_enabled) {
+		rate = supported_rates;
+		cfg->eee_speeds = rate;
+	} else {
+		rate = 0;
+		cfg->eee_speeds = 0;
+	}
+
+	return aq_nic->aq_fw_ops->set_eee_rate(aq_nic->aq_hw, rate);
+}
+
 static int aq_ethtool_nway_reset(struct net_device *ndev)
 {
 	struct aq_nic_s *aq_nic = netdev_priv(ndev);
@@ -438,6 +513,8 @@ const struct ethtool_ops aq_ethtool_ops = {
 	.nway_reset          = aq_ethtool_nway_reset,
 	.get_ringparam       = aq_get_ringparam,
 	.set_ringparam       = aq_set_ringparam,
+	.get_eee             = aq_ethtool_get_eee,
+	.set_eee             = aq_ethtool_set_eee,
 	.get_pauseparam      = aq_ethtool_get_pauseparam,
 	.set_pauseparam      = aq_ethtool_set_pauseparam,
 	.get_rxfh_key_size   = aq_ethtool_get_rss_key_size,
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
index 9050b40d4f58..908f19fe19b3 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_hw.h
@@ -230,6 +230,11 @@ struct aq_fw_ops {
 
 	int (*set_power)(struct aq_hw_s *self, unsigned int power_state,
 			 u8 *mac);
+
+	int (*set_eee_rate)(struct aq_hw_s *self, u32 speed);
+
+	int (*get_eee_rate)(struct aq_hw_s *self, u32 *rate,
+			    u32 *supported_rates);
 };
 
 #endif /* AQ_HW_H */
diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
index 2069cbb6e1a1..c1582f4e8e1b 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -45,6 +45,7 @@ struct aq_nic_cfg_s {
 	bool is_lro;
 	u8  tcs;
 	struct aq_rss_parameters aq_rss;
+	u32 eee_speeds;
 };
 
 #define AQ_NIC_FLAG_STARTED     0x00000004U
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
index c6fe4a58e047..bb1561c6d25a 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.c
@@ -916,5 +916,7 @@ const struct aq_fw_ops aq_fw_1x_ops = {
 	.update_link_status = hw_atl_utils_mpi_get_link_status,
 	.update_stats = hw_atl_utils_update_stats,
 	.set_power = aq_fw1x_set_power,
+	.set_eee_rate = NULL,
+	.get_eee_rate = NULL,
 	.set_flow_control = NULL,
 };
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
index 5de4294c471f..069088893433 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
@@ -171,9 +171,22 @@ struct __packed hw_aq_atl_utils_mbox_header {
 	u32 error;
 };
 
+struct __packed hw_aq_info { /* embedded as .info in hw_aq_atl_utils_mbox */
+	u8 reserved[6];
+	u16 phy_fault_code;
+	u16 phy_temperature; /* NOTE(review): units not shown here - confirm */
+	u8 cable_len;
+	u8 reserved1;
+	u32 cable_diag_data[4];
+	u8 reserved2[32];
+	u32 caps_lo;
+	u32 caps_hi; /* aq_fw2x_get_eee_rate() decodes supported EEE rates */
+};
+
 struct __packed hw_aq_atl_utils_mbox {
 	struct hw_aq_atl_utils_mbox_header header;
 	struct hw_atl_stats_s stats;
+	struct hw_aq_info info; /* info.caps_hi is read via self->mbox_addr */
 };
 
 /* fw2x */
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c
index 3cd4b098c8c0..98f2de9c0696 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c
@@ -40,6 +40,11 @@
 #define HW_ATL_FW2X_CTRL_ASYMMETRIC_PAUSE BIT(CTRL_ASYMMETRIC_PAUSE)
 #define HW_ATL_FW2X_CTRL_FORCE_RECONNECT  BIT(CTRL_FORCE_RECONNECT)
 
+#define HW_ATL_FW2X_CAP_EEE_1G_MASK      BIT(CAPS_HI_1000BASET_FD_EEE)
+#define HW_ATL_FW2X_CAP_EEE_2G5_MASK     BIT(CAPS_HI_2P5GBASET_FD_EEE)
+#define HW_ATL_FW2X_CAP_EEE_5G_MASK      BIT(CAPS_HI_5GBASET_FD_EEE)
+#define HW_ATL_FW2X_CAP_EEE_10G_MASK     BIT(CAPS_HI_10GBASET_FD_EEE)
+
 #define HAL_ATLANTIC_WOL_FILTERS_COUNT   8
 #define HAL_ATLANTIC_UTILS_FW2X_MSG_WOL  0x0E
 
@@ -115,6 +120,38 @@ static enum hw_atl_fw2x_rate link_speed_mask_2fw2x_ratemask(u32 speed)
 	return rate;
 }
 
+/* Convert a FW 2.x word carrying HW_ATL_FW2X_CAP_EEE_* bits (caps_hi or an
+ * MPI register value) into the driver's AQ_NIC_RATE_EEE_* mask. Bits that
+ * are not one of the four EEE rates are ignored.
+ */
+static u32 fw2x_to_eee_mask(u32 speed)
+{
+	u32 eee = 0;
+
+	eee |= (speed & HW_ATL_FW2X_CAP_EEE_1G_MASK) ? AQ_NIC_RATE_EEE_1G : 0;
+	eee |= (speed & HW_ATL_FW2X_CAP_EEE_2G5_MASK) ? AQ_NIC_RATE_EEE_2GS : 0;
+	eee |= (speed & HW_ATL_FW2X_CAP_EEE_5G_MASK) ? AQ_NIC_RATE_EEE_5G : 0;
+	eee |= (speed & HW_ATL_FW2X_CAP_EEE_10G_MASK) ? AQ_NIC_RATE_EEE_10G : 0;
+
+	return eee;
+}
+
+/* Convert a driver AQ_NIC_RATE_EEE_* mask into the matching
+ * HW_ATL_FW2X_CAP_EEE_* bits used by FW 2.x. Bits that are not one of the
+ * four EEE rates are ignored.
+ */
+static u32 eee_mask_to_fw2x(u32 speed)
+{
+	u32 caps = 0;
+
+	caps |= (speed & AQ_NIC_RATE_EEE_1G) ? HW_ATL_FW2X_CAP_EEE_1G_MASK : 0;
+	caps |= (speed & AQ_NIC_RATE_EEE_2GS) ? HW_ATL_FW2X_CAP_EEE_2G5_MASK : 0;
+	caps |= (speed & AQ_NIC_RATE_EEE_5G) ? HW_ATL_FW2X_CAP_EEE_5G_MASK : 0;
+	caps |= (speed & AQ_NIC_RATE_EEE_10G) ? HW_ATL_FW2X_CAP_EEE_10G_MASK : 0;
+
+	return caps;
+}
+
 static int aq_fw2x_set_link_speed(struct aq_hw_s *self, u32 speed)
 {
 	u32 val = link_speed_mask_2fw2x_ratemask(speed);
@@ -137,14 +174,27 @@ static void aq_fw2x_set_mpi_flow_control(struct aq_hw_s *self, u32 *mpi_state)
 		*mpi_state &= ~BIT(CAPS_HI_ASYMMETRIC_PAUSE);
 }
 
+/* Swap the EEE rate bits in *mpi_opts for the ones matching eee_speeds. */
+static void aq_fw2x_upd_eee_rate_bits(struct aq_hw_s *self, u32 *mpi_opts,
+				      u32 eee_speeds)
+{
+	const u32 eee_bits = HW_ATL_FW2X_CAP_EEE_1G_MASK |
+		HW_ATL_FW2X_CAP_EEE_2G5_MASK | HW_ATL_FW2X_CAP_EEE_5G_MASK |
+		HW_ATL_FW2X_CAP_EEE_10G_MASK;
+
+	*mpi_opts = (*mpi_opts & ~eee_bits) | eee_mask_to_fw2x(eee_speeds);
+}
+
 static int aq_fw2x_set_state(struct aq_hw_s *self,
 			     enum hal_atl_utils_fw_state_e state)
 {
 	u32 mpi_state = aq_hw_read_reg(self, HW_ATL_FW2X_MPI_CONTROL2_ADDR);
+	struct aq_nic_cfg_s *cfg = self->aq_nic_cfg;
 
 	switch (state) {
 	case MPI_INIT:
 		mpi_state &= ~BIT(CAPS_HI_LINK_DROP);
+		aq_fw2x_upd_eee_rate_bits(self, &mpi_state, cfg->eee_speeds);
 		aq_fw2x_set_mpi_flow_control(self, &mpi_state);
 		break;
 	case MPI_DEINIT:
@@ -347,6 +397,40 @@ static int aq_fw2x_set_power(struct aq_hw_s *self, unsigned int power_state,
 	return err;
 }
 
+/* Read-modify-write MPI CONTROL2 so its EEE bits match @speed; returns 0. */
+static int aq_fw2x_set_eee_rate(struct aq_hw_s *self, u32 speed)
+{
+	u32 ctrl2;
+
+	ctrl2 = aq_hw_read_reg(self, HW_ATL_FW2X_MPI_CONTROL2_ADDR);
+	aq_fw2x_upd_eee_rate_bits(self, &ctrl2, speed);
+	aq_hw_write_reg(self, HW_ATL_FW2X_MPI_CONTROL2_ADDR, ctrl2);
+	return 0;
+}
+
+/* Fill *supported_rates from mailbox caps_hi and *rate from MPI STATE2, both
+ * as AQ_NIC_RATE_EEE_* masks; a mailbox read error is returned unchanged.
+ */
+static int aq_fw2x_get_eee_rate(struct aq_hw_s *self, u32 *rate,
+				u32 *supported_rates)
+{
+	u32 caps_addr = self->mbox_addr +
+			offsetof(struct hw_aq_atl_utils_mbox, info.caps_hi);
+	u32 caps_hi;
+	int err;
+
+	err = hw_atl_utils_fw_downld_dwords(self, caps_addr, &caps_hi,
+					    sizeof(caps_hi) / sizeof(u32));
+	if (err)
+		return err;
+
+	*supported_rates = fw2x_to_eee_mask(caps_hi);
+	*rate = fw2x_to_eee_mask(aq_hw_read_reg(self,
+						HW_ATL_FW2X_MPI_STATE2_ADDR));
+
+	return 0;
+}
+
 static int aq_fw2x_renegotiate(struct aq_hw_s *self)
 {
 	u32 mpi_opts = aq_hw_read_reg(self, HW_ATL_FW2X_MPI_CONTROL2_ADDR);
@@ -380,5 +464,7 @@ const struct aq_fw_ops aq_fw_2x_ops = {
 	.update_link_status = aq_fw2x_update_link_status,
 	.update_stats = aq_fw2x_update_stats,
 	.set_power = aq_fw2x_set_power,
+	.set_eee_rate = aq_fw2x_set_eee_rate,
+	.get_eee_rate = aq_fw2x_get_eee_rate,
 	.set_flow_control   = aq_fw2x_set_flow_control,
 };
-- 
2.7.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox