Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH 3/3] netfilter: nft_compat: use-after-free when deleting targets
From: Pablo Neira Ayuso @ 2019-02-13 17:47 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190213174758.17275-1-pablo@netfilter.org>

Fetch pointer to module before target object is released.

Fixes: 29e3880109e3 ("netfilter: nf_tables: fix use-after-free when deleting compat expressions")
Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables")
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nft_compat.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index fe64df848365..0a4bad55a8aa 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -315,6 +315,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 {
 	struct xt_target *target = expr->ops->data;
 	void *info = nft_expr_priv(expr);
+	struct module *me = target->me;
 	struct xt_tgdtor_param par;
 
 	par.net = ctx->net;
@@ -325,7 +326,7 @@ nft_target_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
 		par.target->destroy(&par);
 
 	if (nft_xt_put(container_of(expr->ops, struct nft_xt, ops)))
-		module_put(target->me);
+		module_put(me);
 }
 
 static int nft_extension_dump_info(struct sk_buff *skb, int attr,
-- 
2.11.0


^ permalink raw reply related

* [PATCH 2/3] ipvs: fix dependency on nf_defrag_ipv6
From: Pablo Neira Ayuso @ 2019-02-13 17:47 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190213174758.17275-1-pablo@netfilter.org>

From: Andrea Claudi <aclaudi@redhat.com>

ipvs relies on nf_defrag_ipv6 module to manage IPv6 fragmentation,
but lacks proper Kconfig dependencies and does not explicitly
request defrag features.

As a result, if netfilter hooks are not loaded, when IPv6 fragmented
packet are handled by ipvs only the first fragment makes through.

Fix it properly declaring the dependency on Kconfig and registering
netfilter hooks on ip_vs_add_service() and ip_vs_new_dest().

Reported-by: Li Shuang <shuali@redhat.com>
Signed-off-by: Andrea Claudi <aclaudi@redhat.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/ipvs/Kconfig      |  1 +
 net/netfilter/ipvs/ip_vs_core.c | 10 ++++------
 net/netfilter/ipvs/ip_vs_ctl.c  | 10 ++++++++++
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index cad48d07c818..8401cefd9f65 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -29,6 +29,7 @@ config	IP_VS_IPV6
 	bool "IPv6 support for IPVS"
 	depends on IPV6 = y || IP_VS = IPV6
 	select IP6_NF_IPTABLES
+	select NF_DEFRAG_IPV6
 	---help---
 	  Add IPv6 support to IPVS.
 
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index fe9abf3cc10a..235205c93e14 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1536,14 +1536,12 @@ ip_vs_try_to_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
 		/* sorry, all this trouble for a no-hit :) */
 		IP_VS_DBG_PKT(12, af, pp, skb, iph->off,
 			      "ip_vs_in: packet continues traversal as normal");
-		if (iph->fragoffs) {
-			/* Fragment that couldn't be mapped to a conn entry
-			 * is missing module nf_defrag_ipv6
-			 */
-			IP_VS_DBG_RL("Unhandled frag, load nf_defrag_ipv6\n");
+
+		/* Fragment couldn't be mapped to a conn entry */
+		if (iph->fragoffs)
 			IP_VS_DBG_PKT(7, af, pp, skb, iph->off,
 				      "unhandled fragment");
-		}
+
 		*verdict = NF_ACCEPT;
 		return 0;
 	}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 7d6318664eb2..86afacb07e5f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -43,6 +43,7 @@
 #ifdef CONFIG_IP_VS_IPV6
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #endif
 #include <net/route.h>
 #include <net/sock.h>
@@ -895,6 +896,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 {
 	struct ip_vs_dest *dest;
 	unsigned int atype, i;
+	int ret = 0;
 
 	EnterFunction(2);
 
@@ -905,6 +907,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
 			atype & IPV6_ADDR_LINKLOCAL) &&
 			!__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
 			return -EINVAL;
+
+		ret = nf_defrag_ipv6_enable(svc->ipvs->net);
+		if (ret)
+			return ret;
 	} else
 #endif
 	{
@@ -1228,6 +1234,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
 			ret = -EINVAL;
 			goto out_err;
 		}
+
+		ret = nf_defrag_ipv6_enable(ipvs->net);
+		if (ret)
+			goto out_err;
 	}
 #endif
 
-- 
2.11.0


^ permalink raw reply related

* Re: [PATCH rdma-next 0/2] Followup changes to mlx5-next branch
From: Leon Romanovsky @ 2019-02-13 17:53 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: RDMA mailing list, Moni Shoua, Saeed Mahameed, linux-netdev
In-Reply-To: <20190211115608.22677-1-leon@kernel.org>

[-- Attachment #1: Type: text/plain, Size: 546 bytes --]

On Mon, Feb 11, 2019 at 01:56:06PM +0200, Leon Romanovsky wrote:
> From: Leon Romanovsky <leonro@mellanox.com>
>
> Hi,
>
> There are two small cleanups needed after prefetch MR code was merged.
>
> Thanks
>
> Leon Romanovsky (2):
>   net/mlx5: Align ODP capability function with netdev coding style

This patch was applied.

>   net/mlx5: Factor out HCA capabilities functions

This will be resent.
>
>  .../net/ethernet/mellanox/mlx5/core/main.c    | 62 ++++++++++++-------
>  1 file changed, 39 insertions(+), 23 deletions(-)
>
> --
> 2.19.1
>

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 801 bytes --]

^ permalink raw reply

* WARNING: locking bug in try_to_grab_pending
From: syzbot @ 2019-02-13 17:56 UTC (permalink / raw)
  To: davem, kuznet, linux-kernel, netdev, syzkaller-bugs, yoshfuji

Hello,

syzbot found the following crash on:

HEAD commit:    6663cf821c13 flow_offload: Fix flow action infrastructure
git tree:       net-next
console output: https://syzkaller.appspot.com/x/log.txt?x=15927de4c00000
kernel config:  https://syzkaller.appspot.com/x/.config?x=8572a6e4661225f4
dashboard link: https://syzkaller.appspot.com/bug?extid=2b713236b28823cd4dff
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=13e932a8c00000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+2b713236b28823cd4dff@syzkaller.appspotmail.com

Started in network mode
Own node identity 7f000001, cluster identity 4711
Enabled bearer <udp:syz0>, priority 10
------------[ cut here ]------------
DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)
WARNING: CPU: 1 PID: 3476 at kernel/locking/lockdep.c:3315  
__lock_acquire+0x13bf/0x4700 kernel/locking/lockdep.c:3315
Kernel panic - not syncing: panic_on_warn set ...
CPU: 1 PID: 3476 Comm: kworker/1:2 Not tainted 5.0.0-rc5+ #59
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
Workqueue:  addrconf_dad_work (ipv6_addrconf)
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x172/0x1f0 lib/dump_stack.c:113
  panic+0x2cb/0x65c kernel/panic.c:214
  __warn.cold+0x20/0x45 kernel/panic.c:571
  report_bug+0x263/0x2b0 lib/bug.c:186
  fixup_bug arch/x86/kernel/traps.c:178 [inline]
  fixup_bug arch/x86/kernel/traps.c:173 [inline]
  do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271
  do_invalid_op+0x37/0x50 arch/x86/kernel/traps.c:290
  invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:973
RIP: 0010:__lock_acquire+0x13bf/0x4700 kernel/locking/lockdep.c:3315
Code: 8b 1d e9 c7 05 08 45 85 db 0f 85 d6 f4 ff ff 48 c7 c6 a0 a4 6b 87 48  
c7 c7 00 79 6b 87 44 89 9c 24 98 00 00 00 e8 3f 2f ec ff <0f> 0b 44 8b 9c  
24 98 00 00 00 e9 af f4 ff ff 8b 3d 5c 7a fe 08 85
RSP: 0018:ffff88809b5478b0 EFLAGS: 00010086
RAX: 0000000000000000 RBX: 00000000005e00a8 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffffffff815a8f66 RDI: ffffed10136a8f08
RBP: ffff88809b547a78 R08: ffff88809b58e680 R09: fffffbfff1133341
R10: fffffbfff1133340 R11: ffffffff88999a03 R12: ffff88809b58efb8
R13: ffff88809b58efc2 R14: 00000000005e00a8 R15: ffff88809b58e680
  lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841
  __raw_spin_lock include/linux/spinlock_api_smp.h:142 [inline]
  _raw_spin_lock+0x2f/0x40 kernel/locking/spinlock.c:144
  spin_lock include/linux/spinlock.h:329 [inline]
  try_to_grab_pending+0x129/0x710 kernel/workqueue.c:1259
  __cancel_work kernel/workqueue.c:3115 [inline]
  cancel_delayed_work+0x82/0x2d0 kernel/workqueue.c:3144
  addrconf_del_dad_work+0x1c/0x50 net/ipv6/addrconf.c:312
  addrconf_dad_stop+0x129/0x2e0 net/ipv6/addrconf.c:2040
  addrconf_dad_begin net/ipv6/addrconf.c:3973 [inline]
  addrconf_dad_work+0xdbf/0x1150 net/ipv6/addrconf.c:4062
  process_one_work+0x98e/0x1790 kernel/workqueue.c:2173
  worker_thread+0x98/0xe40 kernel/workqueue.c:2319
  kthread+0x357/0x430 kernel/kthread.c:246
  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352
Kernel Offset: disabled
Rebooting in 86400 seconds..


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with  
syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* general protection fault in tc_ctl_chain
From: syzbot @ 2019-02-13 17:56 UTC (permalink / raw)
  To: davem, jhs, jiri, linux-kernel, netdev, syzkaller-bugs,
	xiyou.wangcong

Hello,

syzbot found the following crash on:

HEAD commit:    bd3606c29fcc rocker: Remove port_attr_bridge_flags_get ass..
git tree:       net-next
console output: https://syzkaller.appspot.com/x/log.txt?x=121bbf87400000
kernel config:  https://syzkaller.appspot.com/x/.config?x=8572a6e4661225f4
dashboard link: https://syzkaller.appspot.com/bug?extid=eff9cae063e4b633c6c1
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=11cbd404c00000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=17fc80d4c00000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+eff9cae063e4b633c6c1@syzkaller.appspotmail.com

audit: type=1800 audit(1550028783.638:30): pid=7517 uid=0 auid=4294967295  
ses=4294967295 subj==unconfined op=collect_data cause=failed(directio)  
comm="startpar" name="rmnologin" dev="sda1" ino=2423 res=0
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 0 PID: 7669 Comm: syz-executor789 Not tainted 5.0.0-rc5+ #60
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011
RIP: 0010:__lock_acquire+0x8df/0x4700 kernel/locking/lockdep.c:3215
Code: 28 00 00 00 0f 85 35 27 00 00 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f  
5d c3 48 b8 00 00 00 00 00 fc ff df 4c 89 e2 48 c1 ea 03 <80> 3c 02 00 0f  
85 dc 27 00 00 49 81 3c 24 20 45 9a 89 0f 84 03 f8
RSP: 0018:ffff88808b44f180 EFLAGS: 00010006
RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 000000000000000c RSI: 0000000000000000 RDI: 0000000000000060
RBP: ffff88808b44f350 R08: 0000000000000001 R09: 0000000000000001
R10: ffff88808b44f570 R11: 0000000000000001 R12: 0000000000000060
R13: 0000000000000000 R14: 0000000000000000 R15: ffff8880915d4680
FS:  0000000001ef6880(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000080 CR3: 0000000093549000 CR4: 00000000001406f0
Call Trace:
  lock_acquire+0x16f/0x3f0 kernel/locking/lockdep.c:3841
  __mutex_lock_common kernel/locking/mutex.c:925 [inline]
  __mutex_lock+0xf7/0x1310 kernel/locking/mutex.c:1072
  mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:1087
  tc_ctl_chain+0x42f/0x11a0 net/sched/cls_api.c:2812
  rtnetlink_rcv_msg+0x465/0xb00 net/core/rtnetlink.c:5192
  netlink_rcv_skb+0x17a/0x460 net/netlink/af_netlink.c:2485
  rtnetlink_rcv+0x1d/0x30 net/core/rtnetlink.c:5210
  netlink_unicast_kernel net/netlink/af_netlink.c:1310 [inline]
  netlink_unicast+0x536/0x720 net/netlink/af_netlink.c:1336
  netlink_sendmsg+0x8ae/0xd70 net/netlink/af_netlink.c:1925
  sock_sendmsg_nosec net/socket.c:621 [inline]
  sock_sendmsg+0xdd/0x130 net/socket.c:631
  ___sys_sendmsg+0x806/0x930 net/socket.c:2136
  __sys_sendmsg+0x105/0x1d0 net/socket.c:2174
  __do_sys_sendmsg net/socket.c:2183 [inline]
  __se_sys_sendmsg net/socket.c:2181 [inline]
  __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2181
  do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
  entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x4400d9
Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7  
48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff  
ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00
RSP: 002b:00007ffd09281608 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 00000000004400d9
RDX: 0000000000000000 RSI: 0000000020000080 RDI: 0000000000000003
RBP: 00000000006ca018 R08: 0000000000000000 R09: 00000000004002c8
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000401960
R13: 00000000004019f0 R14: 0000000000000000 R15: 0000000000000000
Modules linked in:
---[ end trace 25ab48d993ef9249 ]---
RIP: 0010:__lock_acquire+0x8df/0x4700 kernel/locking/lockdep.c:3215
Code: 28 00 00 00 0f 85 35 27 00 00 48 8d 65 d8 5b 41 5c 41 5d 41 5e 41 5f  
5d c3 48 b8 00 00 00 00 00 fc ff df 4c 89 e2 48 c1 ea 03 <80> 3c 02 00 0f  
85 dc 27 00 00 49 81 3c 24 20 45 9a 89 0f 84 03 f8
RSP: 0018:ffff88808b44f180 EFLAGS: 00010006
RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 000000000000000c RSI: 0000000000000000 RDI: 0000000000000060
RBP: ffff88808b44f350 R08: 0000000000000001 R09: 0000000000000001
R10: ffff88808b44f570 R11: 0000000000000001 R12: 0000000000000060
R13: 0000000000000000 R14: 0000000000000000 R15: ffff8880915d4680
FS:  0000000001ef6880(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020000080 CR3: 0000000093549000 CR4: 00000000001406f0


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with  
syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* Re: [PATCH iproute2] lib/libnetlink: ensure a minimum of 32KB for the buffer used in rtnl_recvmsg()
From: Eric Dumazet @ 2019-02-13 17:58 UTC (permalink / raw)
  To: Phil Sutter, Eric Dumazet, David Ahern, Stephen Hemminger, netdev,
	Hangbin Liu
In-Reply-To: <20190213174622.GD19329@orbyte.nwl.cc>



On 02/13/2019 09:46 AM, Phil Sutter wrote:
> Hi Eric,
> 
> On Tue, Feb 12, 2019 at 05:58:41PM -0800, Eric Dumazet wrote:
>> In the past, we tried to increase the buffer size up to 32 KB in order
>> to reduce number of syscalls per dump.
>>
>> Commit 2d34851cd341 ("lib/libnetlink: re malloc buff if size is not enough")
>> brought the size back to 4KB because the kernel can not know the application
>> is ready to receive bigger requests.
>>
>> See kernel commits 9063e21fb026 ("netlink: autosize skb lengthes") and
>> d35c99ff77ec ("netlink: do not enter direct reclaim from netlink_dump()")
>> for more details.
> 
> Wouldn't it be better if the kernel recognized MSG_TRUNC and allocated a
> buffer large enough to hold the full message in that case? I have no
> idea how hard that would be to implement, but calling recvmsg() with
> MSG_TRUNC set and not getting the full message length in return is not
> quite what one expects after reading recvmsg(2).
> 

I am not sure what you are suggesting.

The buffer is already in the kernel, this is the skb in the receive queue.
Its size is unknown... We only can guess.

We can not change old kernels, and new iproute2/ss commands need to work with old kernels.

We can not change MSG_TRUNC semantic.

Adding another MSG_be_gentle_do_not_consume_skb_if_user_size_is_too_small wont solve the issue for old kernels.


^ permalink raw reply

* Re: [PATCH] net: phy: at803x: disable delay only for RGMII mode
From: Florian Fainelli @ 2019-02-13 17:59 UTC (permalink / raw)
  To: Niklas Cassel, Marc Gonzalez
  Cc: Andrew Lunn, Vinod Koul, David S Miller, linux-arm-msm,
	Bjorn Andersson, netdev, Nori, Sekhar, Peter Ujfalusi, hkallweit1
In-Reply-To: <20190213174034.GA6954@centauri.lan>

On 2/13/19 9:40 AM, Niklas Cassel wrote:
> On Wed, Feb 13, 2019 at 02:40:18PM +0100, Marc Gonzalez wrote:
>> On 13/02/2019 14:29, Andrew Lunn wrote:
>>
>>>> So we have these modes:
>>>>
>>>> PHY_INTERFACE_MODE_RGMII: TX and RX delays disabled
>>>> PHY_INTERFACE_MODE_RGMII_ID: TX and RX delays enabled
>>>> PHY_INTERFACE_MODE_RGMII_RXID: RX delay enabled, TX delay disabled
>>>> PHY_INTERFACE_MODE_RGMII_TXID: TX delay enabled, RX delay disabled
>>>>
>>>> What I don't like with this patch, is that if we specify phy-mode
>>>> PHY_INTERFACE_MODE_RGMII_TXID, this patch will enable TX delay,
>>>> but RX delay will not be explicitly set.
>>>
>>> That is not the behaviour we want. It is best to assume the device is
>>> in a random state, and correctly enable/disable all delays as
>>> requested. Only leave the hardware alone if PHY_INTERFACE_MODE_NA is
>>> used.
>>
>> That's what my patch did:
>> https://www.spinics.net/lists/netdev/msg445053.html
>>
>> But see Florian's remarks:
>> https://www.spinics.net/lists/netdev/msg445133.html
> 
> Hello Marc,
> 
> I saw that comment from Florian. However that was way back in 2017.
> Maybe the phy-modes were not as well defined back then?

The definition of the 'phy-mode' was clarified to be understood from the
perspective of the PHY device (hence the name) after we had several
fruitful exchanges with Marc (at least from my perspective), but since
the definition was not clear before, there is a high chance of finding
DTS/DTBs out there with the 'phy-mode' property understood from the
MAC's perspective, which would now be wrong.

> 
> Andrew recently suggested to fix the driver so that it conforms with the
> phy-modes, and fix any SoC that specified an incorrect phy-mode in DT
> and thus relied upon the broken behavior of the PHY driver:
> https://www.spinics.net/lists/netdev/msg445133.html
> 
> 
> So, I've rebased your old patch, see attachment.
> I suggest that Peter test it on am335x-evm.
> 
> am335x-evm appears to rely on the current broken behavior of the PHY
> driver, so we will probably need to fix the am335x-evm according to this:
> https://www.spinics.net/lists/netdev/msg445117.html
> and merge that as well.
> 
> 
> Andrew, Florian, do you both agree?

In my reply to Marc, there was a concern that while am335x-evm was
identified and reported to be broken after fixing the PHY driver, there
could be platforms out there that we have little to no visibility that
would most likely be equally broken. That concern still exists, and I
don't think there is anything we can do to even assess the size of the
problem unless we attempt to fix it, so maybe we should attempt to fix that.

There was a suggestion to Marc that one way to possibly "ignore" an
incorrectly broken 'phy-mode' property would be to allow specifying
rx/tx delay properties such that if the driver obtained its
phy_interface_t, yet still parsed rx/tx delays, the rx/tx delays would
take precedence, and we could possibly derive some sort of a "more
correct" phy_interface_t that we could assign back to phydev->interface
and issue a warning about that.

Another possible way to resolve that could be to introduce a 'mac-mode'
property, which must be strictly compatible with specifying a 'phy-mode'
property. For instance:

- MAC specifies mac-mode = 'rgmii-id', then the PHY must have phy-mode =
'rmgii' since the MAC is taking of inserting both RX and TX delays,
reverse also applies

- MAC specifies mac-mode = 'rgmii-txid', then the PHY must have phy-mode
= 'rgmii-rxid' because the MAC adds the TX delay, but the PHY should
insert the delay on the RX lines, reverse also applies

Because there is usually (not always, DSA is an exception) a 1:1 mapping
between MAC and PHY devices we could look up the 'mac-mode' property in
the MAC in the PHY library code and make sure that we have a compatible
matrix and if we do not, maybe pass something like PHY_INTERFACE_MODE_NA
such that the driver retains its settings.

Maybe another way to approach this is if we assume that the PHY comes up
configured correctly by the boot loader, or upon power on reset, we add
some PHY driver methods that allow us to determine the RGMII mode in
which a PHY is and that tells us whether we are compatible with the
MAC's phy_interface_t upon connection. We check both at connect() time
and if something does not look right, we flip the meaning of
phy_interface_t.

None of those solutions are entirely fool proof, but at least we might
be able to detect incorrect combinations, yet still make them work by
reversing the meaning of the 'phy-mode' property given information at hand.

Let me know if none of that makes sense and this just looks like yet
another brain dump.

Wonderful RGMII...
-- 
Florian

^ permalink raw reply

* Re: [PATCH iproute2 net-next v2 3/4] ss: Buffer raw fields first, then render them as a table
From: Eric Dumazet @ 2019-02-13 18:01 UTC (permalink / raw)
  To: Stefano Brivio; +Cc: Stephen Hemminger, netdev, Sabrina Dubroca, David Ahern
In-Reply-To: <20190213183831.24b3cc9a@redhat.com>



On 02/13/2019 09:38 AM, Stefano Brivio wrote:
>
> What would you suggest in that case, single whitespaces? Tabs?
> 

We have spaces at the moment, I would leave them.


^ permalink raw reply

* Re: [PATCH net-next 3/3] enetc: Add ENETC PF level external MDIO support
From: Andrew Lunn @ 2019-02-13 18:13 UTC (permalink / raw)
  To: Claudiu Manoil
  Cc: Shawn Guo, Li Yang, David S . Miller, devicetree,
	alexandru.marginean, linux-kernel, linux-arm-kernel, netdev
In-Reply-To: <1550055743-15542-4-git-send-email-claudiu.manoil@nxp.com>

On Wed, Feb 13, 2019 at 01:02:23PM +0200, Claudiu Manoil wrote:
> Each ENETC PF has its own MDIO interface, the corresponding
> MDIO registers are mapped in the ENETC's Port register block.
> The current patch adds a driver for these PF level MDIO buses,
> so that each PF can manage directly its own external link.
> 
> Signed-off-by: Alex Marginean <alexandru.marginean@nxp.com>
> Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
> ---
>  drivers/net/ethernet/freescale/enetc/Makefile     |   3 +-
>  drivers/net/ethernet/freescale/enetc/enetc_mdio.c | 196 ++++++++++++++++++++++
>  drivers/net/ethernet/freescale/enetc/enetc_pf.c   |  13 ++
>  drivers/net/ethernet/freescale/enetc/enetc_pf.h   |   6 +
>  4 files changed, 217 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/net/ethernet/freescale/enetc/enetc_mdio.c
> 
> diff --git a/drivers/net/ethernet/freescale/enetc/Makefile b/drivers/net/ethernet/freescale/enetc/Makefile
> index 6976602..7139e41 100644
> --- a/drivers/net/ethernet/freescale/enetc/Makefile
> +++ b/drivers/net/ethernet/freescale/enetc/Makefile
> @@ -1,6 +1,7 @@
>  # SPDX-License-Identifier: GPL-2.0
>  obj-$(CONFIG_FSL_ENETC) += fsl-enetc.o
> -fsl-enetc-$(CONFIG_FSL_ENETC) += enetc.o enetc_cbdr.o enetc_ethtool.o
> +fsl-enetc-$(CONFIG_FSL_ENETC) += enetc.o enetc_cbdr.o enetc_ethtool.o \
> +				 enetc_mdio.o
>  fsl-enetc-$(CONFIG_PCI_IOV) += enetc_msg.o
>  fsl-enetc-objs := enetc_pf.o $(fsl-enetc-y)
>  
> diff --git a/drivers/net/ethernet/freescale/enetc/enetc_mdio.c b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
> new file mode 100644
> index 0000000..e71b4fd
> --- /dev/null
> +++ b/drivers/net/ethernet/freescale/enetc/enetc_mdio.c
> @@ -0,0 +1,196 @@
> +// SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)
> +/* Copyright 2019 NXP */
> +
> +#include <linux/mdio.h>
> +#include <linux/of_mdio.h>
> +
> +#include "enetc_pf.h"
> +
> +struct enetc_mdio_regs {
> +	u32	mdio_cfg;	/* MDIO configuration and status */
> +	u32	mdio_ctl;	/* MDIO control */
> +	u32	mdio_data;	/* MDIO data */
> +	u32	mdio_addr;	/* MDIO address */
> +};
> +
> +#define bus_to_enetc_regs(bus)	(struct enetc_mdio_regs __iomem *)((bus)->priv)
> +
> +#define ENETC_MDIO_REG_OFFSET	0x1c00
> +#define ENETC_MDC_DIV		258
> +#define MDIO_CFG_CLKDIV(x)	((((x) >> 1) & 0xff) << 8)
> +#define MDIO_CFG_BSY		BIT(0)
> +#define MDIO_CFG_RD_ER		BIT(1)
> +#define MDIO_CFG_ENC		BIT(6)
> +#define MDIO_CFG_NEG		BIT(23)
> +#define MDIO_CTL_DEV_ADDR(x)	((x) & 0x1f)
> +#define MDIO_CTL_PORT_ADDR(x)	(((x) & 0x1f) << 5)
> +#define MDIO_CTL_READ		BIT(15)
> +#define MDIO_DATA(x)		((x) & 0xffff)
> +
> +#define TIMEOUT	1000
> +static int enetc_wait_complete(struct device *dev,
> +			       struct enetc_mdio_regs __iomem *regs)
> +{
> +	unsigned int timeout;
> +
> +	timeout = TIMEOUT;
> +	while ((enetc_rd_reg(&regs->mdio_cfg) & MDIO_CFG_BSY) && timeout) {
> +		cpu_relax();
> +		timeout--;
> +	}
> +
> +	if (!timeout) {
> +		dev_err(dev, "timeout waiting for bus to be free\n");
> +		return -ETIMEDOUT;
> +	}

Hi Claudiu

readx_poll_timeout()

> +
> +	return 0;
> +}
> +
> +static int enetc_mdio_write(struct mii_bus *bus, int phy_id, int regnum,
> +			    u16 value)
> +{
> +	struct enetc_mdio_regs __iomem *regs = bus_to_enetc_regs(bus);
> +	u32 mdio_ctl, mdio_cfg;
> +	u16 dev_addr;
> +	int ret;
> +
> +	mdio_cfg = MDIO_CFG_CLKDIV(ENETC_MDC_DIV) | MDIO_CFG_NEG;

What does MDIO_CFG_NEG mean?

> +	if (regnum & MII_ADDR_C45) {
> +		/* clause 45 */

Does the comment add anything useful?

> +		dev_addr = (regnum >> 16) & 0x1f;
> +		mdio_cfg |= MDIO_CFG_ENC;

Maybe MDIO_CFG_ENC could be called MDIO_CFG_C45? Assuming that is what
it actually means?

> +int enetc_mdio_probe(struct enetc_pf *pf)
> +{
> +	struct device *dev = &pf->si->pdev->dev;
> +	struct enetc_mdio_regs __iomem *regs;
> +	struct mii_bus *bus;
> +	int ret;
> +
> +	bus = mdiobus_alloc_size(sizeof(regs));
> +	if (!bus)
> +		return -ENOMEM;
> +
> +	bus->name = "Freescale ENETC MDIO Bus";
> +	bus->read = enetc_mdio_read;
> +	bus->write = enetc_mdio_write;
> +	bus->parent = dev;
> +	snprintf(bus->id, MII_BUS_ID_SIZE, "%s", dev_name(dev));
> +
> +	/* store the enetc mdio base address for this bus */
> +	regs = pf->si->hw.port + ENETC_MDIO_REG_OFFSET;
> +	bus->priv = regs;
> +
> +	ret = of_mdiobus_register(bus, dev->of_node);

It is a good idea to have an mdio node which contains the list of
PHYs. You can get into odd situations if you don't do that.

>  
> @@ -770,12 +771,23 @@ static int enetc_of_get_phy(struct enetc_ndev_priv *priv)
>  		priv->phy_node = of_node_get(np);
>  	}
>  
> +	if (!of_phy_is_fixed_link(np)) {
> +		err = enetc_mdio_probe(pf);
> +		if (err) {
> +			dev_err(priv->dev, "MDIO bus registration failed\n");

enetc_mdio_probe() already prints an error message. You don't really
need both.

     Andrew

^ permalink raw reply

* Re: [PATCH net-next 2/3] arm64: dts: fsl: ls1028a-rdb: Add ENETC external eth ports for the LS1028A RDB board
From: Andrew Lunn @ 2019-02-13 18:15 UTC (permalink / raw)
  To: Claudiu Manoil
  Cc: Shawn Guo, Li Yang, David S . Miller, devicetree,
	alexandru.marginean, linux-kernel, linux-arm-kernel, netdev
In-Reply-To: <1550055743-15542-3-git-send-email-claudiu.manoil@nxp.com>

On Wed, Feb 13, 2019 at 01:02:22PM +0200, Claudiu Manoil wrote:
> The LS1028A RDB board features an Atheros PHY connected over
> SGMII to the ENETC PF0 (or Port0).  ENETC Port1 (PF1) has no
> external connection on this board, so it can be disabled for now.
> 
> Signed-off-by: Alex Marginean <alexandru.marginean@nxp.com>
> Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
> ---
>  arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts | 15 +++++++++++++++
>  1 file changed, 15 insertions(+)
> 
> diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts
> index fdeb417..c8487893 100644
> --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts
> +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-rdb.dts
> @@ -71,3 +71,18 @@
>  &duart1 {
>  	status = "okay";
>  };
> +
> +&enetc_port0 {
> +	phy-handle = <&sgmii_phy0>;
> +	phy-connection-type = "sgmii";
> +	#address-cells = <1>;
> +	#size-cells = <0>;
> +
> +	sgmii_phy0: ethernet-phy@2 {
> +		reg = <0x2>;
> +	};
> +};
> +

Hi Claudiu

It is better to use:

&enetc_port0 {
	phy-handle = <&sgmii_phy0>;
	phy-connection-type = "sgmii";
	#address-cells = <1>;
	#size-cells = <0>;

	mdio {
		sgmii_phy0: ethernet-phy@2 {
			reg = <0x2>;
		};
	};
};

	Andrew

^ permalink raw reply

* Re: [PATCH net] selftests: fix timestamping Makefile
From: shuah @ 2019-02-13 18:17 UTC (permalink / raw)
  To: Deepa Dinamani; +Cc: willemb, netdev, linux-kselftest, shuah
In-Reply-To: <20190213170914.11991-1-deepa.kernel@gmail.com>

On 2/13/19 10:09 AM, Deepa Dinamani wrote:
> The clean target in the makefile conflicts with the generic
> kselftests lib.mk, and fails to properly remove the compiled
> test programs.
> 
> Remove the redundant rule, the TEST_GEN_FILES will be already
> removed by the CLEAN macro in lib.mk.
> 
> Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
> ---
> 
> * Changes since v1: as per review comments
> 
>   tools/testing/selftests/networking/timestamping/Makefile | 3 ---
>   1 file changed, 3 deletions(-)
> 
> diff --git a/tools/testing/selftests/networking/timestamping/Makefile b/tools/testing/selftests/networking/timestamping/Makefile
> index 9050eeea5f5f..1de8bd8ccf5d 100644
> --- a/tools/testing/selftests/networking/timestamping/Makefile
> +++ b/tools/testing/selftests/networking/timestamping/Makefile
> @@ -9,6 +9,3 @@ all: $(TEST_PROGS)
>   top_srcdir = ../../../../..
>   KSFT_KHDR_INSTALL := 1
>   include ../../lib.mk
> -
> -clean:
> -	rm -fr $(TEST_GEN_FILES)
> 

Thanks for the patch.

Acked-by: Shuah Khan <shuah@kernel.org>

thanks,
-- Shuah

^ permalink raw reply

* [PATCH v2 bpf-next 2/2] tools: sync uapi/linux/if_link.h header
From: Andrii Nakryiko @ 2019-02-13 18:25 UTC (permalink / raw)
  To: andrii.nakryiko, netdev, kernel-team, yhs, ast, kafai, daniel,
	david.laight, acme
  Cc: Andrii Nakryiko
In-Reply-To: <20190213182554.2763867-1-andriin@fb.com>

Syncing if_link.h that got out of sync.

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
---
 tools/include/uapi/linux/if_link.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h
index d6533828123a..5b225ff63b48 100644
--- a/tools/include/uapi/linux/if_link.h
+++ b/tools/include/uapi/linux/if_link.h
@@ -925,6 +925,7 @@ enum {
 enum {
 	LINK_XSTATS_TYPE_UNSPEC,
 	LINK_XSTATS_TYPE_BRIDGE,
+	LINK_XSTATS_TYPE_BOND,
 	__LINK_XSTATS_TYPE_MAX
 };
 #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1)
-- 
2.17.1


^ permalink raw reply related

* [PATCH v2 bpf-next 0/2] tools/bpf: smaller clean ups
From: Andrii Nakryiko @ 2019-02-13 18:25 UTC (permalink / raw)
  To: andrii.nakryiko, netdev, kernel-team, yhs, ast, kafai, daniel,
	david.laight, acme
  Cc: Andrii Nakryiko

This patchset replaces bzero() with memset() and syncs if_link.h header
to suppress unsynchronized headers warning.

Andrii Nakryiko (2):
  tools/bpf: replace bzero with memset
  tools: sync uapi/linux/if_link.h header

 tools/include/uapi/linux/if_link.h |  1 +
 tools/lib/bpf/bpf.c                | 48 +++++++++++++++---------------
 tools/lib/bpf/btf.c                |  5 ++--
 tools/lib/bpf/libbpf.c             |  5 ++--
 4 files changed, 29 insertions(+), 30 deletions(-)

-- 
2.17.1


^ permalink raw reply

* [PATCH v2 bpf-next 1/2] tools/bpf: replace bzero with memset
From: Andrii Nakryiko @ 2019-02-13 18:25 UTC (permalink / raw)
  To: andrii.nakryiko, netdev, kernel-team, yhs, ast, kafai, daniel,
	david.laight, acme
  Cc: Andrii Nakryiko
In-Reply-To: <20190213182554.2763867-1-andriin@fb.com>

bzero() call is deprecated and superseded by memset().

Signed-off-by: Andrii Nakryiko <andriin@fb.com>
Reported-by: David Laight <david.laight@aculab.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
---
 tools/lib/bpf/bpf.c    | 48 +++++++++++++++++++++---------------------
 tools/lib/bpf/btf.c    |  5 ++---
 tools/lib/bpf/libbpf.c |  5 ++---
 3 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index a5261f39e2bd..9cd015574e83 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -22,7 +22,7 @@
  */
 
 #include <stdlib.h>
-#include <strings.h>
+#include <string.h>
 #include <memory.h>
 #include <unistd.h>
 #include <asm/unistd.h>
@@ -228,7 +228,7 @@ int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr,
 
 	name_len = load_attr->name ? strlen(load_attr->name) : 0;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.prog_type = load_attr->prog_type;
 	attr.expected_attach_type = load_attr->expected_attach_type;
 	attr.insn_cnt = (__u32)load_attr->insns_cnt;
@@ -340,7 +340,7 @@ int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns,
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.prog_type = type;
 	attr.insn_cnt = (__u32)insns_cnt;
 	attr.insns = ptr_to_u64(insns);
@@ -360,7 +360,7 @@ int bpf_map_update_elem(int fd, const void *key, const void *value,
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
 	attr.key = ptr_to_u64(key);
 	attr.value = ptr_to_u64(value);
@@ -373,7 +373,7 @@ int bpf_map_lookup_elem(int fd, const void *key, void *value)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
 	attr.key = ptr_to_u64(key);
 	attr.value = ptr_to_u64(value);
@@ -385,7 +385,7 @@ int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
 	attr.key = ptr_to_u64(key);
 	attr.value = ptr_to_u64(value);
@@ -398,7 +398,7 @@ int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
 	attr.key = ptr_to_u64(key);
 	attr.value = ptr_to_u64(value);
@@ -410,7 +410,7 @@ int bpf_map_delete_elem(int fd, const void *key)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
 	attr.key = ptr_to_u64(key);
 
@@ -421,7 +421,7 @@ int bpf_map_get_next_key(int fd, const void *key, void *next_key)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.map_fd = fd;
 	attr.key = ptr_to_u64(key);
 	attr.next_key = ptr_to_u64(next_key);
@@ -433,7 +433,7 @@ int bpf_obj_pin(int fd, const char *pathname)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.pathname = ptr_to_u64((void *)pathname);
 	attr.bpf_fd = fd;
 
@@ -444,7 +444,7 @@ int bpf_obj_get(const char *pathname)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.pathname = ptr_to_u64((void *)pathname);
 
 	return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr));
@@ -455,7 +455,7 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type,
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.target_fd	   = target_fd;
 	attr.attach_bpf_fd = prog_fd;
 	attr.attach_type   = type;
@@ -468,7 +468,7 @@ int bpf_prog_detach(int target_fd, enum bpf_attach_type type)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.target_fd	 = target_fd;
 	attr.attach_type = type;
 
@@ -479,7 +479,7 @@ int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.target_fd	 = target_fd;
 	attr.attach_bpf_fd = prog_fd;
 	attr.attach_type = type;
@@ -493,7 +493,7 @@ int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags,
 	union bpf_attr attr;
 	int ret;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.query.target_fd	= target_fd;
 	attr.query.attach_type	= type;
 	attr.query.query_flags	= query_flags;
@@ -514,7 +514,7 @@ int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size,
 	union bpf_attr attr;
 	int ret;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.test.prog_fd = prog_fd;
 	attr.test.data_in = ptr_to_u64(data);
 	attr.test.data_out = ptr_to_u64(data_out);
@@ -539,7 +539,7 @@ int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr)
 	if (!test_attr->data_out && test_attr->data_size_out > 0)
 		return -EINVAL;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.test.prog_fd = test_attr->prog_fd;
 	attr.test.data_in = ptr_to_u64(test_attr->data_in);
 	attr.test.data_out = ptr_to_u64(test_attr->data_out);
@@ -559,7 +559,7 @@ int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id)
 	union bpf_attr attr;
 	int err;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.start_id = start_id;
 
 	err = sys_bpf(BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr));
@@ -574,7 +574,7 @@ int bpf_map_get_next_id(__u32 start_id, __u32 *next_id)
 	union bpf_attr attr;
 	int err;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.start_id = start_id;
 
 	err = sys_bpf(BPF_MAP_GET_NEXT_ID, &attr, sizeof(attr));
@@ -588,7 +588,7 @@ int bpf_prog_get_fd_by_id(__u32 id)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.prog_id = id;
 
 	return sys_bpf(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
@@ -598,7 +598,7 @@ int bpf_map_get_fd_by_id(__u32 id)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.map_id = id;
 
 	return sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr));
@@ -608,7 +608,7 @@ int bpf_btf_get_fd_by_id(__u32 id)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.btf_id = id;
 
 	return sys_bpf(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr));
@@ -619,7 +619,7 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len)
 	union bpf_attr attr;
 	int err;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.info.bpf_fd = prog_fd;
 	attr.info.info_len = *info_len;
 	attr.info.info = ptr_to_u64(info);
@@ -635,7 +635,7 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd)
 {
 	union bpf_attr attr;
 
-	bzero(&attr, sizeof(attr));
+	memset(&attr, 0, sizeof(attr));
 	attr.raw_tracepoint.name = ptr_to_u64(name);
 	attr.raw_tracepoint.prog_fd = prog_fd;
 
diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index 6953fedb88ff..ade1c32fb083 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -4,7 +4,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <strings.h>
 #include <unistd.h>
 #include <errno.h>
 #include <linux/err.h>
@@ -484,7 +483,7 @@ int btf__get_from_id(__u32 id, struct btf **btf)
 		goto exit_free;
 	}
 
-	bzero(ptr, last_size);
+	memset(ptr, 0, last_size);
 	btf_info.btf = ptr_to_u64(ptr);
 	err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len);
 
@@ -498,7 +497,7 @@ int btf__get_from_id(__u32 id, struct btf **btf)
 			goto exit_free;
 		}
 		ptr = temp_ptr;
-		bzero(ptr, last_size);
+		memset(ptr, 0, last_size);
 		btf_info.btf = ptr_to_u64(ptr);
 		err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len);
 	}
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index e3c39edfb9d3..6ef7e6e4cbd3 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -18,7 +18,6 @@
 #include <libgen.h>
 #include <inttypes.h>
 #include <string.h>
-#include <strings.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <errno.h>
@@ -308,7 +307,7 @@ bpf_program__init(void *data, size_t size, char *section_name, int idx,
 		return -EINVAL;
 	}
 
-	bzero(prog, sizeof(*prog));
+	memset(prog, 0, sizeof(*prog));
 
 	prog->section_name = strdup(section_name);
 	if (!prog->section_name) {
@@ -1577,7 +1576,7 @@ bpf_program__load(struct bpf_program *prog,
 		struct bpf_prog_prep_result result;
 		bpf_program_prep_t preprocessor = prog->preprocessor;
 
-		bzero(&result, sizeof(result));
+		memset(&result, 0, sizeof(result));
 		err = preprocessor(prog, i, prog->insns,
 				   prog->insns_cnt, &result);
 		if (err) {
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH v2 bpf-next 2/2] tools: sync uapi/linux/if_link.h header
From: Martin Lau @ 2019-02-13 18:36 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: andrii.nakryiko@gmail.com, netdev@vger.kernel.org, Kernel Team,
	Yonghong Song, Alexei Starovoitov, daniel@iogearbox.net,
	david.laight@aculab.com, acme@kernel.org
In-Reply-To: <20190213182554.2763867-3-andriin@fb.com>

On Wed, Feb 13, 2019 at 10:25:54AM -0800, Andrii Nakryiko wrote:
> Syncing if_link.h that got out of sync.
Acked-by: Martin KaFai Lau <kafai@fb.com>

^ permalink raw reply

* [PATCH net 0/2] net: phy: fix locking issue
From: Heiner Kallweit @ 2019-02-13 19:10 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli, David Miller
  Cc: Russell King - ARM Linux, netdev@vger.kernel.org

Russell pointed out that the locking used in phy_is_started() isn't
needed and misleading. This locking also contributes to a race fixed
with patch 2.

Heiner Kallweit (2):
  net: phy: don't use locking in phy_is_started
  net: phy: fix potential race in the phylib state machine

 drivers/net/phy/phy.c | 13 +++++++------
 include/linux/phy.h   | 15 +--------------
 2 files changed, 8 insertions(+), 20 deletions(-)

-- 
2.20.1


^ permalink raw reply

* [PATCH net 1/2] net: phy: don't use locking in phy_is_started
From: Heiner Kallweit @ 2019-02-13 19:11 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli, David Miller
  Cc: Russell King - ARM Linux, netdev@vger.kernel.org
In-Reply-To: <2a39271d-3b9e-e425-98b4-b2a24074e806@gmail.com>

Russell suggested to remove the locking from phy_is_started() because
the read is atomic anyway and actually the locking may be more
misleading.

Fixes: 2b3e88ea6528 ("net: phy: improve phy state checking")
Suggested-by: Russell King - ARM Linux admin <linux@armlinux.org.uk>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/phy/phy.c | 11 +++++------
 include/linux/phy.h   | 15 +--------------
 2 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index ca5e0c0f018c..602816d70281 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -553,7 +553,7 @@ int phy_start_aneg(struct phy_device *phydev)
 	if (err < 0)
 		goto out_unlock;
 
-	if (__phy_is_started(phydev)) {
+	if (phy_is_started(phydev)) {
 		if (phydev->autoneg == AUTONEG_ENABLE) {
 			err = phy_check_link_status(phydev);
 		} else {
@@ -709,7 +709,7 @@ void phy_stop_machine(struct phy_device *phydev)
 	cancel_delayed_work_sync(&phydev->state_queue);
 
 	mutex_lock(&phydev->lock);
-	if (__phy_is_started(phydev))
+	if (phy_is_started(phydev))
 		phydev->state = PHY_UP;
 	mutex_unlock(&phydev->lock);
 }
@@ -839,15 +839,14 @@ EXPORT_SYMBOL(phy_stop_interrupts);
  */
 void phy_stop(struct phy_device *phydev)
 {
-	mutex_lock(&phydev->lock);
-
-	if (!__phy_is_started(phydev)) {
+	if (!phy_is_started(phydev)) {
 		WARN(1, "called from state %s\n",
 		     phy_state_to_str(phydev->state));
-		mutex_unlock(&phydev->lock);
 		return;
 	}
 
+	mutex_lock(&phydev->lock);
+
 	if (phy_interrupt_is_valid(phydev))
 		phy_disable_interrupts(phydev);
 
diff --git a/include/linux/phy.h b/include/linux/phy.h
index ef20aeea10cc..127fcc9c3778 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -674,26 +674,13 @@ phy_lookup_setting(int speed, int duplex, const unsigned long *mask,
 size_t phy_speeds(unsigned int *speeds, size_t size,
 		  unsigned long *mask);
 
-static inline bool __phy_is_started(struct phy_device *phydev)
-{
-	WARN_ON(!mutex_is_locked(&phydev->lock));
-
-	return phydev->state >= PHY_UP;
-}
-
 /**
  * phy_is_started - Convenience function to check whether PHY is started
  * @phydev: The phy_device struct
  */
 static inline bool phy_is_started(struct phy_device *phydev)
 {
-	bool started;
-
-	mutex_lock(&phydev->lock);
-	started = __phy_is_started(phydev);
-	mutex_unlock(&phydev->lock);
-
-	return started;
+	return phydev->state >= PHY_UP;
 }
 
 void phy_resolve_aneg_linkmode(struct phy_device *phydev);
-- 
2.20.1



^ permalink raw reply related

* [PATCH net 2/2] net: phy: fix potential race in the phylib state machine
From: Heiner Kallweit @ 2019-02-13 19:12 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli, David Miller
  Cc: Russell King - ARM Linux, netdev@vger.kernel.org
In-Reply-To: <2a39271d-3b9e-e425-98b4-b2a24074e806@gmail.com>

Russell reported the following race in the phylib state machine
(quoting from his mail):

if (phy_polling_mode(phydev) && phy_is_started(phydev))
	phy_queue_state_machine(phydev, PHY_STATE_TIME);

state = PHY_UP
thread 0			thread 1
				phy_disconnect()
				+-phy_is_started()
phy_is_started()                |
				`-phy_stop()
				  +-phydev->state = PHY_HALTED
				  `-phy_stop_machine()
				    `-cancel_delayed_work_sync()
phy_queue_state_machine()
`-mod_delayed_work()

At this point, the phydev->state_queue() has been added back onto the
system workqueue despite phy_stop_machine() having been called and
cancel_delayed_work_sync() called on it.

Fix this by protecting the complete operation in thread 0.

Fixes: 2b3e88ea6528 ("net: phy: improve phy state checking")
Reported-by: Russell King - ARM Linux admin <linux@armlinux.org.uk>
Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
---
 drivers/net/phy/phy.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 602816d70281..c5675df5fc6f 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -985,8 +985,10 @@ void phy_state_machine(struct work_struct *work)
 	 * state machine would be pointless and possibly error prone when
 	 * called from phy_disconnect() synchronously.
 	 */
+	mutex_lock(&phydev->lock);
 	if (phy_polling_mode(phydev) && phy_is_started(phydev))
 		phy_queue_state_machine(phydev, PHY_STATE_TIME);
+	mutex_unlock(&phydev->lock);
 }
 
 /**
-- 
2.20.1



^ permalink raw reply related

* Re: Fw: [Bug 202561] BUG: Null pointer dereference in __skb_unlink()
From: Cong Wang @ 2019-02-13 19:30 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Linux Kernel Network Developers, sharathkernel
In-Reply-To: <20190212144547.27dca239@shemminger-XPS-13-9360>

On Tue, Feb 12, 2019 at 6:10 PM Stephen Hemminger
<stephen@networkplumber.org> wrote:
> Original report from sharathkernel@gmail.com:
>
> NULL POINTER DEFERENCE DURING __skb_unlink()
>
> In the function call, __skb_try_recv_from_queue() (net/core/datagram.c),
> sbk_queue_walk() walks through the queue without checking if the next member in the queue has valid next pointer/address. When a socket buffer has to unlink, __skb_unlink() is called.
>
>
>
> Inside __skb_unlink() function, it doesn't verify if skb->next has a valid address. skb->next is assigned and used, without verifying the value inside it.


It should always have a valid ->next pointer as it is in a doubly
linked list, where the last one simply points to the head of the
list. I don't see any problem in the code you quote here.


>
> What could be probable solution, in this scenario? Should we check if skb->next is not NULL, before calling __skb_unlink()?


Do you have a reproducer? Also, your crash report is incomplete,
it doesn't even show a kernel version... Is it 4.20.7? Is it tainted?
Please share the complete dmesg.


Thanks.

^ permalink raw reply

* [PATCH net-next] net: ip6_gre: Give ERSPAN a fill_info link op of its own
From: Petr Machata @ 2019-02-13 19:31 UTC (permalink / raw)
  To: netdev@vger.kernel.org
  Cc: Petr Machata, davem@davemloft.net, kuznet@ms2.inr.ac.ru,
	yoshfuji@linux-ipv6.org, Lorenzo Bianconi

In commit c706863bc890 ("net: ip6_gre: always reports o_key to
userspace"), ip6gre and ip6gretap tunnels started reporting a TUNNEL_KEY
output flag even if one was not configured at the device.

When an okey-less ip6gre or ip6gretap netdevice is created, it initially
encapsulates the packets without okey. But any configuration change
(even a non-change such as setting TOS to an already-configured value)
then causes the okey flag from the reported configuration to be
circulated back to actual configuration. From that point on, the device
encapsulates packets with output key of 0.

The intention was to implement this behavior for ERSPAN devices, not for
all ip6gre devices. The ERSPAN netdevice should really have its own
fill_info callback. Add one.

Fixes: c706863bc890 ("net: ip6_gre: always reports o_key to userspace")
CC: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
Signed-off-by: Petr Machata <petrm@mellanox.com>
---
 net/ipv6/ip6_gre.c | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 65a4f96dc462..0a6087cffe54 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -2094,15 +2094,13 @@ static size_t ip6gre_get_size(const struct net_device *dev)
 		0;
 }
 
-static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+static int __ip6gre_fill_info(struct sk_buff *skb,
+			      const struct net_device *dev,
+			      __be16 base_o_flags)
 {
 	struct ip6_tnl *t = netdev_priv(dev);
 	struct __ip6_tnl_parm *p = &t->parms;
-	__be16 o_flags = p->o_flags;
-
-	if ((p->erspan_ver == 1 || p->erspan_ver == 2) &&
-	    !p->collect_md)
-		o_flags |= TUNNEL_KEY;
+	__be16 o_flags = p->o_flags | base_o_flags;
 
 	if (nla_put_u32(skb, IFLA_GRE_LINK, p->link) ||
 	    nla_put_be16(skb, IFLA_GRE_IFLAGS,
@@ -2155,6 +2153,11 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
 	return -EMSGSIZE;
 }
 
+static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+	return __ip6gre_fill_info(skb, dev, 0);
+}
+
 static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = {
 	[IFLA_GRE_LINK]        = { .type = NLA_U32 },
 	[IFLA_GRE_IFLAGS]      = { .type = NLA_U16 },
@@ -2256,6 +2259,20 @@ static int ip6erspan_changelink(struct net_device *dev, struct nlattr *tb[],
 	return 0;
 }
 
+static int ip6erspan_fill_info(struct sk_buff *skb,
+			       const struct net_device *dev)
+{
+	struct ip6_tnl *t = netdev_priv(dev);
+	struct __ip6_tnl_parm *p = &t->parms;
+	__be16 base_o_flags = 0;
+
+	if ((p->erspan_ver == 1 || p->erspan_ver == 2) &&
+	    !p->collect_md)
+		base_o_flags |= TUNNEL_KEY;
+
+	return __ip6gre_fill_info(skb, dev, base_o_flags);
+}
+
 static struct rtnl_link_ops ip6gre_link_ops __read_mostly = {
 	.kind		= "ip6gre",
 	.maxtype	= IFLA_GRE_MAX,
@@ -2295,7 +2312,7 @@ static struct rtnl_link_ops ip6erspan_tap_ops __read_mostly = {
 	.newlink	= ip6erspan_newlink,
 	.changelink	= ip6erspan_changelink,
 	.get_size	= ip6gre_get_size,
-	.fill_info	= ip6gre_fill_info,
+	.fill_info	= ip6erspan_fill_info,
 	.get_link_net	= ip6_tnl_get_link_net,
 };
 
-- 
2.4.11


^ permalink raw reply related

* [RFC PATCH] net act_vlan: use correct len in skb_pull
From: Zahari Doychev @ 2019-02-13 19:51 UTC (permalink / raw)
  To: netdev, bridge, makita.toshiaki, nikolay, roopa, jhs, jiri,
	xiyou.wangcong
  Cc: johannes, zahari.doychev

The bridge and VLAN code expects that skb->data points to the start of the
VLAN header instead of the next (network) header. Currently after
tcf_vlan_act() on ingress filter skb->data points to the next network
header. In this case the Linux bridge does not forward correctly double
tagged VLAN packets added using tc vlan action as the outer vlan tag from
the skb is inserted at the wrong offset after the vlan tag in the payload.
Making skb->data to point to the VLAN header in tcf_vlan_act() by using
ETH_HLEN in skb_pull_rcsum() fixes the problem.

The following commands were used for testing:

ip link add name br0 type bridge vlan_filtering 1
ip link set dev br0 up

ip link set dev net0 up
ip link set dev net0 master br0

ip link set dev net1 up
ip link set dev net1 master br0

bridge vlan add dev net0 vid 100 master
bridge vlan add dev br0 vid 100 self
bridge vlan add dev net1 vid 100 master

tc qdisc add dev net0 handle ffff: clsact
tc qdisc add dev net1 handle ffff: clsact

tc filter add dev net0 ingress pref 1 protocol all flower \
                  action vlan push id 10 pipe action vlan push id 100

tc filter add dev net0 egress pref 1 protocol 802.1q flower \
                  vlan_id 100 vlan_ethtype 802.1q cvlan_id 10 \
                  action vlan pop pipe action vlan pop

Signed-off-by: Zahari Doychev <zahari.doychev@linux.com>
---
 net/sched/act_vlan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 93fdaf707313..308d7d89f925 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -86,7 +86,7 @@ static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a,
 
 out:
 	if (skb_at_tc_ingress(skb))
-		skb_pull_rcsum(skb, skb->mac_len);
+		skb_pull_rcsum(skb, ETH_HLEN);
 
 	return action;
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH bpf-next v11 0/7] bpf: add BPF_LWT_ENCAP_IP option to bpf_lwt_push_encap
From: Peter Oskolkov @ 2019-02-13 19:53 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Willem de Bruijn, Peter Oskolkov

This patchset implements BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
BPF helper. It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN
and BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
to packets (e.g. IP/GRE, GUE, IPIP).

This is useful when thousands of different short-lived flows should be
encapped, each with different and dynamically determined destination.
Although lwtunnels can be used in some of these scenarios, the ability
to dynamically generate encap headers adds more flexibility, e.g.
when routing depends on the state of the host (reflected in global bpf
maps).

V2 changes: added flowi-based route lookup, IPv6 encapping, and
   encapping on ingress.

V3 changes: incorporated David Ahern's suggestions:
   - added l3mdev check/oif (patch 2)
   - sync bpf.h from include/uapi into tools/include/uapi
   - selftest tweaks

V4 changes: moved route lookup/dst change from bpf_push_ip_encap
   to when BPF_LWT_REROUTE is handled, as suggested by David Ahern.

V5 changes: added a check in lwt_xmit that skb->protocol stays the
   same if the skb is to be passed back to the stack (ret == BPF_OK).
   Again, suggested by David Ahern.

V6 changes: abandoned.

V7 changes: added handling of GSO packets (patch 3 in the patchset added),
   as suggested by BPF maintainers.

V8 changes:
   - fixed build errors when LWT or IPV6 are not enabled;
   - whitelisted TCP GSO instead of blacklisting SCTP and UDP GSO, as
     suggested by Willem de Bruijn;
   - added validation that pushed length cover needed headers when GRE/UDP
     encap is detected, as suggested by Willem de Bruijn;
   - a couple of minor/stylistic tweaks/fixed typos.

V9 changes:
   - fixed a kbuild test robot compiler warning;
   - added ipv6_route_input to ipv6_stub (patch 4 in the patchset
     added), and IPv6 routing functions are now invoked via ipv6_stub,
     as suggested by David Ahern.

V10 changes:
   - removed unnecessary IS_ENABLED and pr_warn_once from patch 5.

V11 changes: fixed a potential dst leak in patch 5, as suggested by
    David Ahern.

Peter Oskolkov (7):
  bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap
  bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
  bpf: handle GSO in bpf_lwt_push_encap
  ipv6_stub: add ipv6_route_input stub/proxy.
  bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c
  bpf: sync <kdir>/include/.../bpf.h with tools/include/.../bpf.h
  selftests: bpf: add test_lwt_ip_encap selftest

 include/net/addrconf.h                        |   1 +
 include/net/lwtunnel.h                        |   2 +
 include/uapi/linux/bpf.h                      |  26 +-
 net/core/filter.c                             |  49 ++-
 net/core/lwt_bpf.c                            | 254 +++++++++++++-
 net/ipv6/addrconf_core.c                      |   6 +
 net/ipv6/af_inet6.c                           |   7 +
 tools/include/uapi/linux/bpf.h                |  26 +-
 tools/testing/selftests/bpf/Makefile          |   3 +-
 .../selftests/bpf/progs/test_lwt_ip_encap.c   |  85 +++++
 .../selftests/bpf/test_lwt_ip_encap.sh        | 311 ++++++++++++++++++
 11 files changed, 758 insertions(+), 12 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/progs/test_lwt_ip_encap.c
 create mode 100755 tools/testing/selftests/bpf/test_lwt_ip_encap.sh

-- 
2.20.1.791.gb4d0f1c61a-goog

^ permalink raw reply

* [PATCH bpf-next v11 1/7] bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap
From: Peter Oskolkov @ 2019-02-13 19:53 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Willem de Bruijn, Peter Oskolkov
In-Reply-To: <20190213195341.184969-1-posk@google.com>

This patch adds all needed plumbing in preparation to allowing
bpf programs to do IP encapping via bpf_lwt_push_encap. Actual
implementation is added in the next patch in the patchset.

Of note:
- bpf_lwt_push_encap can now be called from BPF_PROG_TYPE_LWT_XMIT
  prog types in addition to BPF_PROG_TYPE_LWT_IN;
- if the skb being encapped has GSO set, encapsulation is limited
  to IPIP/IP+GRE/IP+GUE (both IPv4 and IPv6);
- as route lookups are different for ingress vs egress, the single
  external bpf_lwt_push_encap BPF helper is routed internally to
  either bpf_lwt_in_push_encap or bpf_lwt_xmit_push_encap BPF_CALLs,
  depending on prog type.

v8 changes: fixed a typo.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 include/uapi/linux/bpf.h | 26 ++++++++++++++++++++--
 net/core/filter.c        | 48 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 67 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 25c8c0e62ecf..bcdd2474eee7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2016,6 +2016,19 @@ union bpf_attr {
  *			Only works if *skb* contains an IPv6 packet. Insert a
  *			Segment Routing Header (**struct ipv6_sr_hdr**) inside
  *			the IPv6 header.
+ *		**BPF_LWT_ENCAP_IP**
+ *			IP encapsulation (GRE/GUE/IPIP/etc). The outer header
+ *			must be IPv4 or IPv6, followed by zero or more
+ *			additional headers, up to LWT_BPF_MAX_HEADROOM total
+ *			bytes in all prepended headers. Please note that
+ *			if skb_is_gso(skb) is true, no more than two headers
+ *			can be prepended, and the inner header, if present,
+ *			should be either GRE or UDP/GUE.
+ *
+ *		BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of
+ *		type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called
+ *		by bpf programs of types BPF_PROG_TYPE_LWT_IN and
+ *		BPF_PROG_TYPE_LWT_XMIT.
  *
  * 		A call to this helper is susceptible to change the underlaying
  * 		packet buffer. Therefore, at load time, all checks on pointers
@@ -2517,7 +2530,8 @@ enum bpf_hdr_start_off {
 /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */
 enum bpf_lwt_encap_mode {
 	BPF_LWT_ENCAP_SEG6,
-	BPF_LWT_ENCAP_SEG6_INLINE
+	BPF_LWT_ENCAP_SEG6_INLINE,
+	BPF_LWT_ENCAP_IP,
 };
 
 #define __bpf_md_ptr(type, name)	\
@@ -2606,7 +2620,15 @@ enum bpf_ret_code {
 	BPF_DROP = 2,
 	/* 3-6 reserved */
 	BPF_REDIRECT = 7,
-	/* >127 are reserved for prog type specific return codes */
+	/* >127 are reserved for prog type specific return codes.
+	 *
+	 * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and
+	 *    BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been
+	 *    changed and should be routed based on its new L3 header.
+	 *    (This is an L3 redirect, as opposed to L2 redirect
+	 *    represented by BPF_REDIRECT above).
+	 */
+	BPF_LWT_REROUTE = 128,
 };
 
 struct bpf_sock {
diff --git a/net/core/filter.c b/net/core/filter.c
index 353735575204..12c88c21b6b8 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4815,7 +4815,15 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
 }
 #endif /* CONFIG_IPV6_SEG6_BPF */
 
-BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+			     bool ingress)
+{
+	return -EINVAL;  /* Implemented in the next patch. */
+}
+#endif
+
+BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
 	   u32, len)
 {
 	switch (type) {
@@ -4823,14 +4831,41 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
 	case BPF_LWT_ENCAP_SEG6:
 	case BPF_LWT_ENCAP_SEG6_INLINE:
 		return bpf_push_seg6_encap(skb, type, hdr, len);
+#endif
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+	case BPF_LWT_ENCAP_IP:
+		return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
+#endif
+	default:
+		return -EINVAL;
+	}
+}
+
+BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
+	   void *, hdr, u32, len)
+{
+	switch (type) {
+#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
+	case BPF_LWT_ENCAP_IP:
+		return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
 #endif
 	default:
 		return -EINVAL;
 	}
 }
 
-static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
-	.func		= bpf_lwt_push_encap,
+static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
+	.func		= bpf_lwt_in_push_encap,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_MEM,
+	.arg4_type	= ARG_CONST_SIZE
+};
+
+static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
+	.func		= bpf_lwt_xmit_push_encap,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_CTX,
@@ -5417,7 +5452,8 @@ bool bpf_helper_changes_pkt_data(void *func)
 	    func == bpf_lwt_seg6_adjust_srh ||
 	    func == bpf_lwt_seg6_action ||
 #endif
-	    func == bpf_lwt_push_encap)
+	    func == bpf_lwt_in_push_encap ||
+	    func == bpf_lwt_xmit_push_encap)
 		return true;
 
 	return false;
@@ -5815,7 +5851,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
 	switch (func_id) {
 	case BPF_FUNC_lwt_push_encap:
-		return &bpf_lwt_push_encap_proto;
+		return &bpf_lwt_in_push_encap_proto;
 	default:
 		return lwt_out_func_proto(func_id, prog);
 	}
@@ -5851,6 +5887,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_l4_csum_replace_proto;
 	case BPF_FUNC_set_hash_invalid:
 		return &bpf_set_hash_invalid_proto;
+	case BPF_FUNC_lwt_push_encap:
+		return &bpf_lwt_xmit_push_encap_proto;
 	default:
 		return lwt_out_func_proto(func_id, prog);
 	}
-- 
2.20.1.791.gb4d0f1c61a-goog


^ permalink raw reply related

* [PATCH bpf-next v11 2/7] bpf: implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap
From: Peter Oskolkov @ 2019-02-13 19:53 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Willem de Bruijn, Peter Oskolkov
In-Reply-To: <20190213195341.184969-1-posk@google.com>

Implement BPF_LWT_ENCAP_IP mode in bpf_lwt_push_encap BPF helper.
It enables BPF programs (specifically, BPF_PROG_TYPE_LWT_IN and
BPF_PROG_TYPE_LWT_XMIT prog types) to add IP encapsulation headers
to packets (e.g. IP/GRE, GUE, IPIP).

This is useful when thousands of different short-lived flows should be
encapped, each with different and dynamically determined destination.
Although lwtunnels can be used in some of these scenarios, the ability
to dynamically generate encap headers adds more flexibility, e.g.
when routing depends on the state of the host (reflected in global bpf
maps).

v7 changes:
 - added a call skb_clear_hash();
 - removed calls to skb_set_transport_header();
 - refuse to encap GSO-enabled packets.

v8 changes:
 - fix build errors when LWT is not enabled.

Note: the next patch in the patchset with deal with GSO-enabled packets,
which are currently rejected at encapping attempt.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 include/net/lwtunnel.h |  2 ++
 net/core/filter.c      |  3 +-
 net/core/lwt_bpf.c     | 65 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 33fd9ba7e0e5..671113bcb2cc 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -126,6 +126,8 @@ int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
 int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int lwtunnel_input(struct sk_buff *skb);
 int lwtunnel_xmit(struct sk_buff *skb);
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
+			  bool ingress);
 
 static inline void lwtunnel_set_redirect(struct dst_entry *dst)
 {
diff --git a/net/core/filter.c b/net/core/filter.c
index 12c88c21b6b8..a78deb2656e1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -73,6 +73,7 @@
 #include <linux/seg6_local.h>
 #include <net/seg6.h>
 #include <net/seg6_local.h>
+#include <net/lwtunnel.h>
 
 /**
  *	sk_filter_trim_cap - run a packet through a socket filter
@@ -4819,7 +4820,7 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
 static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
 			     bool ingress)
 {
-	return -EINVAL;  /* Implemented in the next patch. */
+	return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
 }
 #endif
 
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index a648568c5e8f..e5a9850d9f48 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -390,6 +390,71 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
 	.owner		= THIS_MODULE,
 };
 
+static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
+{
+	/* Handling of GSO-enabled packets is added in the next patch. */
+	return -EOPNOTSUPP;
+}
+
+int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
+{
+	struct iphdr *iph;
+	bool ipv4;
+	int err;
+
+	if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
+		return -EINVAL;
+
+	/* validate protocol and length */
+	iph = (struct iphdr *)hdr;
+	if (iph->version == 4) {
+		ipv4 = true;
+		if (unlikely(len < iph->ihl * 4))
+			return -EINVAL;
+	} else if (iph->version == 6) {
+		ipv4 = false;
+		if (unlikely(len < sizeof(struct ipv6hdr)))
+			return -EINVAL;
+	} else {
+		return -EINVAL;
+	}
+
+	if (ingress)
+		err = skb_cow_head(skb, len + skb->mac_len);
+	else
+		err = skb_cow_head(skb,
+				   len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
+	if (unlikely(err))
+		return err;
+
+	/* push the encap headers and fix pointers */
+	skb_reset_inner_headers(skb);
+	skb->encapsulation = 1;
+	skb_push(skb, len);
+	if (ingress)
+		skb_postpush_rcsum(skb, iph, len);
+	skb_reset_network_header(skb);
+	memcpy(skb_network_header(skb), hdr, len);
+	bpf_compute_data_pointers(skb);
+	skb_clear_hash(skb);
+
+	if (ipv4) {
+		skb->protocol = htons(ETH_P_IP);
+		iph = ip_hdr(skb);
+
+		if (!iph->check)
+			iph->check = ip_fast_csum((unsigned char *)iph,
+						  iph->ihl);
+	} else {
+		skb->protocol = htons(ETH_P_IPV6);
+	}
+
+	if (skb_is_gso(skb))
+		return handle_gso_encap(skb, ipv4, len);
+
+	return 0;
+}
+
 static int __init bpf_lwt_init(void)
 {
 	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
-- 
2.20.1.791.gb4d0f1c61a-goog


^ permalink raw reply related

* [PATCH bpf-next v11 3/7] bpf: handle GSO in bpf_lwt_push_encap
From: Peter Oskolkov @ 2019-02-13 19:53 UTC (permalink / raw)
  To: Alexei Starovoitov, Daniel Borkmann, netdev
  Cc: Peter Oskolkov, David Ahern, Willem de Bruijn, Peter Oskolkov
In-Reply-To: <20190213195341.184969-1-posk@google.com>

This patch adds handling of GSO packets in bpf_lwt_push_ip_encap()
(called from bpf_lwt_push_encap):

* IPIP, GRE, and UDP encapsulation types are deduced by looking
  into iphdr->protocol or ipv6hdr->next_header;
* SCTP GSO packets are not supported (as bpf_skb_proto_4_to_6
  and similar do);
* UDP_L4 GSO packets are also not supported (although they are
  not blocked in bpf_skb_proto_4_to_6 and similar), as
  skb_decrease_gso_size() will break it;
* SKB_GSO_DODGY bit is set.

Note: it may be possible to support SCTP and UDP_L4 gso packets;
      but as these cases seem to be not well handled by other
      tunneling/encapping code paths, the solution should
      be generic enough to apply to all tunneling/encapping code.

v8 changes:
   - make sure that if GRE or UDP encap is detected, there is
     enough of pushed bytes to cover both IP[v6] + GRE|UDP headers;
   - do not reject double-encapped packets;
   - whitelist TCP GSO packets rather than block SCTP GSO and
     UDP GSO.

Signed-off-by: Peter Oskolkov <posk@google.com>
---
 net/core/lwt_bpf.c | 67 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 2 deletions(-)

diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index e5a9850d9f48..079871fc020f 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/bpf.h>
 #include <net/lwtunnel.h>
+#include <net/gre.h>
 
 struct bpf_lwt_prog {
 	struct bpf_prog *prog;
@@ -390,10 +391,72 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
 	.owner		= THIS_MODULE,
 };
 
+static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
+			   int encap_len)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	gso_type |= SKB_GSO_DODGY;
+	shinfo->gso_type |= gso_type;
+	skb_decrease_gso_size(shinfo, encap_len);
+	shinfo->gso_segs = 0;
+	return 0;
+}
+
 static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
 {
-	/* Handling of GSO-enabled packets is added in the next patch. */
-	return -EOPNOTSUPP;
+	int next_hdr_offset;
+	void *next_hdr;
+	__u8 protocol;
+
+	/* SCTP and UDP_L4 gso need more nuanced handling than what
+	 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
+	 * So at the moment only TCP GSO packets are let through.
+	 */
+	if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
+		return -ENOTSUPP;
+
+	if (ipv4) {
+		protocol = ip_hdr(skb)->protocol;
+		next_hdr_offset = sizeof(struct iphdr);
+		next_hdr = skb_network_header(skb) + next_hdr_offset;
+	} else {
+		protocol = ipv6_hdr(skb)->nexthdr;
+		next_hdr_offset = sizeof(struct ipv6hdr);
+		next_hdr = skb_network_header(skb) + next_hdr_offset;
+	}
+
+	switch (protocol) {
+	case IPPROTO_GRE:
+		next_hdr_offset += sizeof(struct gre_base_hdr);
+		if (next_hdr_offset > encap_len)
+			return -EINVAL;
+
+		if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
+			return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
+					       encap_len);
+		return handle_gso_type(skb, SKB_GSO_GRE, encap_len);
+
+	case IPPROTO_UDP:
+		next_hdr_offset += sizeof(struct udphdr);
+		if (next_hdr_offset > encap_len)
+			return -EINVAL;
+
+		if (((struct udphdr *)next_hdr)->check)
+			return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
+					       encap_len);
+		return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);
+
+	case IPPROTO_IP:
+	case IPPROTO_IPV6:
+		if (ipv4)
+			return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
+		else
+			return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);
+
+	default:
+		return -EPROTONOSUPPORT;
+	}
 }
 
 int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
-- 
2.20.1.791.gb4d0f1c61a-goog


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox