Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH AUTOSEL 5.2 17/94] s390/bpf: use 32-bit index for tail calls
From: Sasha Levin @ 2019-09-04 15:56 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Ilya Leoshkevich, Yauheni Kaliuta, Vasily Gorbik, Daniel Borkmann,
	Sasha Levin, netdev, bpf, linux-s390
In-Reply-To: <20190904155739.2816-1-sashal@kernel.org>

From: Ilya Leoshkevich <iii@linux.ibm.com>

[ Upstream commit 91b4db5313a2c793aabc2143efb8ed0cf0fdd097 ]

"p runtime/jit: pass > 32bit index to tail_call" fails when
bpf_jit_enable=1, because the tail call is not executed.

This in turn is because the generated code assumes index is 64-bit,
while it must be 32-bit, and as a result prog array bounds check fails,
while it should pass. Even if bounds check would have passed, the code
that follows uses 64-bit index to compute prog array offset.

Fix by using clrj instead of clgrj for comparing index with array size,
and also by using llgfr for truncating index to 32 bits before using it
to compute prog array offset.

Fixes: 6651ee070b31 ("s390/bpf: implement bpf_tail_call() helper")
Reported-by: Yauheni Kaliuta <yauheni.kaliuta@redhat.com>
Acked-by: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/s390/net/bpf_jit_comp.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 9a711472cbdc0..fd9844f947f79 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -1027,8 +1027,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		/* llgf %w1,map.max_entries(%b2) */
 		EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_2,
 			      offsetof(struct bpf_array, map.max_entries));
-		/* clgrj %b3,%w1,0xa,label0: if %b3 >= %w1 goto out */
-		EMIT6_PCREL_LABEL(0xec000000, 0x0065, BPF_REG_3,
+		/* clrj %b3,%w1,0xa,label0: if (u32)%b3 >= (u32)%w1 goto out */
+		EMIT6_PCREL_LABEL(0xec000000, 0x0077, BPF_REG_3,
 				  REG_W1, 0, 0xa);
 
 		/*
@@ -1054,8 +1054,10 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		 *         goto out;
 		 */
 
-		/* sllg %r1,%b3,3: %r1 = index * 8 */
-		EMIT6_DISP_LH(0xeb000000, 0x000d, REG_1, BPF_REG_3, REG_0, 3);
+		/* llgfr %r1,%b3: %r1 = (u32) index */
+		EMIT4(0xb9160000, REG_1, BPF_REG_3);
+		/* sllg %r1,%r1,3: %r1 *= 8 */
+		EMIT6_DISP_LH(0xeb000000, 0x000d, REG_1, REG_1, REG_0, 3);
 		/* lg %r1,prog(%b2,%r1) */
 		EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, BPF_REG_2,
 			      REG_1, offsetof(struct bpf_array, ptrs));
-- 
2.20.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.2 10/94] s390/bpf: fix lcgr instruction encoding
From: Sasha Levin @ 2019-09-04 15:56 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Ilya Leoshkevich, Vasily Gorbik, Daniel Borkmann, Sasha Levin,
	netdev, bpf, linux-s390
In-Reply-To: <20190904155739.2816-1-sashal@kernel.org>

From: Ilya Leoshkevich <iii@linux.ibm.com>

[ Upstream commit bb2d267c448f4bc3a3389d97c56391cb779178ae ]

"masking, test in bounds 3" fails on s390, because
BPF_ALU64_IMM(BPF_NEG, BPF_REG_2, 0) ignores the top 32 bits of
BPF_REG_2. The reason is that JIT emits lcgfr instead of lcgr.
The associated comment indicates that the code was intended to
emit lcgr in the first place, it's just that the wrong opcode
was used.

Fix by using the correct opcode.

Fixes: 054623105728 ("s390/bpf: Add s390x eBPF JIT compiler backend")
Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com>
Acked-by: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 arch/s390/net/bpf_jit_comp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c
index 5e7c630331590..9a711472cbdc0 100644
--- a/arch/s390/net/bpf_jit_comp.c
+++ b/arch/s390/net/bpf_jit_comp.c
@@ -853,7 +853,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		break;
 	case BPF_ALU64 | BPF_NEG: /* dst = -dst */
 		/* lcgr %dst,%dst */
-		EMIT4(0xb9130000, dst_reg, dst_reg);
+		EMIT4(0xb9030000, dst_reg, dst_reg);
 		break;
 	/*
 	 * BPF_FROM_BE/LE
-- 
2.20.1


^ permalink raw reply related

* [PATCH AUTOSEL 5.2 01/94] ieee802154: hwsim: Fix error handle path in hwsim_init_module
From: Sasha Levin @ 2019-09-04 15:56 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: YueHaibing, Hulk Robot, Alexander Aring, Stefan Schmidt,
	Sasha Levin, linux-wpan, netdev, bpf

From: YueHaibing <yuehaibing@huawei.com>

[ Upstream commit 1cbbbf39efab05fae67f59e6ed01bb85061c69e2 ]

KASAN reports this:

BUG: unable to handle kernel paging request at fffffbfff834f001
PGD 237fe8067 P4D 237fe8067 PUD 237e64067 PMD 1c968d067 PTE 0
Oops: 0000 [#1] SMP KASAN PTI
CPU: 1 PID: 8871 Comm: syz-executor.0 Tainted: G         C        5.0.0+ #5
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
RIP: 0010:strcmp+0x31/0xa0 lib/string.c:328
Code: 00 00 00 00 fc ff df 55 53 48 83 ec 08 eb 0a 84 db 48 89 ef 74 5a 4c 89 e6 48 89 f8 48 89 fa 48 8d 6f 01 48 c1 e8 03 83 e2 07 <42> 0f b6 04 28 38 d0 7f 04 84 c0 75 50 48 89 f0 48 89 f2 0f b6 5d
RSP: 0018:ffff8881e0c57800 EFLAGS: 00010246
RAX: 1ffffffff834f001 RBX: ffffffffc1a78000 RCX: ffffffff827b9503
RDX: 0000000000000000 RSI: ffffffffc1a40008 RDI: ffffffffc1a78008
RBP: ffffffffc1a78009 R08: fffffbfff6a92195 R09: fffffbfff6a92195
R10: ffff8881e0c578b8 R11: fffffbfff6a92194 R12: ffffffffc1a40008
R13: dffffc0000000000 R14: ffffffffc1a3e470 R15: ffffffffc1a40000
FS:  00007fdcc02ff700(0000) GS:ffff8881f7300000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: fffffbfff834f001 CR3: 00000001b3134003 CR4: 00000000007606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
 genl_family_find_byname+0x7f/0xf0 net/netlink/genetlink.c:104
 genl_register_family+0x1e1/0x1070 net/netlink/genetlink.c:333
 ? 0xffffffffc1978000
 hwsim_init_module+0x6a/0x1000 [mac802154_hwsim]
 ? 0xffffffffc1978000
 ? 0xffffffffc1978000
 ? 0xffffffffc1978000
 do_one_initcall+0xbc/0x47d init/main.c:887
 do_init_module+0x1b5/0x547 kernel/module.c:3456
 load_module+0x6405/0x8c10 kernel/module.c:3804
 __do_sys_finit_module+0x162/0x190 kernel/module.c:3898
 do_syscall_64+0x9f/0x450 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x462e99
Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007fdcc02fec58 EFLAGS: 00000246 ORIG_RAX: 0000000000000139
RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000462e99
RDX: 0000000000000000 RSI: 0000000020000200 RDI: 0000000000000003
RBP: 00007fdcc02fec70 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 00007fdcc02ff6bc
R13: 00000000004bcefa R14: 00000000006f6fb0 R15: 0000000000000004
Modules linked in: mac802154_hwsim(+) mac802154 ieee802154 speakup(C) rc_proteus_2309 rtc_rk808 streebog_generic rds vboxguest madera_spi madera da9052_wdt mISDN_core ueagle_atm usbatm atm ir_imon_decoder scsi_transport_sas rc_dntv_live_dvb_t panel_samsung_s6d16d0 drm drm_panel_orientation_quirks lib80211 fb_agm1264k_fl(C) gspca_pac7302 gspca_main videobuf2_v4l2 soundwire_intel_init i2c_dln2 dln2 usbcore hid_gaff 88pm8607 nfnetlink axp20x_i2c axp20x uio pata_marvell pmbus_core snd_sonicvibes gameport snd_pcm snd_opl3_lib snd_timer snd_hwdep snd_mpu401_uart snd_rawmidi snd_seq_device snd soundcore rtc_ds1511 rtc_ds1742 vsock dwc_xlgmac rtc_rx8010 libphy twofish_x86_64_3way twofish_x86_64 twofish_common ad5696_i2c ad5686 lp8788_charger cxd2880_spi dvb_core videobuf2_common videodev media videobuf2_vmalloc videobuf2_memops fbtft(C) sysimgblt sysfillrect syscopyarea fb_sys_fops janz_ican3 firewire_net firewire_core crc_itu_t spi_slave_system_control i2c_matroxfb i2c_algo_bit
 matroxfb_base fb fbdev matroxfb_DAC1064 matroxfb_accel cfbcopyarea cfbimgblt cfbfillrect matroxfb_Ti3026 matroxfb_g450 g450_pll matroxfb_misc leds_blinkm ti_dac7311 intel_spi_pci intel_spi spi_nor hid_elan hid async_tx rc_cinergy_1400 rc_core intel_ishtp kxcjk_1013 industrialio_triggered_buffer kfifo_buf can_dev intel_th spi_pxa2xx_platform pata_artop vme_ca91cx42 gb_gbphy(C) greybus(C) industrialio mptbase st_drv cmac ttpci_eeprom via_wdt gpio_xra1403 mtd iptable_security iptable_raw iptable_mangle iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_filter bpfilter ip6_vti ip_vti ip_gre ipip sit tunnel4 ip_tunnel hsr veth netdevsim vxcan batman_adv cfg80211 rfkill chnl_net caif nlmon dummy team bonding vcan bridge stp llc ip6_gre gre ip6_tunnel tunnel6 tun joydev mousedev ppdev kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel aesni_intel aes_x86_64 input_leds crypto_simd cryptd glue_helper ide_pci_generic piix psmouse
 ide_core serio_raw ata_generic i2c_piix4 pata_acpi parport_pc parport floppy rtc_cmos intel_agp intel_gtt agpgart sch_fq_codel ip_tables x_tables sha1_ssse3 sha1_generic ipv6 [last unloaded: speakup]
Dumping ftrace buffer:
   (ftrace buffer empty)
CR2: fffffbfff834f001
---[ end trace 5aa772c793e0e971 ]---
RIP: 0010:strcmp+0x31/0xa0 lib/string.c:328
Code: 00 00 00 00 fc ff df 55 53 48 83 ec 08 eb 0a 84 db 48 89 ef 74 5a 4c 89 e6 48 89 f8 48 89 fa 48 8d 6f 01 48 c1 e8 03 83 e2 07 <42> 0f b6 04 28 38 d0 7f 04 84 c0 75 50 48 89 f0 48 89 f2 0f b6 5d
RSP: 0018:ffff8881e0c57800 EFLAGS: 00010246
RAX: 1ffffffff834f001 RBX: ffffffffc1a78000 RCX: ffffffff827b9503
RDX: 0000000000000000 RSI: ffffffffc1a40008 RDI: ffffffffc1a78008
RBP: ffffffffc1a78009 R08: fffffbfff6a92195 R09: fffffbfff6a92195
R10: ffff8881e0c578b8 R11: fffffbfff6a92194 R12: ffffffffc1a40008
R13: dffffc0000000000 R14: ffffffffc1a3e470 R15: ffffffffc1a40000
FS:  00007fdcc02ff700(0000) GS:ffff8881f7300000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: fffffbfff834f001 CR3: 00000001b3134003 CR4: 00000000007606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554

The error handling path in hwsim_init_module runs the cleanup steps in the
wrong order; switch the two cleanup functions to fix the above issue.

Reported-by: Hulk Robot <hulkci@huawei.com>
Fixes: f25da51fdc38 ("ieee802154: hwsim: add replacement for fakelb")
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Acked-by: Alexander Aring <aring@mojatatu.com>
Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/net/ieee802154/mac802154_hwsim.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c
index b41696e16bdc8..94b9e9d775e40 100644
--- a/drivers/net/ieee802154/mac802154_hwsim.c
+++ b/drivers/net/ieee802154/mac802154_hwsim.c
@@ -901,9 +901,9 @@ static __init int hwsim_init_module(void)
 	return 0;
 
 platform_drv:
-	genl_unregister_family(&hwsim_genl_family);
-platform_dev:
 	platform_device_unregister(mac802154hwsim_dev);
+platform_dev:
+	genl_unregister_family(&hwsim_genl_family);
 	return rc;
 }
 
-- 
2.20.1


^ permalink raw reply related

* [PATCH iproute2-next] bpf: fix snprintf truncation warning
From: Andrea Claudi @ 2019-09-04 15:50 UTC (permalink / raw)
  To: netdev; +Cc: stephen, dsahern

gcc v9.2.1 produces the following warning compiling iproute2:

bpf.c: In function ‘bpf_get_work_dir’:
bpf.c:784:49: warning: ‘snprintf’ output may be truncated before the last format character [-Wformat-truncation=]
  784 |  snprintf(bpf_wrk_dir, sizeof(bpf_wrk_dir), "%s/", mnt);
      |                                                 ^
bpf.c:784:2: note: ‘snprintf’ output between 2 and 4097 bytes into a destination of size 4096
  784 |  snprintf(bpf_wrk_dir, sizeof(bpf_wrk_dir), "%s/", mnt);
      |  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Fix it by extending bpf_wrk_dir size by 1 byte for the extra "/" char.

Signed-off-by: Andrea Claudi <aclaudi@redhat.com>
---
 lib/bpf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/bpf.c b/lib/bpf.c
index 7d2a322ffbaec..95de7894a93ce 100644
--- a/lib/bpf.c
+++ b/lib/bpf.c
@@ -742,7 +742,7 @@ static int bpf_gen_hierarchy(const char *base)
 static const char *bpf_get_work_dir(enum bpf_prog_type type)
 {
 	static char bpf_tmp[PATH_MAX] = BPF_DIR_MNT;
-	static char bpf_wrk_dir[PATH_MAX];
+	static char bpf_wrk_dir[PATH_MAX + 1];
 	static const char *mnt;
 	static bool bpf_mnt_cached;
 	const char *mnt_env = getenv(BPF_ENV_MNT);
-- 
2.21.0


^ permalink raw reply related

* Re: Is bug 200755 in anyone's queue??
From: Willem de Bruijn @ 2019-09-04 15:46 UTC (permalink / raw)
  To: Steve Zabele
  Cc: Eric Dumazet, Mark KEATON, Willem de Bruijn, Network Development,
	shum@canndrew.org, vladimir116@gmail.com, saifi.khan@strikr.in,
	Daniel Borkmann, on2k16nm@gmail.com, Stephen Hemminger
In-Reply-To: <F119F197-FD88-4F9B-B064-F23B2E5025A3@comcast.net>

On Wed, Sep 4, 2019 at 10:51 AM Steve Zabele <zabele@comcast.net> wrote:
>
> I think a dual table approach makes a lot of sense here, especially if we look at the different use cases. For the DNS server example, almost certainly there will not be any connected sockets using the server port, so a test of whether the connected table is empty (maybe a boolean stored with the unconnected table?) should get to the existing code very quickly and not require accessing the memory holding the connected table. For our use case, the connected sockets persist for long periods (at network timescales at least) and so any rehashing should be infrequent and so have limited impact on performance overall.
>
> So does a dual table approach seem workable to other folks that know the internals?

Let me take a stab and compare. A dual table does bring it more in
line with how the TCP code is structured.

^ permalink raw reply

* Re: KSZ8863 ethernet PHY support question
From: Andrew Lunn @ 2019-09-04 15:44 UTC (permalink / raw)
  To: HOLTZ Matthieu; +Cc: netdev@vger.kernel.org, matthieu.holtz@gmail.com
In-Reply-To: <deba2802f6914ac3ba3245de0bfd2c1a@hagergroup.com>

On Tue, Sep 03, 2019 at 01:59:18PM +0000, HOLTZ Matthieu wrote:
> Hello,
> 
> I’d like to use a switch phy KSZ8863 with an NXP i.mx8mm MPU (new motherboard dev) and a kernel 4.14.x but I am a bit lost regarding the driver support.
> 
> Is the Phy supported by the driver under linux/drivers/net/phy/micrel.c and what about the switch configuration, is it implemented in the DSA subsystem ?

Hi Matthieu

There was a set of RFC patches for the switch posted a while ago:

https://www.spinics.net/lists/netdev/msg569654.html

You might want to talk to the author and help get them merged.

    Andrew

^ permalink raw reply

* Re: [PATCH v2 bpf-next 2/3] bpf: implement CAP_BPF
From: Alexei Starovoitov @ 2019-09-04 15:21 UTC (permalink / raw)
  To: Daniel Borkmann, nicolas.dichtel@6wind.com, Alexei Starovoitov
  Cc: Alexei Starovoitov, luto@amacapital.net, davem@davemloft.net,
	peterz@infradead.org, rostedt@goodmis.org, netdev@vger.kernel.org,
	bpf@vger.kernel.org, Kernel Team, linux-api@vger.kernel.org
In-Reply-To: <5e36a193-8ad9-77e7-e2ff-429fb521a79c@iogearbox.net>

On 9/4/19 8:16 AM, Daniel Borkmann wrote:
> opening/creating BPF maps" error="Unable to create map 
> /run/cilium/bpffs/tc/globals/cilium_lxc: operation not permitted" 
> subsys=daemon
> 2019-09-04T14:11:47.28178666Z level=fatal msg="Error while creating 
> daemon" error="Unable to create map 
> /run/cilium/bpffs/tc/globals/cilium_lxc: operation not permitted" 
> subsys=daemon

Ok. We have to include caps in both cap_sys_admin and cap_bpf then.

> And /same/ deployment with reverted patches, hence no CAP_BPF gets it up 
> and running again:
> 
> # kubectl get pods --all-namespaces -o wide

Can you share what these magic commands do underneath?

What user do they pick to start under? and what caps are granted?

^ permalink raw reply

* Re: net: hsr: remove a redundant null check before kfree_skb
From: zhong jiang @ 2019-09-04 15:21 UTC (permalink / raw)
  To: Markus Elfring
  Cc: Arvid Brodin, David S. Miller, netdev, linux-kernel,
	kernel-janitors
In-Reply-To: <ce4f53c1-fd91-af5b-7f0a-4746c3ad8de1@web.de>

On 2019/9/4 19:55, Markus Elfring wrote:
>> kfree_skb has taken the null pointer into account.
> I suggest to take another look also at information around
> a similar update suggestion.
>
> net-hsr: Delete unnecessary checks before the function call "kfree_skb"
> https://lkml.org/lkml/2015/11/14/120
> https://lore.kernel.org/patchwork/patch/617878/
> https://lore.kernel.org/r/5647A77E.6040501@users.sourceforge.net/
>
> https://lkml.org/lkml/2015/11/24/433
> https://lore.kernel.org/r/56546951.9080101@alten.se/
Thank you for the explanation. I missed the similar patch before sending mine.

Sincerely,
zhong jiang
> Regards,
> Markus
>
> .
>



^ permalink raw reply

* Re: [PATCH v2 bpf-next 2/3] bpf: implement CAP_BPF
From: Daniel Borkmann @ 2019-09-04 15:16 UTC (permalink / raw)
  To: Alexei Starovoitov, nicolas.dichtel@6wind.com, Alexei Starovoitov
  Cc: Alexei Starovoitov, luto@amacapital.net, davem@davemloft.net,
	peterz@infradead.org, rostedt@goodmis.org, netdev@vger.kernel.org,
	bpf@vger.kernel.org, Kernel Team, linux-api@vger.kernel.org
In-Reply-To: <46df2c36-4276-33c0-626b-c51e77b3a04f@fb.com>

On 9/4/19 3:39 AM, Alexei Starovoitov wrote:
> On 8/30/19 8:19 AM, Nicolas Dichtel wrote:
>> Le 29/08/2019 à 19:30, Alexei Starovoitov a écrit :
>> [snip]
>>> These are the links showing that k8s can delegate caps.
>>> Are you saying that you know of folks who specifically
>>> delegate cap_sys_admin and cap_net_admin _only_ to a container to run bpf in there?
>>>
>> Yes, we need cap_sys_admin only to load bpf:
>> tc filter add dev eth0 ingress matchall action bpf obj ./tc_test_kern.o sec test
>>
>> I'm not sure I understand why cap_net_admin is not enough to run the previous
>> command (i.e. why load is forbidden).
> 
> because bpf syscall prog_load command requires cap_sys_admin in
> the current implementation.
> 
>> I want to avoid sys_admin, thus cap_bpf will be ok. But we need to manage the
>> backward compatibility.
> 
> re: backward compatibility...
> do you know of any case where task is running under userid=nobody
> with cap_sys_admin and cap_net_admin in order to do bpf ?
> 
> If not then what is the concern about compatibility?

Finally managed to find some cycles to pull up a k8s cluster. Looks like it would
break deployments with the patches as-is right away; meaning, any constellation
where BPF is used inside the pod.

With CAP_BPF patches applied on bpf-next:

# kubectl apply -f ./cilium.yaml
[...]
# kubectl get pods --all-namespaces -o wide
NAMESPACE     NAME                               READY   STATUS              RESTARTS   AGE     IP              NODE     NOMINATED NODE   READINESS GATES
kube-system   cilium-cz9qs                       0/1     CrashLoopBackOff    4          2m36s   192.168.1.125   apoc     <none>           <none>
kube-system   cilium-operator-6c7c6c788b-xcm9d   0/1     Pending             0          2m36s   <none>          <none>   <none>           <none>
kube-system   coredns-5c98db65d4-6nhpg           0/1     ContainerCreating   0          4m12s   <none>          apoc     <none>           <none>
kube-system   coredns-5c98db65d4-l5b94           0/1     ContainerCreating   0          4m12s   <none>          apoc     <none>           <none>
kube-system   etcd-apoc                          1/1     Running             0          3m26s   192.168.1.125   apoc     <none>           <none>
kube-system   kube-apiserver-apoc                1/1     Running             0          3m32s   192.168.1.125   apoc     <none>           <none>
kube-system   kube-controller-manager-apoc       1/1     Running             0          3m18s   192.168.1.125   apoc     <none>           <none>
kube-system   kube-proxy-jj9kz                   1/1     Running             0          4m12s   192.168.1.125   apoc     <none>           <none>
kube-system   kube-scheduler-apoc                1/1     Running             0          3m26s   192.168.1.125   apoc     <none>           <none>
# kubectl -n kube-system logs --timestamps cilium-cz9qs
[...]
2019-09-04T14:11:46.399478585Z level=info msg="Cilium 1.6.90 ba0ed147b 2019-09-03T21:20:30+02:00 go version go1.12.8 linux/amd64" subsys=daemon
2019-09-04T14:11:46.410564471Z level=info msg="cilium-envoy  version: b7a919ebdca3d3bbc6aae51357e78e9c603450ae/1.11.1/Modified/RELEASE/BoringSSL" subsys=daemon
2019-09-04T14:11:46.446983926Z level=info msg="clang (7.0.0) and kernel (5.3.0) versions: OK!" subsys=daemon
[...]
2019-09-04T14:11:47.27988188Z level=info msg="Mounting BPF filesystem at /run/cilium/bpffs" subsys=bpf
2019-09-04T14:11:47.279904256Z level=info msg="Detected mounted BPF filesystem at /run/cilium/bpffs" subsys=bpf
2019-09-04T14:11:47.280205098Z level=info msg="Valid label prefix configuration:" subsys=labels-filter
2019-09-04T14:11:47.280214528Z level=info msg=" - :io.kubernetes.pod.namespace" subsys=labels-filter
2019-09-04T14:11:47.28021738Z level=info msg=" - :io.cilium.k8s.namespace.labels" subsys=labels-filter
2019-09-04T14:11:47.280220836Z level=info msg=" - :app.kubernetes.io" subsys=labels-filter
2019-09-04T14:11:47.280223355Z level=info msg=" - !:io.kubernetes" subsys=labels-filter
2019-09-04T14:11:47.280225723Z level=info msg=" - !:kubernetes.io" subsys=labels-filter
2019-09-04T14:11:47.280228095Z level=info msg=" - !:.*beta.kubernetes.io" subsys=labels-filter
2019-09-04T14:11:47.280230409Z level=info msg=" - !:k8s.io" subsys=labels-filter
2019-09-04T14:11:47.280232699Z level=info msg=" - !:pod-template-generation" subsys=labels-filter
2019-09-04T14:11:47.280235569Z level=info msg=" - !:pod-template-hash" subsys=labels-filter
2019-09-04T14:11:47.28023792Z level=info msg=" - !:controller-revision-hash" subsys=labels-filter
2019-09-04T14:11:47.280240253Z level=info msg=" - !:annotation.*" subsys=labels-filter
2019-09-04T14:11:47.280242566Z level=info msg=" - !:etcd_node" subsys=labels-filter
2019-09-04T14:11:47.28026585Z level=info msg="Initializing daemon" subsys=daemon
2019-09-04T14:11:47.281344002Z level=info msg="Detected MTU 1500" subsys=mtu
2019-09-04T14:11:47.281771889Z level=error msg="Error while opening/creating BPF maps" error="Unable to create map /run/cilium/bpffs/tc/globals/cilium_lxc: operation not permitted" subsys=daemon
2019-09-04T14:11:47.28178666Z level=fatal msg="Error while creating daemon" error="Unable to create map /run/cilium/bpffs/tc/globals/cilium_lxc: operation not permitted" subsys=daemon

And /same/ deployment with reverted patches, hence no CAP_BPF gets it up and running again:

# kubectl get pods --all-namespaces -o wide
NAMESPACE     NAME                               READY   STATUS    RESTARTS   AGE   IP              NODE     NOMINATED NODE   READINESS GATES
kube-system   cilium-cz9qs                       1/1     Running   13         50m   192.168.1.125   apoc     <none>           <none>
kube-system   cilium-operator-6c7c6c788b-xcm9d   0/1     Pending   0          50m   <none>          <none>   <none>           <none>
kube-system   coredns-5c98db65d4-6nhpg           1/1     Running   0          52m   10.217.0.91     apoc     <none>           <none>
kube-system   coredns-5c98db65d4-l5b94           1/1     Running   0          52m   10.217.0.225    apoc     <none>           <none>
kube-system   etcd-apoc                          1/1     Running   1          51m   192.168.1.125   apoc     <none>           <none>
kube-system   kube-apiserver-apoc                1/1     Running   1          51m   192.168.1.125   apoc     <none>           <none>
kube-system   kube-controller-manager-apoc       1/1     Running   1          51m   192.168.1.125   apoc     <none>           <none>
kube-system   kube-proxy-jj9kz                   1/1     Running   1          52m   192.168.1.125   apoc     <none>           <none>
kube-system   kube-scheduler-apoc                1/1     Running   1          51m   192.168.1.125   apoc     <none>           <none>

Thanks,
Daniel

^ permalink raw reply

* Re: [PATCH] net/skbuff: silence warnings under memory pressure
From: Qian Cai @ 2019-09-04 15:07 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Sergey Senozhatsky, Michal Hocko, Eric Dumazet, davem, netdev,
	linux-mm, linux-kernel, Petr Mladek, Steven Rostedt
In-Reply-To: <20190904144850.GA8296@tigerII.localdomain>

On Wed, 2019-09-04 at 23:48 +0900, Sergey Senozhatsky wrote:
> On (09/04/19 08:14), Qian Cai wrote:
> > > Plus one more check - waitqueue_active(&log_wait). printk() adds
> > > pending irq_work only if there is a user-space process sleeping on
> > > log_wait and irq_work is not already scheduled. If the syslog is
> > > active or there is no one to wake up then we don't queue irq_work.
> > 
> > Another possibility for this potential livelock is that those printk() from
> > warn_alloc(), dump_stack() and show_mem() increase the time it needs to
> > process
> > build_skb() allocation failures significantly under memory pressure. As the
> > result, ksoftirqd() could be rescheduled during that time via a different
> > CPU
> > (this is a large x86 NUMA system anyway),
> > 
> > [83605.577256][   C31]  run_ksoftirqd+0x1f/0x40
> > [83605.577256][   C31]  smpboot_thread_fn+0x255/0x440
> > [83605.577256][   C31]  kthread+0x1df/0x200
> > [83605.577256][   C31]  ret_from_fork+0x35/0x40
> 
> Hum hum hum...
> 
> So I can, _probably_, think of several patches.
> 
> First, move wake_up_klogd() back to console_unlock().
> 
> Second, move `printk_pending' out of per-CPU region and make it global.
> So we will have just one printk irq_work scheduled across all CPUs;
> currently we have one irq_work per CPU. I think I sent a patch a long
> long time ago, but we never discussed it, as far as I remember.
> 
> > In addition, those printk() will deal with console drivers or even a
> > networking
> > console, so it is probably not unusual that it could call irq_exit()-
> > __do_softirq() at one point and then this livelock.
> 
> Do you use netcon? Because this, theoretically, can open up one more
> vector. netcon allocates skbs from ->write() path. We call con drivers'
> ->write() from printk_safe context, so should netcon skb allocation
> warn we will schedule one more irq_work on that CPU to flush per-CPU
> printk_safe buffer.

No, I don't use netcon. I just thought I'd mention it anyway since there could
be other people using it.

> 
> If this is the case, then we can stop calling console_driver() under
> printk_safe. I sent a patch a while ago, but we agreed to keep the
> things the way they are, for the time being.
> 
> Let me think more.
> 
> 	-ss

^ permalink raw reply

* [v3] iproute2-next: police: support 64bit rate and peakrate in tc utility
From: David Dai @ 2019-09-04 15:06 UTC (permalink / raw)
  To: jhs, xiyou.wangcong, jiri, netdev, linux-kernel; +Cc: zdai, zdai

A high-speed adapter such as the Mellanox CX-5 card can reach up to
100 Gbits per second of bandwidth. Currently htb already supports 64bit rate
in the tc utility. However, police action rate and peakrate are still limited
to a 32bit value (up to 32 Gbits per second). Taking advantage of the 2 new
attributes TCA_POLICE_RATE64 and TCA_POLICE_PEAKRATE64 from kernel,
tc can use them to break the 32bit limit, and still keep the backward
binary compatibility.

Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Signed-off-by: David Dai <zdai@linux.vnet.ibm.com>
---
Changelog:
v1->v2:
 - Change patch submit component from iproute2 to iproute2-next
 - Move 2 attributes TCA_POLICE_RATE64 TCA_POLICE_PEAKRATE64 after
   TCA_POLICE_PAD in pkt_cls.h header to be consistent with kernel's
   pkt_cls.h header.
v2->v3:
  - Use common functions of duparg and invarg in police filter.
---
 include/uapi/linux/pkt_cls.h |    2 +
 tc/m_police.c                |  149 +++++++++++++++++++-----------------------
 tc/tc_core.c                 |   29 ++++++++
 tc/tc_core.h                 |    3 +
 4 files changed, 102 insertions(+), 81 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index b057aee..a6aa466 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -160,6 +160,8 @@ enum {
 	TCA_POLICE_RESULT,
 	TCA_POLICE_TM,
 	TCA_POLICE_PAD,
+	TCA_POLICE_RATE64,
+	TCA_POLICE_PEAKRATE64,
 	__TCA_POLICE_MAX
 #define TCA_POLICE_RESULT TCA_POLICE_RESULT
 };
diff --git a/tc/m_police.c b/tc/m_police.c
index 862a39f..a5bc20c 100644
--- a/tc/m_police.c
+++ b/tc/m_police.c
@@ -49,11 +49,6 @@ static void usage(void)
 	exit(-1);
 }
 
-static void explain1(char *arg)
-{
-	fprintf(stderr, "Illegal \"%s\"\n", arg);
-}
-
 static int act_parse_police(struct action_util *a, int *argc_p, char ***argv_p,
 			    int tca_id, struct nlmsghdr *n)
 {
@@ -71,6 +66,7 @@ static int act_parse_police(struct action_util *a, int *argc_p, char ***argv_p,
 	unsigned int linklayer = LINKLAYER_ETHERNET; /* Assume ethernet */
 	int Rcell_log =  -1, Pcell_log = -1;
 	struct rtattr *tail;
+	__u64 rate64 = 0, prate64 = 0;
 
 	if (a) /* new way of doing things */
 		NEXT_ARG();
@@ -82,73 +78,47 @@ static int act_parse_police(struct action_util *a, int *argc_p, char ***argv_p,
 
 		if (matches(*argv, "index") == 0) {
 			NEXT_ARG();
-			if (get_u32(&p.index, *argv, 10)) {
-				fprintf(stderr, "Illegal \"index\"\n");
-				return -1;
-			}
+			if (get_u32(&p.index, *argv, 10))
+				invarg("index", *argv);
 		} else if (matches(*argv, "burst") == 0 ||
 			strcmp(*argv, "buffer") == 0 ||
 			strcmp(*argv, "maxburst") == 0) {
 			NEXT_ARG();
-			if (buffer) {
-				fprintf(stderr, "Double \"buffer/burst\" spec\n");
-				return -1;
-			}
-			if (get_size_and_cell(&buffer, &Rcell_log, *argv) < 0) {
-				explain1("buffer");
-				return -1;
-			}
+			if (buffer)
+				duparg("buffer/burst", *argv);
+			if (get_size_and_cell(&buffer, &Rcell_log, *argv) < 0)
+				invarg("buffer", *argv);
 		} else if (strcmp(*argv, "mtu") == 0 ||
 			   strcmp(*argv, "minburst") == 0) {
 			NEXT_ARG();
-			if (mtu) {
-				fprintf(stderr, "Double \"mtu/minburst\" spec\n");
-				return -1;
-			}
-			if (get_size_and_cell(&mtu, &Pcell_log, *argv) < 0) {
-				explain1("mtu");
-				return -1;
-			}
+			if (mtu)
+				duparg("mtu/minburst", *argv);
+			if (get_size_and_cell(&mtu, &Pcell_log, *argv) < 0)
+				invarg("mtu", *argv);
 		} else if (strcmp(*argv, "mpu") == 0) {
 			NEXT_ARG();
-			if (mpu) {
-				fprintf(stderr, "Double \"mpu\" spec\n");
-				return -1;
-			}
-			if (get_size(&mpu, *argv)) {
-				explain1("mpu");
-				return -1;
-			}
+			if (mpu)
+				duparg("mpu", *argv);
+			if (get_size(&mpu, *argv))
+				invarg("mpu", *argv);
 		} else if (strcmp(*argv, "rate") == 0) {
 			NEXT_ARG();
-			if (p.rate.rate) {
-				fprintf(stderr, "Double \"rate\" spec\n");
-				return -1;
-			}
-			if (get_rate(&p.rate.rate, *argv)) {
-				explain1("rate");
-				return -1;
-			}
+			if (rate64)
+				duparg("rate", *argv);
+			if (get_rate64(&rate64, *argv))
+				invarg("rate", *argv);
 		} else if (strcmp(*argv, "avrate") == 0) {
 			NEXT_ARG();
-			if (avrate) {
-				fprintf(stderr, "Double \"avrate\" spec\n");
-				return -1;
-			}
-			if (get_rate(&avrate, *argv)) {
-				explain1("avrate");
-				return -1;
-			}
+			if (avrate)
+				duparg("avrate", *argv);
+			if (get_rate(&avrate, *argv))
+				invarg("avrate", *argv);
 		} else if (matches(*argv, "peakrate") == 0) {
 			NEXT_ARG();
-			if (p.peakrate.rate) {
-				fprintf(stderr, "Double \"peakrate\" spec\n");
-				return -1;
-			}
-			if (get_rate(&p.peakrate.rate, *argv)) {
-				explain1("peakrate");
-				return -1;
-			}
+			if (prate64)
+				duparg("peakrate", *argv);
+			if (get_rate64(&prate64, *argv))
+				invarg("peakrate", *argv);
 		} else if (matches(*argv, "reclassify") == 0 ||
 			   matches(*argv, "drop") == 0 ||
 			   matches(*argv, "shot") == 0 ||
@@ -168,14 +138,12 @@ static int act_parse_police(struct action_util *a, int *argc_p, char ***argv_p,
 			return -1;
 		} else if (matches(*argv, "overhead") == 0) {
 			NEXT_ARG();
-			if (get_u16(&overhead, *argv, 10)) {
-				explain1("overhead"); return -1;
-			}
+			if (get_u16(&overhead, *argv, 10))
+				invarg("overhead", *argv);
 		} else if (matches(*argv, "linklayer") == 0) {
 			NEXT_ARG();
-			if (get_linklayer(&linklayer, *argv)) {
-				explain1("linklayer"); return -1;
-			}
+			if (get_linklayer(&linklayer, *argv))
+				invarg("linklayer", *argv);
 		} else if (strcmp(*argv, "help") == 0) {
 			usage();
 		} else {
@@ -189,23 +157,23 @@ action_ctrl_ok:
 	if (!ok)
 		return -1;
 
-	if (p.rate.rate && avrate)
+	if (rate64 && avrate)
 		return -1;
 
 	/* Must at least do late binding, use TB or ewma policing */
-	if (!p.rate.rate && !avrate && !p.index) {
+	if (!rate64 && !avrate && !p.index) {
 		fprintf(stderr, "\"rate\" or \"avrate\" MUST be specified.\n");
 		return -1;
 	}
 
 	/* When the TB policer is used, burst is required */
-	if (p.rate.rate && !buffer && !avrate) {
+	if (rate64 && !buffer && !avrate) {
 		fprintf(stderr, "\"burst\" requires \"rate\".\n");
 		return -1;
 	}
 
-	if (p.peakrate.rate) {
-		if (!p.rate.rate) {
+	if (prate64) {
+		if (!rate64) {
 			fprintf(stderr, "\"peakrate\" requires \"rate\".\n");
 			return -1;
 		}
@@ -215,22 +183,24 @@ action_ctrl_ok:
 		}
 	}
 
-	if (p.rate.rate) {
+	if (rate64) {
+		p.rate.rate = (rate64 >= (1ULL << 32)) ? ~0U : rate64;
 		p.rate.mpu = mpu;
 		p.rate.overhead = overhead;
-		if (tc_calc_rtable(&p.rate, rtab, Rcell_log, mtu,
-				   linklayer) < 0) {
+		if (tc_calc_rtable_64(&p.rate, rtab, Rcell_log, mtu,
+				   linklayer, rate64) < 0) {
 			fprintf(stderr, "POLICE: failed to calculate rate table.\n");
 			return -1;
 		}
-		p.burst = tc_calc_xmittime(p.rate.rate, buffer);
+		p.burst = tc_calc_xmittime(rate64, buffer);
 	}
 	p.mtu = mtu;
-	if (p.peakrate.rate) {
+	if (prate64) {
+		p.peakrate.rate = (prate64 >= (1ULL << 32)) ? ~0U : prate64;
 		p.peakrate.mpu = mpu;
 		p.peakrate.overhead = overhead;
-		if (tc_calc_rtable(&p.peakrate, ptab, Pcell_log, mtu,
-				   linklayer) < 0) {
+		if (tc_calc_rtable_64(&p.peakrate, ptab, Pcell_log, mtu,
+				   linklayer, prate64) < 0) {
 			fprintf(stderr, "POLICE: failed to calculate peak rate table.\n");
 			return -1;
 		}
@@ -238,10 +208,16 @@ action_ctrl_ok:
 
 	tail = addattr_nest(n, MAX_MSG, tca_id);
 	addattr_l(n, MAX_MSG, TCA_POLICE_TBF, &p, sizeof(p));
-	if (p.rate.rate)
+	if (rate64) {
 		addattr_l(n, MAX_MSG, TCA_POLICE_RATE, rtab, 1024);
-	if (p.peakrate.rate)
+		if (rate64 >= (1ULL << 32))
+			addattr64(n, MAX_MSG, TCA_POLICE_RATE64, rate64);
+	}
+	if (prate64) {
 		addattr_l(n, MAX_MSG, TCA_POLICE_PEAKRATE, ptab, 1024);
+		if (prate64 >= (1ULL << 32))
+			addattr64(n, MAX_MSG, TCA_POLICE_PEAKRATE64, prate64);
+	}
 	if (avrate)
 		addattr32(n, MAX_MSG, TCA_POLICE_AVRATE, avrate);
 	if (presult)
@@ -268,6 +244,7 @@ static int print_police(struct action_util *a, FILE *f, struct rtattr *arg)
 	struct rtattr *tb[TCA_POLICE_MAX+1];
 	unsigned int buffer;
 	unsigned int linklayer;
+	__u64 rate64, prate64;
 
 	if (arg == NULL)
 		return 0;
@@ -286,16 +263,26 @@ static int print_police(struct action_util *a, FILE *f, struct rtattr *arg)
 #endif
 	p = RTA_DATA(tb[TCA_POLICE_TBF]);
 
+	rate64 = p->rate.rate;
+	if (tb[TCA_POLICE_RATE64] &&
+	    RTA_PAYLOAD(tb[TCA_POLICE_RATE64]) >= sizeof(rate64))
+		rate64 = rta_getattr_u64(tb[TCA_POLICE_RATE64]);
+
 	fprintf(f, " police 0x%x ", p->index);
-	fprintf(f, "rate %s ", sprint_rate(p->rate.rate, b1));
-	buffer = tc_calc_xmitsize(p->rate.rate, p->burst);
+	fprintf(f, "rate %s ", sprint_rate(rate64, b1));
+	buffer = tc_calc_xmitsize(rate64, p->burst);
 	fprintf(f, "burst %s ", sprint_size(buffer, b1));
 	fprintf(f, "mtu %s ", sprint_size(p->mtu, b1));
 	if (show_raw)
 		fprintf(f, "[%08x] ", p->burst);
 
-	if (p->peakrate.rate)
-		fprintf(f, "peakrate %s ", sprint_rate(p->peakrate.rate, b1));
+	prate64 = p->peakrate.rate;
+	if (tb[TCA_POLICE_PEAKRATE64] &&
+	    RTA_PAYLOAD(tb[TCA_POLICE_PEAKRATE64]) >= sizeof(prate64))
+		prate64 = rta_getattr_u64(tb[TCA_POLICE_PEAKRATE64]);
+
+	if (prate64)
+		fprintf(f, "peakrate %s ", sprint_rate(prate64, b1));
 
 	if (tb[TCA_POLICE_AVRATE])
 		fprintf(f, "avrate %s ",
diff --git a/tc/tc_core.c b/tc/tc_core.c
index 8eb1122..498d35d 100644
--- a/tc/tc_core.c
+++ b/tc/tc_core.c
@@ -152,6 +152,35 @@ int tc_calc_rtable(struct tc_ratespec *r, __u32 *rtab,
 	return cell_log;
 }
 
+int tc_calc_rtable_64(struct tc_ratespec *r, __u32 *rtab,
+		   int cell_log, unsigned int mtu,
+		   enum link_layer linklayer, __u64 rate)
+{
+	int i;
+	unsigned int sz;
+	__u64 bps = rate;
+	unsigned int mpu = r->mpu;
+
+	if (mtu == 0)
+		mtu = 2047;
+
+	if (cell_log < 0) {
+		cell_log = 0;
+		while ((mtu >> cell_log) > 255)
+			cell_log++;
+	}
+
+	for (i = 0; i < 256; i++) {
+		sz = tc_adjust_size((i + 1) << cell_log, mpu, linklayer);
+		rtab[i] = tc_calc_xmittime(bps, sz);
+	}
+
+	r->cell_align =  -1;
+	r->cell_log = cell_log;
+	r->linklayer = (linklayer & TC_LINKLAYER_MASK);
+	return cell_log;
+}
+
 /*
    stab[pkt_len>>cell_log] = pkt_xmit_size>>size_log
  */
diff --git a/tc/tc_core.h b/tc/tc_core.h
index bd4a99f..6dab272 100644
--- a/tc/tc_core.h
+++ b/tc/tc_core.h
@@ -21,6 +21,9 @@ unsigned tc_calc_xmittime(__u64 rate, unsigned size);
 unsigned tc_calc_xmitsize(__u64 rate, unsigned ticks);
 int tc_calc_rtable(struct tc_ratespec *r, __u32 *rtab,
 		   int cell_log, unsigned mtu, enum link_layer link_layer);
+int tc_calc_rtable_64(struct tc_ratespec *r, __u32 *rtab,
+			int cell_log, unsigned mtu, enum link_layer link_layer,
+			__u64 rate);
 int tc_calc_size_table(struct tc_sizespec *s, __u16 **stab);
 
 int tc_setup_estimator(unsigned A, unsigned time_const, struct tc_estimator *est);
-- 
1.7.1


^ permalink raw reply related

* [PATCH iproute2] nexthop: Add space after blackhole
From: David Ahern @ 2019-09-04 15:09 UTC (permalink / raw)
  To: stephen; +Cc: netdev, David Ahern

From: David Ahern <dsahern@gmail.com>

A space is missing after 'blackhole'; add one to properly separate it
from the protocol when it is given.

Signed-off-by: David Ahern <dsahern@gmail.com>
---
 ip/ipnexthop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ip/ipnexthop.c b/ip/ipnexthop.c
index f35aab52b775..8356aca296de 100644
--- a/ip/ipnexthop.c
+++ b/ip/ipnexthop.c
@@ -242,7 +242,7 @@ int print_nexthop(struct nlmsghdr *n, void *arg)
 	}
 
 	if (tb[NHA_BLACKHOLE])
-		print_null(PRINT_ANY, "blackhole", "blackhole", NULL);
+		print_null(PRINT_ANY, "blackhole", "blackhole ", NULL);
 
 	if (nhm->nh_protocol != RTPROT_UNSPEC || show_details > 0) {
 		print_string(PRINT_ANY, "protocol", "proto %s ",
-- 
2.11.0


^ permalink raw reply related

* [v3] net_sched: act_police: add 2 new attributes to support police 64bit rate and peakrate
From: David Dai @ 2019-09-04 15:03 UTC (permalink / raw)
  To: jhs, xiyou.wangcong, jiri, davem, netdev, linux-kernel; +Cc: zdai, zdai

A high speed adapter like the Mellanox CX-5 card can reach up to
100 Gbits per second bandwidth. Currently htb already supports 64bit rate
in tc utility. However police action rate and peakrate are still limited
to 32bit value (up to 32 Gbits per second). Add 2 new attributes
TCA_POLICE_RATE64 and TCA_POLICE_PEAKRATE64 in kernel for 64bit support
so that tc utility can use them for 64bit rate and peakrate value to
break the 32bit limit, and still keep the backward binary compatibility.

Tested-by: David Dai <zdai@linux.vnet.ibm.com>
Signed-off-by: David Dai <zdai@linux.vnet.ibm.com>
---
Changelog:
v1->v2:
 - Move 2 attributes TCA_POLICE_RATE64 TCA_POLICE_PEAKRATE64 after
   TCA_POLICE_PAD in pkt_cls.h header.
v2->v3:
 - Use TCA_POLICE_PAD instead of __TCA_POLICE_MAX as padding attr
   in last parameter in nla_put_u64_64bit() routine.
---
 include/uapi/linux/pkt_cls.h |    2 ++
 net/sched/act_police.c       |   27 +++++++++++++++++++++++----
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index b057aee..a6aa466 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -160,6 +160,8 @@ enum {
 	TCA_POLICE_RESULT,
 	TCA_POLICE_TM,
 	TCA_POLICE_PAD,
+	TCA_POLICE_RATE64,
+	TCA_POLICE_PEAKRATE64,
 	__TCA_POLICE_MAX
 #define TCA_POLICE_RESULT TCA_POLICE_RESULT
 };
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 49cec3e..425f2a3 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -40,6 +40,8 @@ static int tcf_police_walker(struct net *net, struct sk_buff *skb,
 	[TCA_POLICE_PEAKRATE]	= { .len = TC_RTAB_SIZE },
 	[TCA_POLICE_AVRATE]	= { .type = NLA_U32 },
 	[TCA_POLICE_RESULT]	= { .type = NLA_U32 },
+	[TCA_POLICE_RATE64]     = { .type = NLA_U64 },
+	[TCA_POLICE_PEAKRATE64] = { .type = NLA_U64 },
 };
 
 static int tcf_police_init(struct net *net, struct nlattr *nla,
@@ -58,6 +60,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 	struct tcf_police_params *new;
 	bool exists = false;
 	u32 index;
+	u64 rate64, prate64;
 
 	if (nla == NULL)
 		return -EINVAL;
@@ -155,14 +158,18 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 	}
 	if (R_tab) {
 		new->rate_present = true;
-		psched_ratecfg_precompute(&new->rate, &R_tab->rate, 0);
+		rate64 = tb[TCA_POLICE_RATE64] ?
+			 nla_get_u64(tb[TCA_POLICE_RATE64]) : 0;
+		psched_ratecfg_precompute(&new->rate, &R_tab->rate, rate64);
 		qdisc_put_rtab(R_tab);
 	} else {
 		new->rate_present = false;
 	}
 	if (P_tab) {
 		new->peak_present = true;
-		psched_ratecfg_precompute(&new->peak, &P_tab->rate, 0);
+		prate64 = tb[TCA_POLICE_PEAKRATE64] ?
+			  nla_get_u64(tb[TCA_POLICE_PEAKRATE64]) : 0;
+		psched_ratecfg_precompute(&new->peak, &P_tab->rate, prate64);
 		qdisc_put_rtab(P_tab);
 	} else {
 		new->peak_present = false;
@@ -313,10 +320,22 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
 				      lockdep_is_held(&police->tcf_lock));
 	opt.mtu = p->tcfp_mtu;
 	opt.burst = PSCHED_NS2TICKS(p->tcfp_burst);
-	if (p->rate_present)
+	if (p->rate_present) {
 		psched_ratecfg_getrate(&opt.rate, &p->rate);
-	if (p->peak_present)
+		if ((police->params->rate.rate_bytes_ps >= (1ULL << 32)) &&
+		    nla_put_u64_64bit(skb, TCA_POLICE_RATE64,
+				      police->params->rate.rate_bytes_ps,
+				      TCA_POLICE_PAD))
+			goto nla_put_failure;
+	}
+	if (p->peak_present) {
 		psched_ratecfg_getrate(&opt.peakrate, &p->peak);
+		if ((police->params->peak.rate_bytes_ps >= (1ULL << 32)) &&
+		    nla_put_u64_64bit(skb, TCA_POLICE_PEAKRATE64,
+				      police->params->peak.rate_bytes_ps,
+				      TCA_POLICE_PAD))
+			goto nla_put_failure;
+	}
 	if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
 		goto nla_put_failure;
 	if (p->tcfp_result &&
-- 
1.7.1


^ permalink raw reply related

* Re: [PATCH v3 net] net: Properly update v4 routes with v6 nexthop
From: David Ahern @ 2019-09-04 15:02 UTC (permalink / raw)
  To: Donald Sharp, netdev, dsahern, sworley
In-Reply-To: <20190904141158.17021-1-sharpd@cumulusnetworks.com>

On 9/4/19 8:11 AM, Donald Sharp wrote:
> When creating a v4 route that uses a v6 nexthop from a nexthop group.
> Allow the kernel to properly send the nexthop as v6 via the RTA_VIA
> attribute.
> 

...

> 
> Fixes: dcb1ecb50edf (“ipv4: Prepare for fib6_nh from a nexthop object”)
> Signed-off-by: Donald Sharp <sharpd@cumulusnetworks.com>
> ---
>  include/net/ip_fib.h     |  4 ++--
>  include/net/nexthop.h    |  5 +++--
>  net/ipv4/fib_semantics.c | 15 ++++++++-------
>  net/ipv6/route.c         | 11 ++++++-----
>  4 files changed, 19 insertions(+), 16 deletions(-)
> 

Reviewed-by: David Ahern <dsahern@gmail.com>



^ permalink raw reply

* Re: [PATCH 2/2] ip nexthop: Allow flush|list operations to specify a specific protocol
From: David Ahern @ 2019-09-04 14:59 UTC (permalink / raw)
  To: Donald Sharp, netdev
In-Reply-To: <20190810001843.32068-3-sharpd@cumulusnetworks.com>

On 8/9/19 6:18 PM, Donald Sharp wrote:
> In the case where we have a large number of nexthops from a specific
> protocol, allow the flush and list operations to take a protocol
> to limit the commands scopes.
> 
> Signed-off-by: Donald Sharp <sharpd@cumulusnetworks.com>
> ---
>  ip/ipnexthop.c | 16 +++++++++++++++-
>  1 file changed, 15 insertions(+), 1 deletion(-)
> 

applied to iproute2-next. Thanks for the test cases.

^ permalink raw reply

* Re: [PATCH v2 net-next] net: stmmac: Add support for MDIO interrupts
From: Andrew Lunn @ 2019-09-04 14:58 UTC (permalink / raw)
  To: Voon Weifeng
  Cc: David S. Miller, Maxime Coquelin, netdev, linux-kernel,
	Jose Abreu, Giuseppe Cavallaro, Alexandre Torgue, Ong Boon Leong
In-Reply-To: <1567605774-5500-1-git-send-email-weifeng.voon@intel.com>

On Wed, Sep 04, 2019 at 10:02:54PM +0800, Voon Weifeng wrote:
> From: "Chuah, Kim Tatt" <kim.tatt.chuah@intel.com>
> 
> DW EQoS v5.xx controllers added capability for interrupt generation
> when MDIO interface is done (GMII Busy bit is cleared).
> This patch adds support for this interrupt on supported HW to avoid
> polling on GMII Busy bit.
> 
> stmmac_mdio_read() & stmmac_mdio_write() will sleep until wake_up() is
> called by the interrupt handler.
> 
> Reviewed-by: Voon Weifeng <weifeng.voon@intel.com>
> Reviewed-by: Kweh, Hock Leong <hock.leong.kweh@intel.com>
> Reviewed-by: Ong Boon Leong <boon.leong.ong@intel.com>
> Signed-off-by: Chuah, Kim Tatt <kim.tatt.chuah@intel.com>
> Signed-off-by: Ong Boon Leong <boon.leong.ong@intel.com>
> Signed-off-by: Voon Weifeng <weifeng.voon@intel.com>

Hi Voon

It is normal to include a short description of what you changed
between the previous version and this version.

The formatting of this patch also looks a bit odd. Did you use 
git format-patch ; git send-email?

Thanks
	Andrew

^ permalink raw reply

* Re: Is bug 200755 in anyone's queue??
From: Steve Zabele @ 2019-09-04 14:51 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mark KEATON, Willem de Bruijn, Network Development,
	shum@canndrew.org, vladimir116@gmail.com, saifi.khan@strikr.in,
	Daniel Borkmann, on2k16nm@gmail.com, Stephen Hemminger
In-Reply-To: <c3b83305-82a5-f3c8-2602-1aed2e9b51ca@gmail.com>

I think a dual table approach makes a lot of sense here, especially if we look at the different use cases. For the DNS server example, almost certainly there will not be any connected sockets using the server port, so a test of whether the connected table is empty (maybe a boolean stored with the unconnected table?) should get to the existing code very quickly and not require accessing the memory holding the connected table. For our use case, the connected sockets persist for long periods (at network timescales at least) and so any rehashing should be infrequent and so have limited impact on performance overall.

So does a dual table approach seem workable to other folks that know the internals?

Thanks!

Steve

Sent from my iPhone

> On Sep 4, 2019, at 8:23 AM, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> 
> 
>> On 9/4/19 2:00 PM, Mark KEATON wrote:
>> Hi Willem,
>> 
>> I am the person who commented on the original bug report in bugzilla.
>> 
>> In communicating with Steve just now about possible solutions that maintain the efficiency that you are after, what would you think of the following:  keep two lists of UDP sockets, those connected and those not connected, and always search the connected list first.
> 
> This was my suggestion.
> 
> Note that this requires adding yet another hash table, and yet another lookup
> (another cache line miss per incoming packet)
> 
> This lookup will slow down DNS and QUIC servers, or any application solely using not connected sockets.
> 
> 
> The word 'quick' you use is slightly misleading, since a change like that is a trade off.
> Some applications might become faster, while others become slower.
> 
> Another issue is that a connect() can follow a bind(), we would need to rehash sockets
> from one table to another. (Or add another set of anchors in UDP sockets, so that sockets can be in all the hash tables)
> 
> 
> If the connected list is empty, then the lookup can quickly use the not connected list to find a socket for load balancing.  If there are connected sockets, then only those connected sockets are searched first for an exact match.
>> 
>> Another option might be to do it with a single list if the connected sockets are all at the beginning of the list.  This would require the two separate lookups to start at different points in the list.
>> 
>> Thoughts?
>> 
>> Thanks!
>> Mark
>> 
>> 
>>> On Sep 4, 2019, at 6:28 AM, Steve Zabele <zabele@comcast.net> wrote:
>>> 
>>> Hi Willem,
>>> 
>>> Thanks for continuing to poke at this, much appreciated!
>>> 
>>>> As for the BPF program: good point on accessing the udp port when
>>>> skb->data is already beyond the header.
>>> 
>>>> Programs of type sk_filter can use bpf_skb_load_bytes(_relative).
>>>> Which I think will work, but have not tested.
>>> 
>>> Please note that the test code was intentionally set up to make testing as simple as possible. Hence the source addresses for the multiple UDP sessions were identical -- but that is not the general case. In the general case a connected and bound socket should be associated with exactly one five tuple (source and dest addresses, source and destination ports, and protocol).
>>> 
>>> So a 'connect bpf' would actually need access to the IP addresses as well, not just the ports. To do this, the load bytes call required negative arguments, which failed miserably when we tried it.
>>> 
>>> In any event, there remains the issue of figuring out which index to return when a match is detected since the index is not the same as the file descriptor value and in fact can change as file descriptors are added and deleted. If I understand the kernel mechanism correctly, the operation is something like this: when you add the first one, it's assigned to the first slot; when you add the second, it's assigned to the second slot; when you delete the first one, the second is moved to the first slot. So tracking this requires figuring out the order stored in the socket array within the kernel, and updating the bpf whenever something changes. I don't know if it's even possible to query which slot a given socket occupies.
>>> 
>>> So we think handling this with a bpf is really not viable.
>>> 
>>> One thing worth mentioning is that the connect mechanism here is meant to (at least used to) work the same as connect does with TCP. Bind sets the expected/required local address and port; connect sets the expected/required remote address and port -- so a socket file descriptor becomes associated with exactly one five-tuple. That's how it's worked for several decades anyway.
>>> 
>>> Thanks again!!!
>>> 
>>> Steve
>>> 
>>> -----Original Message-----
>>> From: Willem de Bruijn [mailto:willemdebruijn.kernel@gmail.com] 
>>> Sent: Tuesday, September 03, 2019 1:56 PM
>>> Cc: Eric Dumazet; Steve Zabele; Network Development; shum@canndrew.org; vladimir116@gmail.com; saifi.khan@strikr.in; Daniel Borkmann; on2k16nm@gmail.com; Stephen Hemminger
>>> Subject: Re: Is bug 200755 in anyone's queue??
>>> 
>>> On Fri, Aug 30, 2019 at 4:30 PM Willem de Bruijn
>>> <willemdebruijn.kernel@gmail.com> wrote:
>>>> 
>>>>> On Fri, Aug 30, 2019 at 4:54 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>>>>> 
>>>>> 
>>>>> 
>>>>>> On 8/29/19 9:26 PM, Willem de Bruijn wrote:
>>>>>> 
>>>>>> SO_REUSEPORT was not intended to be used in this way. Opening
>>>>>> multiple connected sockets with the same local port.
>>>>>> 
>>>>>> But since the interface allowed connect after joining a group, and
>>>>>> that is being used, I guess that point is moot. Still, I'm a bit
>>>>>> surprised that it ever worked as described.
>>>>>> 
>>>>>> Also note that the default distribution algorithm is not round robin
>>>>>> assignment, but hash based. So multiple consecutive datagrams arriving
>>>>>> at the same socket is not unexpected.
>>>>>> 
>>>>>> I suspect that this quick hack might "work". It seemed to on the
>>>>>> supplied .c file:
>>>>>> 
>>>>>>                 score = compute_score(sk, net, saddr, sport,
>>>>>>                                       daddr, hnum, dif, sdif);
>>>>>>                 if (score > badness) {
>>>>>> -                       if (sk->sk_reuseport) {
>>>>>> +                       if (sk->sk_reuseport && !sk->sk_state !=
>>>>>> TCP_ESTABLISHED) {
>>>> 
>>>> This won't work for a mix of connected and connectionless sockets, of
>>>> course (even ignoring the typo), as it only skips reuseport on the
>>>> connected sockets.
>>>> 
>>>>>> 
>>>>>> But a more robust approach, that also works on existing kernels, is to
>>>>>> swap the default distribution algorithm with a custom BPF based one (
>>>>>> SO_ATTACH_REUSEPORT_EBPF).
>>>>>> 
>>>>> 
>>>>> Yes, I suspect that reuseport could still be used to load-balance incoming packets
>>>>> targeting the same 4-tuple.
>>>>> 
>>>>> So all sockets would have the same score, and we would select the first socket in
>>>>> the list (if not applying reuseport hashing)
>>>> 
>>>> Can you elaborate a bit?
>>>> 
>>>> One option I see is to record in struct sock_reuseport if any port in
>>>> the group is connected and, if so, don't return immediately on the
>>>> first reuseport_select_sock hit, but continue the search for a higher
>>>> scoring connected socket.
>>>> 
>>>> Or do return immediately, but do this refined search in
>>>> reuseport_select_sock itself, as it has a reference to all sockets in the
>>>> group in sock_reuseport->socks[]. Instead of the straightforward hash.
>>> 
>>> That won't work, as reuseport_select_sock does not have access to
>>> protocol specific data, notably inet_dport.
>>> 
>>> Unfortunately, what I've come up with so far is not concise and slows
>>> down existing reuseport lookup in a busy port table slot. Note that it
>>> is needed for both ipv4 and ipv6.
>>> 
>>> Do not break out of the port table slot early, but continue to search
>>> for a higher scored match even after matching a reuseport:
>>> 
>>> "
>>>  @@ -413,28 +413,39 @@ static struct sock *udp4_lib_lookup2(struct net *net,
>>>                                    struct udp_hslot *hslot2,
>>>                                    struct sk_buff *skb)
>>> {
>>> +       struct sock *reuseport_result = NULL;
>>>       struct sock *sk, *result;
>>> +       int reuseport_score = 0;
>>>       int score, badness;
>>>       u32 hash = 0;
>>> 
>>>       result = NULL;
>>>       badness = 0;
>>>       udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) {
>>>               score = compute_score(sk, net, saddr, sport,
>>>                                     daddr, hnum, dif, sdif);
>>>               if (score > badness) {
>>> -                       if (sk->sk_reuseport) {
>>> +                       if (sk->sk_reuseport &&
>>> +                           sk->sk_state != TCP_ESTABLISHED &&
>>> +                           !reuseport_result) {
>>>                               hash = udp_ehashfn(net, daddr, hnum,
>>>                                                  saddr, sport);
>>> -                               result = reuseport_select_sock(sk, hash, skb,
>>> +                               reuseport_result =
>>> reuseport_select_sock(sk, hash, skb,
>>>                                                       sizeof(struct udphdr));
>>> -                               if (result)
>>> -                                       return result;
>>> +                               if (reuseport_result)
>>> +                                       reuseport_score = score;
>>> +                               continue;
>>>                       }
>>>                       badness = score;
>>>                       result = sk;
>>>               }
>>>       }
>>> +
>>> +       if (badness < reuseport_score)
>>> +               result = reuseport_result;
>>> +
>>>       return result;
>>> "
>>> 
>>> To break out after the first reuseport hit when it is safe, i.e., when
>>> it holds no connected sockets, requires adding this state to struct
>>> sock_reuseport at __ip4_datagram_connect. And modify
>>> reuseport_select_sock to read this. At least, I have not found a more
>>> elegant solution.
>>> 
>>>> Steve, Re: your point on a scalable QUIC server. That is an
>>>> interesting case certainly. Opening a connected socket per flow adds
>>>> both memory and port table pressure. I once looked into an SO_TXONLY
>>>> udp socket option that does not hash connected sockets into the port
>>>> table. In effect receiving on a small set of listening sockets (e.g.,
>>>> one per cpu) and sending over separate tx-only sockets. That still
>>>> introduces unnecessary memory allocation. OTOH it amortizes some
>>>> operations, such as route lookup.
>>>> 
>>>> Anyway, that does not fix the immediate issue you reported when using
>>>> SO_REUSEPORT as described.
>>> 
>>> As for the BPF program: good point on accessing the udp port when
>>> skb->data is already beyond the header.
>>> 
>>> Programs of type sk_filter can use bpf_skb_load_bytes(_relative).
>>> Which I think will work, but have not tested.
>>> 
>>> As of kernel 4.19 programs of type BPF_PROG_TYPE_SK_REUSEPORT can be
>>> attached (with CAP_SYS_ADMIN). See
>>> tools/testing/selftests/bpf/progs/test_select_reuseport_kern.c for an
>>> example that parses udp headers with bpf_skb_load_bytes.
>>> 


^ permalink raw reply

* Re: [PATCH] net/skbuff: silence warnings under memory pressure
From: Sergey Senozhatsky @ 2019-09-04 14:48 UTC (permalink / raw)
  To: Qian Cai
  Cc: Sergey Senozhatsky, Michal Hocko, Eric Dumazet, davem, netdev,
	linux-mm, linux-kernel, Petr Mladek, Sergey Senozhatsky,
	Steven Rostedt
In-Reply-To: <1567599263.5576.72.camel@lca.pw>

On (09/04/19 08:14), Qian Cai wrote:
> > Plus one more check - waitqueue_active(&log_wait). printk() adds
> > pending irq_work only if there is a user-space process sleeping on
> > log_wait and irq_work is not already scheduled. If the syslog is
> > active or there is no one to wake up then we don't queue irq_work.
> 
> Another possibility for this potential livelock is that those printk() from
> warn_alloc(), dump_stack() and show_mem() increase the time it needs to process
> build_skb() allocation failures significantly under memory pressure. As the
> result, ksoftirqd() could be rescheduled during that time via a different CPU
> (this is a large x86 NUMA system anyway),
> 
> [83605.577256][   C31]  run_ksoftirqd+0x1f/0x40
> [83605.577256][   C31]  smpboot_thread_fn+0x255/0x440
> [83605.577256][   C31]  kthread+0x1df/0x200
> [83605.577256][   C31]  ret_from_fork+0x35/0x40

Hum hum hum...

So I can, _probably_, think of several patches.

First, move wake_up_klogd() back to console_unlock().

Second, move `printk_pending' out of per-CPU region and make it global.
So we will have just one printk irq_work scheduled across all CPUs;
currently we have one irq_work per CPU. I think I sent a patch a long
long time ago, but we never discussed it, as far as I remember.

> In addition, those printk() will deal with console drivers or even a networking
> console, so it is probably not unusual that it could call irq_exit()-
>__do_softirq() at one point and then this livelock.

Do you use netcon? Because this, theoretically, can open up one more
vector. netcon allocates skbs from ->write() path. We call con drivers'
->write() from printk_safe context, so should netcon skb allocation
warn, we will schedule one more irq_work on that CPU to flush the per-CPU
printk_safe buffer.

If this is the case, then we can stop calling console_driver() under
printk_safe. I sent a patch a while ago, but we agreed to keep the
things the way they are, for the time being.

Let me think more.

	-ss

^ permalink raw reply

* Re: [Bridge] [PATCH v3 1/2] net: bridge: use mac_len in bridge forwarding
From: Zahari Doychev @ 2019-09-04 14:32 UTC (permalink / raw)
  To: Toshiaki Makita
  Cc: netdev, makita.toshiaki, jiri, nikolay, simon.horman, roopa,
	bridge, jhs, dsahern, xiyou.wangcong, johannes,
	alexei.starovoitov
In-Reply-To: <a9a093f2-1ec6-339c-b015-eb658618cf2b@gmail.com>

On Wed, Sep 04, 2019 at 04:14:28PM +0900, Toshiaki Makita wrote:
> On 2019/09/03 22:36, Zahari Doychev wrote:
> > On Tue, Sep 03, 2019 at 08:37:36PM +0900, Toshiaki Makita wrote:
> > > Hi Zahari,
> > > 
> > > Sorry for reviewing this late.
> > > 
> > > On 2019/09/03 3:09, Zahari Doychev wrote:
> > > ...
> > > > @@ -466,13 +466,14 @@ static bool __allowed_ingress(const struct net_bridge *br,
> > > >    		/* Tagged frame */
> > > >    		if (skb->vlan_proto != br->vlan_proto) {
> > > >    			/* Protocol-mismatch, empty out vlan_tci for new tag */
> > > > -			skb_push(skb, ETH_HLEN);
> > > > +			skb_push(skb, skb->mac_len);
> > > >    			skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto,
> > > >    							skb_vlan_tag_get(skb));
> > > 
> > > I think we should insert vlan at skb->data, i.e. mac_header + mac_len, while this
> > > function inserts the tag at mac_header + ETH_HLEN which is not always the correct
> > > offset.
> > 
> > Maybe I am misunderstanding the concern here but this should make sure that
> > the VLAN tag from the skb is moved back into the payload as the outermost tag.
> > So it should follow the ethernet header. It looks like this e.g.,:
> > 
> > VLAN1 in skb:
> > +------+------+-------+
> > | DMAC | SMAC | ETYPE |
> > +------+------+-------+
> > 
> > VLAN1 moved to payload:
> > +------+------+-------+-------+
> > | DMAC | SMAC | VLAN1 | ETYPE |
> > +------+------+-------+-------+
> > 
> > VLAN2 in skb:
> > +------+------+-------+-------+
> > | DMAC | SMAC | VLAN1 | ETYPE |
> > +------+------+-------+-------+
> > 
> > VLAN2 moved to payload:
> > 
> > +------+------+-------+-------+
> > | DMAC | SMAC | VLAN2 | VLAN1 | ....
> > +------+------+-------+-------+
> > 
> > Doing the skb push with mac_len makes sure that VLAN tag is inserted in the
> > correct offset. For mac_len == ETH_HLEN this does not change the current
> > behaviour.
> 
> Reordering VLAN headers here does not look correct to me. If skb->data points to ETH+VLAN,
> then we should insert the vlan at the offset.
> Vlan devices with reorder_hdr disabled produce packets whose mac_len includes ETH+VLAN header,
> and they expect vlan insertion after the outer vlan header.

I see, so in this case we should handle it differently, as it seems that sometimes
we have to insert after and sometimes before the tag in the packet. I am not quite
sure if this can be detected here. I was trying to get bridging with VLAN
devices with reorder_hdr disabled working, but somehow I was not able to get
mac_len longer than ETH_HLEN in any of the cases that I tried. Can you provide an
example of how I can try this out? It will really help me to understand the
problem better.

> 
> Also I'm not sure there is standard ethernet header in mac_len, as mac_len is not ETH_HLEN.
> E.g. tun devices can produce vlan packets without ethernet header.

How is the bridge forwarding decision done in this case when there are no
MAC addresses, vlan based only?

> 
> > 
> > > 
> > > >    			if (unlikely(!skb))
> > > >    				return false;
> > > >    			skb_pull(skb, ETH_HLEN);
> > > 
> > > Now skb->data is mac_header + ETH_HLEN which would be broken when mac_len is not
> > > ETH_HLEN?
> > 
> > I thought it would be better to point in this case to the outer tag as otherwise
> > if mac_len is used the skb->data will point to the next tag which I find somehow
> > inconsistent or do you see some case where this can cause problems?
> 
> Vlan devices with reorder_hdr off will break because it relies on skb->data offset
> as I described in the previous discussion.

I also see in vlan_do_receive that the VLAN tag is moved to the payload when
reorder_hdr is off and the vlan_dev is not a bridge port. So it seems that
I am misunderstanding the reorder_hdr option; if you can give me some more
details about how it is supposed to be used, it will be highly appreciated.

Thanks
Zahari

> 
> Toshiaki Makita

^ permalink raw reply

* Re: Is bug 200755 in anyone's queue??
From: Willem de Bruijn @ 2019-09-04 14:23 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Mark KEATON, Steve Zabele, Willem de Bruijn, Network Development,
	shum@canndrew.org, vladimir116@gmail.com, saifi.khan@strikr.in,
	Daniel Borkmann, on2k16nm@gmail.com, Stephen Hemminger
In-Reply-To: <c3b83305-82a5-f3c8-2602-1aed2e9b51ca@gmail.com>

On Wed, Sep 4, 2019 at 8:23 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 9/4/19 2:00 PM, Mark KEATON wrote:
> > Hi Willem,
> >
> > I am the person who commented on the original bug report in bugzilla.
> >
> > In communicating with Steve just now about possible solutions that maintain the efficiency that you are after, what would you think of the following:  keep two lists of UDP sockets, those connected and those not connected, and always search the connected list first.
>
> This was my suggestion.
>
> Note that this requires adding yet another hash table, and yet another lookup
> (another cache line miss per incoming packet)
>
> This lookup will slow down DNS and QUIC servers, or any application solely using not connected sockets.

Exactly.

The only way around it that I see is to keep the single list and
optionally mark a struct sock_reuseport as having no connected
members, in which case the search can break on the first reuseport
match, as it does today.

"
On top of the main patch it requires something like

@@ -22,6 +22,7 @@ struct sock_reuseport {
        /* ID stays the same even after the size of socks[] grows. */
        unsigned int            reuseport_id;
        bool                    bind_inany;
+       unsigned int             connected;
        struct bpf_prog __rcu   *prog;          /* optional BPF sock selector */
        struct sock             *socks[0];      /* array of sock pointers */
 };

@@ -73,6 +74,15 @@ int __ip4_datagram_connect(struct sock *sk, struct
sockaddr *uaddr, int addr_len
        sk_set_txhash(sk);
        inet->inet_id = jiffies;

+       if (rcu_access_pointer(sk->sk_reuseport_cb)) {
+               struct sock_reuseport *reuse;
+
+               rcu_read_lock();
+               reuse = rcu_dereference(sk->sk_reuseport_cb);
+               reuse->connected = 1;
+               rcu_read_unlock();
+       }
+
        sk_dst_set(sk, &rt->dst);
        err = 0;
"

plus a way for reuseport_select_sock to communicate that. Probably a
variant __reuseport_select_sock with an extra argument.

As for BPF: the example I pointed out does read ip addresses and uses
a BPF map for socket selection. But as that feature is new with 4.19
it is probably moot for this purpose, as we are targeting a fix that
can be backported to 4.19 stable.

^ permalink raw reply

* [PATCH] net: hns: Move static keyword to the front of declaration
From: Krzysztof Wilczynski @ 2019-09-04 14:21 UTC (permalink / raw)
  To: Yisen Zhuang, Salil Mehta, David S. Miller, Yonglong Liu, Peng Li,
	Greg Kroah-Hartman, Colin Ian King, Huang Zijiang,
	Thomas Gleixner
  Cc: netdev, linux-kernel

Move the static keyword to the front of declaration of g_dsaf_mode_match,
and resolve the following compiler warning that can be seen when building
with warnings enabled (W=1):

drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c:27:1: warning:
  ‘static’ is not at beginning of declaration [-Wold-style-declaration]

Signed-off-by: Krzysztof Wilczynski <kw@linux.com>
---
Related: https://lore.kernel.org/r/20190827233017.GK9987@google.com

 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
index c1eba421ba82..3a14bbc26ea2 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
@@ -24,7 +24,7 @@
 #include "hns_dsaf_rcb.h"
 #include "hns_dsaf_misc.h"
 
-const static char *g_dsaf_mode_match[DSAF_MODE_MAX] = {
+static const char *g_dsaf_mode_match[DSAF_MODE_MAX] = {
 	[DSAF_MODE_DISABLE_2PORT_64VM] = "2port-64vf",
 	[DSAF_MODE_DISABLE_6PORT_0VM] = "6port-16rss",
 	[DSAF_MODE_DISABLE_6PORT_16VM] = "6port-16vf",
-- 
2.22.1


^ permalink raw reply related

* Re: [RESEND PATCH 0/5] Add bluetooth support for Orange Pi 3
From: Marcel Holtmann @ 2019-09-04 14:19 UTC (permalink / raw)
  To: Maxime Ripard
  Cc: megous, Chen-Yu Tsai, Rob Herring, Johan Hedberg, Mark Rutland,
	David S. Miller, netdev, devicetree, linux-kernel,
	linux-arm-kernel, linux-bluetooth
In-Reply-To: <20190830132034.u65arlv7umh64lx6@flea>

Hi Maxime,

>>>>> (Resend to add missing lists, sorry for the noise.)
>>>>> 
>>>>> This series implements bluetooth support for Xunlong Orange Pi 3 board.
>>>>> 
>>>>> The board uses AP6256 WiFi/BT 5.0 chip.
>>>>> 
>>>>> Summary of changes:
>>>>> 
>>>>> - add more delay to let the chip initialize
>>>>> - let the kernel detect firmware file path
>>>>> - add new compatible and update dt-bindings
>>>>> - update Orange Pi 3 / H6 DTS
>>>>> 
>>>>> Please take a look.
>>>>> 
>>>>> thank you and regards,
>>>>> Ondrej Jirman
>>>>> 
>>>>> Ondrej Jirman (5):
>>>>> dt-bindings: net: Add compatible for BCM4345C5 bluetooth device
>>>>> bluetooth: bcm: Add support for loading firmware for BCM4345C5
>>>>> bluetooth: hci_bcm: Give more time to come out of reset
>>>>> arm64: dts: allwinner: h6: Add pin configs for uart1
>>>>> arm64: dts: allwinner: orange-pi-3: Enable UART1 / Bluetooth
>>>>> 
>>>>> .../bindings/net/broadcom-bluetooth.txt       |  1 +
>>>>> .../dts/allwinner/sun50i-h6-orangepi-3.dts    | 19 +++++++++++++++++++
>>>>> arch/arm64/boot/dts/allwinner/sun50i-h6.dtsi  | 10 ++++++++++
>>>>> drivers/bluetooth/btbcm.c                     |  3 +++
>>>>> drivers/bluetooth/hci_bcm.c                   |  3 ++-
>>>>> 5 files changed, 35 insertions(+), 1 deletion(-)
>>>> 
>>>> all 5 patches have been applied to bluetooth-next tree.
>>> 
>>> The DTS patches (last 2) should go through the arm-soc tree, can you
>>> drop them?
>> 
>> why is that? We have included DTS changes for Bluetooth devices
>> directly all the time. What is different with this hardware?
> 
> I guess some maintainers are more relaxed with it than we are then,
> but for the why, well, it's the usual reasons, the most immediate one
> being that it reduces to a minimum the conflicts between trees.
> 
> The other being that it's not really usual to merge patches supposed
> to be handled by another maintainer without (at least) his
> consent. I'm pretty sure you would have made the same request if I
> had merged the bluetooth patches through my tree without
> notice.

I took the two DTS patches out now and let the submitter deal with getting these merged.

Regards

Marcel


^ permalink raw reply

* Re: [PATCH net-next] MAINTAINERS: add myself as maintainer for xilinx axiethernet driver
From: Michal Simek @ 2019-09-04 14:18 UTC (permalink / raw)
  To: Radhey Shyam Pandey, davem, netdev
  Cc: michal.simek, anirudha.sarangi, linux, mchehab+samsung, gregkh,
	nicolas.ferre, linux-arm-kernel, linux-kernel
In-Reply-To: <1567604658-9335-1-git-send-email-radhey.shyam.pandey@xilinx.com>

On 04. 09. 19 15:44, Radhey Shyam Pandey wrote:
> I am maintaining xilinx axiethernet driver in xilinx tree and would like
> to maintain it in the mainline kernel as well. Hence adding myself as a
> maintainer. Also Anirudha and John have moved to new roles, so based on
> request removing them from the maintainer list.
> 
> Signed-off-by: Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
> Acked-by: John Linn <john.linn@xilinx.com>
> ---
>  MAINTAINERS |    3 +--
>  1 files changed, 1 insertions(+), 2 deletions(-)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index a081c47..74d5566 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -17714,8 +17714,7 @@ F:	include/uapi/linux/dqblk_xfs.h
>  F:	include/uapi/linux/fsmap.h
>  
>  XILINX AXI ETHERNET DRIVER
> -M:	Anirudha Sarangi <anirudh@xilinx.com>
> -M:	John Linn <John.Linn@xilinx.com>
> +M:	Radhey Shyam Pandey <radhey.shyam.pandey@xilinx.com>
>  S:	Maintained
>  F:	drivers/net/ethernet/xilinx/xilinx_axienet*
>  
> 

Acked-by: Michal Simek <michal.simek@xilinx.com>

Thanks,
Michal

^ permalink raw reply

* [PATCH] net: qed: Move static keyword to the front of declaration
From: Krzysztof Wilczynski @ 2019-09-04 14:17 UTC (permalink / raw)
  To: Ariel Elior; +Cc: David S. Miller, GR-everest-linux-l2, netdev, linux-kernel

Move the static keyword to the front of declaration of iwarp_state_names,
and resolve the following compiler warning that can be seen when building
with warnings enabled (W=1):

drivers/net/ethernet/qlogic/qed/qed_iwarp.c:385:1: warning:
  ‘static’ is not at beginning of declaration [-Wold-style-declaration]

Also, resolve checkpatch.pl script warning:

WARNING: static const char * array should probably be
  static const char * const

Signed-off-by: Krzysztof Wilczynski <kw@linux.com>
---
Related: https://lore.kernel.org/r/20190827233017.GK9987@google.com

 drivers/net/ethernet/qlogic/qed/qed_iwarp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
index f380fae8799d..65ec16a31658 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_iwarp.c
@@ -382,7 +382,7 @@ qed_iwarp2roce_state(enum qed_iwarp_qp_state state)
 	}
 }
 
-const static char *iwarp_state_names[] = {
+static const char * const iwarp_state_names[] = {
 	"IDLE",
 	"RTS",
 	"TERMINATE",
-- 
2.22.1


^ permalink raw reply related

* [PATCH v3 net] net: Properly update v4 routes with v6 nexthop
From: Donald Sharp @ 2019-09-04 14:11 UTC (permalink / raw)
  To: netdev, dsahern, sworley

When creating a v4 route that uses a v6 nexthop from a nexthop group,
allow the kernel to properly send the nexthop as v6 via the RTA_VIA
attribute.

Broken behavior:

$ ip nexthop add via fe80::9 dev eth0
$ ip nexthop show
id 1 via fe80::9 dev eth0 scope link
$ ip route add 4.5.6.7/32 nhid 1
$ ip route show
default via 10.0.2.2 dev eth0
4.5.6.7 nhid 1 via 254.128.0.0 dev eth0
10.0.2.0/24 dev eth0 proto kernel scope link src 10.0.2.15
$

Fixed behavior:

$ ip nexthop add via fe80::9 dev eth0
$ ip nexthop show
id 1 via fe80::9 dev eth0 scope link
$ ip route add 4.5.6.7/32 nhid 1
$ ip route show
default via 10.0.2.2 dev eth0
4.5.6.7 nhid 1 via inet6 fe80::9 dev eth0
10.0.2.0/24 dev eth0 proto kernel scope link src 10.0.2.15
$

v2, v3: Addresses code review comments from David Ahern

Fixes: dcb1ecb50edf ("ipv4: Prepare for fib6_nh from a nexthop object")
Signed-off-by: Donald Sharp <sharpd@cumulusnetworks.com>
---
 include/net/ip_fib.h     |  4 ++--
 include/net/nexthop.h    |  5 +++--
 net/ipv4/fib_semantics.c | 15 ++++++++-------
 net/ipv6/route.c         | 11 ++++++-----
 4 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 4c81846ccce8..ab1ca9e238d2 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -513,7 +513,7 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh,
 			  struct netlink_callback *cb);
 
 int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh,
-		     unsigned char *flags, bool skip_oif);
+		     u8 rt_family, unsigned char *flags, bool skip_oif);
 int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh,
-		    int nh_weight);
+		    int nh_weight, u8 rt_family);
 #endif  /* _NET_FIB_H */
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 95f766c31c90..331ebbc94fe7 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -161,7 +161,8 @@ struct nexthop *nexthop_mpath_select(const struct nexthop *nh, int nhsel)
 }
 
 static inline
-int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh)
+int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh,
+			    u8 rt_family)
 {
 	struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
 	int i;
@@ -172,7 +173,7 @@ int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh)
 		struct fib_nh_common *nhc = &nhi->fib_nhc;
 		int weight = nhg->nh_entries[i].weight;
 
-		if (fib_add_nexthop(skb, nhc, weight) < 0)
+		if (fib_add_nexthop(skb, nhc, weight, rt_family) < 0)
 			return -EMSGSIZE;
 	}
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 2db089e10ba0..0913a090b2bf 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1582,7 +1582,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
 }
 
 int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
-		     unsigned char *flags, bool skip_oif)
+		     u8 rt_family, unsigned char *flags, bool skip_oif)
 {
 	if (nhc->nhc_flags & RTNH_F_DEAD)
 		*flags |= RTNH_F_DEAD;
@@ -1613,7 +1613,7 @@ int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nhc,
 		/* if gateway family does not match nexthop family
 		 * gateway is encoded as RTA_VIA
 		 */
-		if (nhc->nhc_gw_family != nhc->nhc_family) {
+		if (rt_family != nhc->nhc_gw_family) {
 			int alen = sizeof(struct in6_addr);
 			struct nlattr *nla;
 			struct rtvia *via;
@@ -1654,7 +1654,7 @@ EXPORT_SYMBOL_GPL(fib_nexthop_info);
 
 #if IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) || IS_ENABLED(CONFIG_IPV6)
 int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
-		    int nh_weight)
+		    int nh_weight, u8 rt_family)
 {
 	const struct net_device *dev = nhc->nhc_dev;
 	struct rtnexthop *rtnh;
@@ -1667,7 +1667,7 @@ int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nhc,
 	rtnh->rtnh_hops = nh_weight - 1;
 	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
 
-	if (fib_nexthop_info(skb, nhc, &flags, true) < 0)
+	if (fib_nexthop_info(skb, nhc, rt_family, &flags, true) < 0)
 		goto nla_put_failure;
 
 	rtnh->rtnh_flags = flags;
@@ -1693,13 +1693,14 @@ static int fib_add_multipath(struct sk_buff *skb, struct fib_info *fi)
 		goto nla_put_failure;
 
 	if (unlikely(fi->nh)) {
-		if (nexthop_mpath_fill_node(skb, fi->nh) < 0)
+		if (nexthop_mpath_fill_node(skb, fi->nh, AF_INET) < 0)
 			goto nla_put_failure;
 		goto mp_end;
 	}
 
 	for_nexthops(fi) {
-		if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight) < 0)
+		if (fib_add_nexthop(skb, &nh->nh_common, nh->fib_nh_weight,
+				    AF_INET) < 0)
 			goto nla_put_failure;
 #ifdef CONFIG_IP_ROUTE_CLASSID
 		if (nh->nh_tclassid &&
@@ -1775,7 +1776,7 @@ int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
 		const struct fib_nh_common *nhc = fib_info_nhc(fi, 0);
 		unsigned char flags = 0;
 
-		if (fib_nexthop_info(skb, nhc, &flags, false) < 0)
+		if (fib_nexthop_info(skb, nhc, AF_INET, &flags, false) < 0)
 			goto nla_put_failure;
 
 		rtm->rtm_flags = flags;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index fd059e08785a..cfb969e68d45 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5329,7 +5329,7 @@ static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
 		if (!mp)
 			goto nla_put_failure;
 
-		if (nexthop_mpath_fill_node(skb, nh))
+		if (nexthop_mpath_fill_node(skb, nh, AF_INET6))
 			goto nla_put_failure;
 
 		nla_nest_end(skb, mp);
@@ -5337,7 +5337,7 @@ static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
 		struct fib6_nh *fib6_nh;
 
 		fib6_nh = nexthop_fib6_nh(nh);
-		if (fib_nexthop_info(skb, &fib6_nh->nh_common,
+		if (fib_nexthop_info(skb, &fib6_nh->nh_common, AF_INET6,
 				     flags, false) < 0)
 			goto nla_put_failure;
 	}
@@ -5466,13 +5466,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 			goto nla_put_failure;
 
 		if (fib_add_nexthop(skb, &rt->fib6_nh->nh_common,
-				    rt->fib6_nh->fib_nh_weight) < 0)
+				    rt->fib6_nh->fib_nh_weight, AF_INET6) < 0)
 			goto nla_put_failure;
 
 		list_for_each_entry_safe(sibling, next_sibling,
 					 &rt->fib6_siblings, fib6_siblings) {
 			if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common,
-					    sibling->fib6_nh->fib_nh_weight) < 0)
+					    sibling->fib6_nh->fib_nh_weight,
+					    AF_INET6) < 0)
 				goto nla_put_failure;
 		}
 
@@ -5489,7 +5490,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
 
 		rtm->rtm_flags |= nh_flags;
 	} else {
-		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
+		if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common, AF_INET6,
 				     &nh_flags, false) < 0)
 			goto nla_put_failure;
 
-- 
2.21.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox