Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [PATCH 2/4] net: phy: Add mdio-aspeed
From: Andrew Lunn @ 2019-07-29 13:32 UTC (permalink / raw)
  To: Andrew Jeffery
  Cc: netdev, davem, robh+dt, mark.rutland, joel, f.fainelli,
	hkallweit1, devicetree, linux-arm-kernel, linux-aspeed,
	linux-kernel
In-Reply-To: <20190729043926.32679-3-andrew@aj.id.au>

On Mon, Jul 29, 2019 at 02:09:24PM +0930, Andrew Jeffery wrote:
> The AST2600 design separates the MDIO controllers from the MAC, which is
> where they were placed in the AST2400 and AST2500. Further, the register
> interface is reworked again, so now we have three possible different
> interface implementations, however this driver only supports the
> interface provided by the AST2600. The AST2400 and AST2500 will continue
> to be supported by the MDIO support embedded in the FTGMAC100 driver.
> 
> Signed-off-by: Andrew Jeffery <andrew@aj.id.au>
> ---
>  drivers/net/phy/Kconfig       |  13 +++
>  drivers/net/phy/Makefile      |   1 +
>  drivers/net/phy/mdio-aspeed.c | 159 ++++++++++++++++++++++++++++++++++
>  3 files changed, 173 insertions(+)
>  create mode 100644 drivers/net/phy/mdio-aspeed.c
> 
> diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
> index 20f14c5fbb7e..206d8650ee7f 100644
> --- a/drivers/net/phy/Kconfig
> +++ b/drivers/net/phy/Kconfig
> @@ -21,6 +21,19 @@ config MDIO_BUS
>  
>  if MDIO_BUS
>  
> +config MDIO_ASPEED
> +	tristate "ASPEED MDIO bus controller"
> +	depends on ARCH_ASPEED || COMPILE_TEST
> +	depends on OF_MDIO && HAS_IOMEM
> +	help
> +	  This module provides a driver for the independent MDIO bus
> +	  controllers found in the ASPEED AST2600 SoC. This is a driver for the
> +	  third revision of the ASPEED MDIO register interface - the first two
> +	  revisions are the "old" and "new" interfaces found in the AST2400 and
> +	  AST2500, embedded in the MAC. For legacy reasons, FTGMAC100 driver
> +	  continues to drive the embedded MDIO controller for the AST2400 and
> +	  AST2500 SoCs, so say N if AST2600 support is not required.
> +
>  config MDIO_BCM_IPROC
>  	tristate "Broadcom iProc MDIO bus controller"
>  	depends on ARCH_BCM_IPROC || COMPILE_TEST
> diff --git a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
> index 839acb292c38..ba07c27e4208 100644
> --- a/drivers/net/phy/Makefile
> +++ b/drivers/net/phy/Makefile
> @@ -22,6 +22,7 @@ libphy-$(CONFIG_LED_TRIGGER_PHY)	+= phy_led_triggers.o
>  obj-$(CONFIG_PHYLINK)		+= phylink.o
>  obj-$(CONFIG_PHYLIB)		+= libphy.o
>  
> +obj-$(CONFIG_MDIO_ASPEED)	+= mdio-aspeed.o
>  obj-$(CONFIG_MDIO_BCM_IPROC)	+= mdio-bcm-iproc.o
>  obj-$(CONFIG_MDIO_BCM_UNIMAC)	+= mdio-bcm-unimac.o
>  obj-$(CONFIG_MDIO_BITBANG)	+= mdio-bitbang.o
> diff --git a/drivers/net/phy/mdio-aspeed.c b/drivers/net/phy/mdio-aspeed.c
> new file mode 100644
> index 000000000000..71496a9ff54a
> --- /dev/null
> +++ b/drivers/net/phy/mdio-aspeed.c
> @@ -0,0 +1,159 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/* Copyright (C) 2019 IBM Corp. */
> +
> +#include <linux/bitfield.h>
> +#include <linux/delay.h>
> +#include <linux/mdio.h>
> +#include <linux/module.h>
> +#include <linux/of.h>
> +#include <linux/of_mdio.h>
> +#include <linux/phy.h>
> +#include <linux/platform_device.h>
> +
> +#define DRV_NAME "mdio-aspeed"
> +
> +#define ASPEED_MDIO_CTRL		0x0
> +#define   ASPEED_MDIO_CTRL_FIRE		BIT(31)
> +#define   ASPEED_MDIO_CTRL_ST		BIT(28)
> +#define     ASPEED_MDIO_CTRL_ST_C45	0
> +#define     ASPEED_MDIO_CTRL_ST_C22	1
> +#define   ASPEED_MDIO_CTRL_OP		GENMASK(27, 26)
> +#define     MDIO_C22_OP_WRITE		0b01
> +#define     MDIO_C22_OP_READ		0b10
> +#define   ASPEED_MDIO_CTRL_PHYAD	GENMASK(25, 21)
> +#define   ASPEED_MDIO_CTRL_REGAD	GENMASK(20, 16)
> +#define   ASPEED_MDIO_CTRL_MIIWDATA	GENMASK(15, 0)
> +
> +#define ASPEED_MDIO_DATA		0x4
> +#define   ASPEED_MDIO_DATA_MDC_THRES	GENMASK(31, 24)
> +#define   ASPEED_MDIO_DATA_MDIO_EDGE	BIT(23)
> +#define   ASPEED_MDIO_DATA_MDIO_LATCH	GENMASK(22, 20)
> +#define   ASPEED_MDIO_DATA_IDLE		BIT(16)
> +#define   ASPEED_MDIO_DATA_MIIRDATA	GENMASK(15, 0)
> +
> +#define ASPEED_MDIO_RETRIES		10
> +
> +struct aspeed_mdio {
> +	void __iomem *base;
> +};
> +
> +static int aspeed_mdio_read(struct mii_bus *bus, int addr, int regnum)
> +{
> +	struct aspeed_mdio *ctx = bus->priv;
> +	u32 ctrl;
> +	int i;
> +
> +	dev_dbg(&bus->dev, "%s: addr: %d, regnum: %d\n", __func__, addr,
> +		regnum);
> +
> +	/* Just clause 22 for the moment */
> +	ctrl = ASPEED_MDIO_CTRL_FIRE

Hi Andrew

In the binding, you say C45 is supported. Here you don't. It would be
nice to be consistent.


> +		| FIELD_PREP(ASPEED_MDIO_CTRL_ST, ASPEED_MDIO_CTRL_ST_C22)
> +		| FIELD_PREP(ASPEED_MDIO_CTRL_OP, MDIO_C22_OP_READ)
> +		| FIELD_PREP(ASPEED_MDIO_CTRL_PHYAD, addr)
> +		| FIELD_PREP(ASPEED_MDIO_CTRL_REGAD, regnum);
> +
> +	iowrite32(ctrl, ctx->base + ASPEED_MDIO_CTRL);
> +
> +	for (i = 0; i < ASPEED_MDIO_RETRIES; i++) {
> +		u32 data;
> +
> +		data = ioread32(ctx->base + ASPEED_MDIO_DATA);
> +		if (data & ASPEED_MDIO_DATA_IDLE)
> +			return FIELD_GET(ASPEED_MDIO_DATA_MIIRDATA, data);
> +
> +		udelay(100);
> +	}

One of the readx_poll_timeout functions could be used.

> +
> +	dev_err(&bus->dev, "MDIO read failed\n");
> +	return -EIO;
> +}
> +
> +static int aspeed_mdio_write(struct mii_bus *bus, int addr, int regnum, u16 val)
> +{
> +	struct aspeed_mdio *ctx = bus->priv;
> +	u32 ctrl;
> +	int i;
> +
> +	dev_dbg(&bus->dev, "%s: addr: %d, regnum: %d, val: 0x%x\n",
> +		__func__, addr, regnum, val);
> +
> +	/* Just clause 22 for the moment */
> +	ctrl = ASPEED_MDIO_CTRL_FIRE
> +		| FIELD_PREP(ASPEED_MDIO_CTRL_ST, ASPEED_MDIO_CTRL_ST_C22)
> +		| FIELD_PREP(ASPEED_MDIO_CTRL_OP, MDIO_C22_OP_WRITE)
> +		| FIELD_PREP(ASPEED_MDIO_CTRL_PHYAD, addr)
> +		| FIELD_PREP(ASPEED_MDIO_CTRL_REGAD, regnum)
> +		| FIELD_PREP(ASPEED_MDIO_CTRL_MIIWDATA, val);
> +
> +	iowrite32(ctrl, ctx->base + ASPEED_MDIO_CTRL);
> +
> +	for (i = 0; i < ASPEED_MDIO_RETRIES; i++) {
> +		ctrl = ioread32(ctx->base + ASPEED_MDIO_CTRL);
> +		if (!(ctrl & ASPEED_MDIO_CTRL_FIRE))
> +			return 0;
> +
> +		udelay(100);
> +	}

readx_poll_timeout() here as well.

Otherwise this looks good.

	  Andrew

^ permalink raw reply

* Re: [PATCH v3] net: dsa: qca8k: enable port flow control
From: Andrew Lunn @ 2019-07-29 13:23 UTC (permalink / raw)
  To: xiaofeis
  Cc: davem, vkoul, netdev, linux-arm-msm, bjorn.andersson,
	vivien.didelot, f.fainelli, niklas.cassel, xiazha
In-Reply-To: <fa444b03b42a2cb72037bc73a62f1976@codeaurora.org>

> But our qca8k HW can auto sync the pause status to MAC from phy with the
> auto-negotiated result.
> So no need to set in qca8k_adjust_link, since there is one setting in
> qca8k_port_set_status: mask |= QCA8K_PORT_STATUS_LINK_AUTO;

How does the auto-sync actually work? Does the MAC make MDIO reads to
the PHY? That is generally unsafe, since some PHYs support pages, and
the PHY driver might be using a different page while the MAC tries to
access the auto-neg results.

Do any of the ports support an external PHY? The auto-sync might not
work in that condition as well. Different register layout, c45 not
c22, etc.

The safest option is to explicitly set the MAC flow configuration
based on the values in phydev.

      Andrew

^ permalink raw reply

* [PATCH stable 4.4] tcp: reset sk_send_head in tcp_write_queue_purge
From: Mao Wenan @ 2019-07-29 13:22 UTC (permalink / raw)
  To: gregkh, stable; +Cc: netdev, linux-kernel

From: Soheil Hassas Yeganeh <soheil@google.com>

tcp_write_queue_purge clears all the SKBs in the write queue
but does not reset the sk_send_head. As a result, we can have
a NULL pointer dereference anywhere that we use tcp_send_head
instead of the tcp_write_queue_tail.

For example, after a27fd7a8ed38 (tcp: purge write queue upon RST),
we can purge the write queue on RST. Prior to
75c119afe14f (tcp: implement rb-tree based retransmit queue),
tcp_push will only check tcp_send_head and then accesses
tcp_write_queue_tail to send the actual SKB. As a result, it will
dereference a NULL pointer.

This has been reported twice for 4.14 where we don't have
75c119afe14f:

By Timofey Titovets:

[  422.081094] BUG: unable to handle kernel NULL pointer dereference
at 0000000000000038
[  422.081254] IP: tcp_push+0x42/0x110
[  422.081314] PGD 0 P4D 0
[  422.081364] Oops: 0002 [#1] SMP PTI

By Yongjian Xu:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
IP: tcp_push+0x48/0x120
PGD 80000007ff77b067 P4D 80000007ff77b067 PUD 7fd989067 PMD 0
Oops: 0002 [#18] SMP PTI
Modules linked in: tcp_diag inet_diag tcp_bbr sch_fq iTCO_wdt
iTCO_vendor_support pcspkr ixgbe mdio i2c_i801 lpc_ich joydev input_leds shpchp
e1000e igb dca ptp pps_core hwmon mei_me mei ipmi_si ipmi_msghandler sg ses
scsi_transport_sas enclosure ext4 jbd2 mbcache sd_mod ahci libahci megaraid_sas
wmi ast ttm dm_mirror dm_region_hash dm_log dm_mod dax
CPU: 6 PID: 14156 Comm: [ET_NET 6] Tainted: G D 4.14.26-1.el6.x86_64 #1
Hardware name: LENOVO ThinkServer RD440 /ThinkServer RD440, BIOS A0TS80A
09/22/2014
task: ffff8807d78d8140 task.stack: ffffc9000e944000
RIP: 0010:tcp_push+0x48/0x120
RSP: 0018:ffffc9000e947a88 EFLAGS: 00010246
RAX: 00000000000005b4 RBX: ffff880f7cce9c00 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff8807d00f5000
RBP: ffffc9000e947aa8 R08: 0000000000001c84 R09: 0000000000000000
R10: ffff8807d00f5158 R11: 0000000000000000 R12: ffff8807d00f5000
R13: 0000000000000020 R14: 00000000000256d4 R15: 0000000000000000
FS: 00007f5916de9700(0000) GS:ffff88107fd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000038 CR3: 00000007f8226004 CR4: 00000000001606e0
Call Trace:
tcp_sendmsg_locked+0x33d/0xe50
tcp_sendmsg+0x37/0x60
inet_sendmsg+0x39/0xc0
sock_sendmsg+0x49/0x60
sock_write_iter+0xb6/0x100
do_iter_readv_writev+0xec/0x130
? rw_verify_area+0x49/0xb0
do_iter_write+0x97/0xd0
vfs_writev+0x7e/0xe0
? __wake_up_common_lock+0x80/0xa0
? __fget_light+0x2c/0x70
? __do_page_fault+0x1e7/0x530
do_writev+0x60/0xf0
? inet_shutdown+0xac/0x110
SyS_writev+0x10/0x20
do_syscall_64+0x6f/0x140
? prepare_exit_to_usermode+0x8b/0xa0
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x3135ce0c57
RSP: 002b:00007f5916de4b00 EFLAGS: 00000293 ORIG_RAX: 0000000000000014
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000003135ce0c57
RDX: 0000000000000002 RSI: 00007f5916de4b90 RDI: 000000000000606f
RBP: 0000000000000000 R08: 0000000000000000 R09: 00007f5916de8c38
R10: 0000000000000000 R11: 0000000000000293 R12: 00000000000464cc
R13: 00007f5916de8c30 R14: 00007f58d8bef080 R15: 0000000000000002
Code: 48 8b 97 60 01 00 00 4c 8d 97 58 01 00 00 41 b9 00 00 00 00 41 89 f3 4c 39
d2 49 0f 44 d1 41 81 e3 00 80 00 00 0f 85 b0 00 00 00 <80> 4a 38 08 44 8b 8f 74
06 00 00 44 89 8f 7c 06 00 00 83 e6 01
RIP: tcp_push+0x48/0x120 RSP: ffffc9000e947a88
CR2: 0000000000000038
---[ end trace 8d545c2e93515549 ]---

There is another scenario, which was found in stable 4.4:
Allocated:
 [<ffffffff82f380a6>] __alloc_skb+0xe6/0x600 net/core/skbuff.c:218
 [<ffffffff832466c3>] alloc_skb_fclone include/linux/skbuff.h:856 [inline]
 [<ffffffff832466c3>] sk_stream_alloc_skb+0xa3/0x5d0 net/ipv4/tcp.c:833
 [<ffffffff83249164>] tcp_sendmsg+0xd34/0x2b00 net/ipv4/tcp.c:1178
 [<ffffffff83300ef3>] inet_sendmsg+0x203/0x4d0 net/ipv4/af_inet.c:755
Freed:
 [<ffffffff82f372fd>] __kfree_skb+0x1d/0x20 net/core/skbuff.c:676
 [<ffffffff83288834>] sk_wmem_free_skb include/net/sock.h:1447 [inline]
 [<ffffffff83288834>] tcp_write_queue_purge include/net/tcp.h:1460 [inline]
 [<ffffffff83288834>] tcp_connect_init net/ipv4/tcp_output.c:3122 [inline]
 [<ffffffff83288834>] tcp_connect+0xb24/0x30c0 net/ipv4/tcp_output.c:3261
 [<ffffffff8329b991>] tcp_v4_connect+0xf31/0x1890 net/ipv4/tcp_ipv4.c:246

BUG: KASAN: use-after-free in tcp_skb_pcount include/net/tcp.h:796 [inline]
BUG: KASAN: use-after-free in tcp_init_tso_segs net/ipv4/tcp_output.c:1619 [inline]
BUG: KASAN: use-after-free in tcp_write_xmit+0x3fc2/0x4cb0 net/ipv4/tcp_output.c:2056
 [<ffffffff81515cd5>] kasan_report.cold.7+0x175/0x2f7 mm/kasan/report.c:408
 [<ffffffff814f9784>] __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:427
 [<ffffffff83286582>] tcp_skb_pcount include/net/tcp.h:796 [inline]
 [<ffffffff83286582>] tcp_init_tso_segs net/ipv4/tcp_output.c:1619 [inline]
 [<ffffffff83286582>] tcp_write_xmit+0x3fc2/0x4cb0 net/ipv4/tcp_output.c:2056
 [<ffffffff83287a40>] __tcp_push_pending_frames+0xa0/0x290 net/ipv4/tcp_output.c:2307

stable 4.4 and stable 4.9 don't have the commit abb4a8b870b5 ("tcp: purge write queue upon RST"),
which is referred to in dbbf2d1e4077.
In these trees, tcp_connect_init calls tcp_write_queue_purge, which does not reset sk_send_head, resulting in a UAF.

stable 4.14 has the commit abb4a8b870b5 ("tcp: purge write queue upon RST");
there, tcp_reset calls tcp_write_queue_purge(sk), which does not reset sk_send_head, resulting in a UAF.

So this patch can be used to fix stable 4.4 and 4.9.

Fixes: a27fd7a8ed38 (tcp: purge write queue upon RST)
Reported-by: Timofey Titovets <nefelim4ag@gmail.com>
Reported-by: Yongjian Xu <yongjianchn@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Tested-by: Yongjian Xu <yongjianchn@gmail.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
 include/net/tcp.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index bf8a0dae977a..77438a8406ec 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1443,6 +1443,11 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 void tcp_fastopen_init_key_once(bool publish);
 #define TCP_FASTOPEN_KEY_LENGTH 16
 
+static inline void tcp_init_send_head(struct sock *sk)
+{
+	sk->sk_send_head = NULL;
+}
+
 /* Fastopen key context */
 struct tcp_fastopen_context {
 	struct crypto_cipher	*tfm;
@@ -1459,6 +1464,7 @@ static inline void tcp_write_queue_purge(struct sock *sk)
 		sk_wmem_free_skb(sk, skb);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
+	tcp_init_send_head(sk);
 	inet_csk(sk)->icsk_backoff = 0;
 }
 
@@ -1520,11 +1526,6 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli
 		tcp_sk(sk)->highest_sack = NULL;
 }
 
-static inline void tcp_init_send_head(struct sock *sk)
-{
-	sk->sk_send_head = NULL;
-}
-
 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_queue_tail(&sk->sk_write_queue, skb);
-- 
2.20.1


^ permalink raw reply related

* [PATCH stable 4.9] tcp: reset sk_send_head in tcp_write_queue_purge
From: Mao Wenan @ 2019-07-29 13:21 UTC (permalink / raw)
  To: gregkh, stable; +Cc: netdev, linux-kernel

From: Soheil Hassas Yeganeh <soheil@google.com>

tcp_write_queue_purge clears all the SKBs in the write queue
but does not reset the sk_send_head. As a result, we can have
a NULL pointer dereference anywhere that we use tcp_send_head
instead of the tcp_write_queue_tail.

For example, after a27fd7a8ed38 (tcp: purge write queue upon RST),
we can purge the write queue on RST. Prior to
75c119afe14f (tcp: implement rb-tree based retransmit queue),
tcp_push will only check tcp_send_head and then accesses
tcp_write_queue_tail to send the actual SKB. As a result, it will
dereference a NULL pointer.

This has been reported twice for 4.14 where we don't have
75c119afe14f:

By Timofey Titovets:

[  422.081094] BUG: unable to handle kernel NULL pointer dereference
at 0000000000000038
[  422.081254] IP: tcp_push+0x42/0x110
[  422.081314] PGD 0 P4D 0
[  422.081364] Oops: 0002 [#1] SMP PTI

By Yongjian Xu:

BUG: unable to handle kernel NULL pointer dereference at 0000000000000038
IP: tcp_push+0x48/0x120
PGD 80000007ff77b067 P4D 80000007ff77b067 PUD 7fd989067 PMD 0
Oops: 0002 [#18] SMP PTI
Modules linked in: tcp_diag inet_diag tcp_bbr sch_fq iTCO_wdt
iTCO_vendor_support pcspkr ixgbe mdio i2c_i801 lpc_ich joydev input_leds shpchp
e1000e igb dca ptp pps_core hwmon mei_me mei ipmi_si ipmi_msghandler sg ses
scsi_transport_sas enclosure ext4 jbd2 mbcache sd_mod ahci libahci megaraid_sas
wmi ast ttm dm_mirror dm_region_hash dm_log dm_mod dax
CPU: 6 PID: 14156 Comm: [ET_NET 6] Tainted: G D 4.14.26-1.el6.x86_64 #1
Hardware name: LENOVO ThinkServer RD440 /ThinkServer RD440, BIOS A0TS80A
09/22/2014
task: ffff8807d78d8140 task.stack: ffffc9000e944000
RIP: 0010:tcp_push+0x48/0x120
RSP: 0018:ffffc9000e947a88 EFLAGS: 00010246
RAX: 00000000000005b4 RBX: ffff880f7cce9c00 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 0000000000000040 RDI: ffff8807d00f5000
RBP: ffffc9000e947aa8 R08: 0000000000001c84 R09: 0000000000000000
R10: ffff8807d00f5158 R11: 0000000000000000 R12: ffff8807d00f5000
R13: 0000000000000020 R14: 00000000000256d4 R15: 0000000000000000
FS: 00007f5916de9700(0000) GS:ffff88107fd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000038 CR3: 00000007f8226004 CR4: 00000000001606e0
Call Trace:
tcp_sendmsg_locked+0x33d/0xe50
tcp_sendmsg+0x37/0x60
inet_sendmsg+0x39/0xc0
sock_sendmsg+0x49/0x60
sock_write_iter+0xb6/0x100
do_iter_readv_writev+0xec/0x130
? rw_verify_area+0x49/0xb0
do_iter_write+0x97/0xd0
vfs_writev+0x7e/0xe0
? __wake_up_common_lock+0x80/0xa0
? __fget_light+0x2c/0x70
? __do_page_fault+0x1e7/0x530
do_writev+0x60/0xf0
? inet_shutdown+0xac/0x110
SyS_writev+0x10/0x20
do_syscall_64+0x6f/0x140
? prepare_exit_to_usermode+0x8b/0xa0
entry_SYSCALL_64_after_hwframe+0x3d/0xa2
RIP: 0033:0x3135ce0c57
RSP: 002b:00007f5916de4b00 EFLAGS: 00000293 ORIG_RAX: 0000000000000014
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000003135ce0c57
RDX: 0000000000000002 RSI: 00007f5916de4b90 RDI: 000000000000606f
RBP: 0000000000000000 R08: 0000000000000000 R09: 00007f5916de8c38
R10: 0000000000000000 R11: 0000000000000293 R12: 00000000000464cc
R13: 00007f5916de8c30 R14: 00007f58d8bef080 R15: 0000000000000002
Code: 48 8b 97 60 01 00 00 4c 8d 97 58 01 00 00 41 b9 00 00 00 00 41 89 f3 4c 39
d2 49 0f 44 d1 41 81 e3 00 80 00 00 0f 85 b0 00 00 00 <80> 4a 38 08 44 8b 8f 74
06 00 00 44 89 8f 7c 06 00 00 83 e6 01
RIP: tcp_push+0x48/0x120 RSP: ffffc9000e947a88
CR2: 0000000000000038
---[ end trace 8d545c2e93515549 ]---

There is another scenario, which was found in stable 4.4:
Allocated:
 [<ffffffff82f380a6>] __alloc_skb+0xe6/0x600 net/core/skbuff.c:218
 [<ffffffff832466c3>] alloc_skb_fclone include/linux/skbuff.h:856 [inline]
 [<ffffffff832466c3>] sk_stream_alloc_skb+0xa3/0x5d0 net/ipv4/tcp.c:833
 [<ffffffff83249164>] tcp_sendmsg+0xd34/0x2b00 net/ipv4/tcp.c:1178
 [<ffffffff83300ef3>] inet_sendmsg+0x203/0x4d0 net/ipv4/af_inet.c:755
Freed:
 [<ffffffff82f372fd>] __kfree_skb+0x1d/0x20 net/core/skbuff.c:676
 [<ffffffff83288834>] sk_wmem_free_skb include/net/sock.h:1447 [inline]
 [<ffffffff83288834>] tcp_write_queue_purge include/net/tcp.h:1460 [inline]
 [<ffffffff83288834>] tcp_connect_init net/ipv4/tcp_output.c:3122 [inline]
 [<ffffffff83288834>] tcp_connect+0xb24/0x30c0 net/ipv4/tcp_output.c:3261
 [<ffffffff8329b991>] tcp_v4_connect+0xf31/0x1890 net/ipv4/tcp_ipv4.c:246

BUG: KASAN: use-after-free in tcp_skb_pcount include/net/tcp.h:796 [inline]
BUG: KASAN: use-after-free in tcp_init_tso_segs net/ipv4/tcp_output.c:1619 [inline]
BUG: KASAN: use-after-free in tcp_write_xmit+0x3fc2/0x4cb0 net/ipv4/tcp_output.c:2056
 [<ffffffff81515cd5>] kasan_report.cold.7+0x175/0x2f7 mm/kasan/report.c:408
 [<ffffffff814f9784>] __asan_report_load2_noabort+0x14/0x20 mm/kasan/report.c:427
 [<ffffffff83286582>] tcp_skb_pcount include/net/tcp.h:796 [inline]
 [<ffffffff83286582>] tcp_init_tso_segs net/ipv4/tcp_output.c:1619 [inline]
 [<ffffffff83286582>] tcp_write_xmit+0x3fc2/0x4cb0 net/ipv4/tcp_output.c:2056
 [<ffffffff83287a40>] __tcp_push_pending_frames+0xa0/0x290 net/ipv4/tcp_output.c:2307

stable 4.4 and stable 4.9 don't have the commit abb4a8b870b5 ("tcp: purge write queue upon RST"),
which is referred to in dbbf2d1e4077.
In these trees, tcp_connect_init calls tcp_write_queue_purge, which does not reset sk_send_head, resulting in a UAF.

stable 4.14 has the commit abb4a8b870b5 ("tcp: purge write queue upon RST");
there, tcp_reset calls tcp_write_queue_purge(sk), which does not reset sk_send_head, resulting in a UAF.

So this patch can be used to fix stable 4.4 and 4.9.

Fixes: a27fd7a8ed38 (tcp: purge write queue upon RST)
Reported-by: Timofey Titovets <nefelim4ag@gmail.com>
Reported-by: Yongjian Xu <yongjianchn@gmail.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Tested-by: Yongjian Xu <yongjianchn@gmail.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Mao Wenan <maowenan@huawei.com>
---
 include/net/tcp.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index d7047de952f0..1eda31f7f013 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1512,6 +1512,11 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 void tcp_fastopen_init_key_once(bool publish);
 #define TCP_FASTOPEN_KEY_LENGTH 16
 
+static inline void tcp_init_send_head(struct sock *sk)
+{
+	sk->sk_send_head = NULL;
+}
+
 /* Fastopen key context */
 struct tcp_fastopen_context {
 	struct crypto_cipher	*tfm;
@@ -1528,6 +1533,7 @@ static inline void tcp_write_queue_purge(struct sock *sk)
 		sk_wmem_free_skb(sk, skb);
 	sk_mem_reclaim(sk);
 	tcp_clear_all_retrans_hints(tcp_sk(sk));
+	tcp_init_send_head(sk);
 	inet_csk(sk)->icsk_backoff = 0;
 }
 
@@ -1589,11 +1595,6 @@ static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unli
 		tcp_sk(sk)->highest_sack = NULL;
 }
 
-static inline void tcp_init_send_head(struct sock *sk)
-{
-	sk->sk_send_head = NULL;
-}
-
 static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
 {
 	__skb_queue_tail(&sk->sk_write_queue, skb);
-- 
2.20.1


^ permalink raw reply related

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Allan W. Nielsen @ 2019-07-29 13:14 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: Horatiu Vultur, roopa, davem, bridge, netdev, linux-kernel
In-Reply-To: <95315f9e-0d31-2d34-ba50-11e1bbc1465c@cumulusnetworks.com>

The 07/29/2019 15:22, Nikolay Aleksandrov wrote:
> Yes, all of the multicast code is handled differently, it doesn't go through the fdb
> lookup or code at all. I don't see how you'll do a lookup in the fdb table with a
> multicast mac address, take a look at br_handle_frame_finish() and you'll notice
> that when a multicast dmac is detected then we use the bridge mcast code for lookups
> and forwarding.

Here is my thinking (needs much more elaboration, which will come if we do a
patch to test it out):

In br_pkt_type

Rename BR_PKT_MULTICAST to BR_PKT_MULTICAST_IP
Add a new type called BR_PKT_MULTICAST_L2

In br_handle_frame_finish

	if (is_multicast_ether_addr(dest)) {
		/* by definition the broadcast is also a multicast address */
		if (is_broadcast_ether_addr(dest)) {
			pkt_type = BR_PKT_BROADCAST;
			local_rcv = true;
		} else {
			pkt_type = BR_PKT_MULTICAST;
			if (br_multicast_rcv(br, p, skb, vid))
				goto drop;
		}
	}

Change the code above to detect if it is a BR_PKT_MULTICAST_IP or a
BR_PKT_MULTICAST_L2

In this section:

switch (pkt_type) {
....
}

if (dst) {
} else {
}

Add awareness of the BR_PKT_MULTICAST_L2 type, and allow it to do forwarding
according to the static entry if it is there.

> If you're trying to achieve Rx only on the bridge of these then
> why not just use Ido's tc suggestion or even the ip maddr add offload for each port ?
> 
> If you add a multicast mac in the fdb (currently allowed, but has no effect) and you
> use dev_mc_add() as suggested that'd just be a hack to pass it down and it is already
> possible to achieve via other methods, no need to go through the bridge.

Well, I wanted the SW bridge implementation to behave the same with and without
HW offload.

And also, I believe that it conceptually belongs to the MAC tables.

/Allan

^ permalink raw reply

* Re: [PATCH net-next v4 1/3] flow_offload: move tc indirect block to flow offload
From: Jiri Pirko @ 2019-07-29 13:13 UTC (permalink / raw)
  To: wenxu; +Cc: pablo, fw, jakub.kicinski, netfilter-devel, netdev
In-Reply-To: <c218d9bb-1da7-2ed6-d5b0-afddbe3d0bd7@ucloud.cn>

Mon, Jul 29, 2019 at 02:47:07PM CEST, wenxu@ucloud.cn wrote:
>
>在 2019/7/29 19:13, Jiri Pirko 写道:
>> Sun, Jul 28, 2019 at 08:52:47AM CEST, wenxu@ucloud.cn wrote:
>>> From: wenxu <wenxu@ucloud.cn>
>>>
>>> move tc indirect block to flow_offload and rename
>>> it to flow indirect block.The nf_tables can use the
>>> indr block architecture.
>>>
>>> Signed-off-by: wenxu <wenxu@ucloud.cn>
>>> ---
>>> v3: subsys_initcall for init_flow_indr_rhashtable
>>> v4: no change
>>>
>> [...]
>>
>>
>>> diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
>>> index 00b9aab..66f89bc 100644
>>> --- a/include/net/flow_offload.h
>>> +++ b/include/net/flow_offload.h
>>> @@ -4,6 +4,7 @@
>>> #include <linux/kernel.h>
>>> #include <linux/list.h>
>>> #include <net/flow_dissector.h>
>>> +#include <linux/rhashtable.h>
>>>
>>> struct flow_match {
>>> 	struct flow_dissector	*dissector;
>>> @@ -366,4 +367,42 @@ static inline void flow_block_init(struct flow_block *flow_block)
>>> 	INIT_LIST_HEAD(&flow_block->cb_list);
>>> }
>>>
>>> +typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv,
>>> +				      enum tc_setup_type type, void *type_data);
>>> +
>>> +struct flow_indr_block_cb {
>>> +	struct list_head list;
>>> +	void *cb_priv;
>>> +	flow_indr_block_bind_cb_t *cb;
>>> +	void *cb_ident;
>>> +};
>> I don't understand why are you pushing this struct out of the c file to
>> the header. Please don't.
>>
>>
>>> +
>>> +typedef void flow_indr_block_ing_cmd_t(struct net_device *dev,
>>> +				       struct flow_block *flow_block,
>>> +				       struct flow_indr_block_cb *indr_block_cb,
>>> +				       enum flow_block_command command);
>>> +
>>> +struct flow_indr_block_dev {
>>> +	struct rhash_head ht_node;
>>> +	struct net_device *dev;
>>> +	unsigned int refcnt;
>>> +	struct list_head cb_list;
>>> +	flow_indr_block_ing_cmd_t *ing_cmd_cb;
>>> +	struct flow_block *flow_block;
>> I don't understand why are you pushing this struct out of the c file to
>> the header. Please don't.
>
>the flow_indr_block_dev and indr_block_cb in the h file used for the function

You don't need it, same as before. Please don't expose this struct.


>
>tc_indr_block_ing_cmd in cls_api.c
>
>>> -static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
>>> -				  struct tc_indr_block_cb *indr_block_cb,
>>> +static void tc_indr_block_ing_cmd(struct net_device *dev,
>> I don't understand why you change struct tc_indr_block_dev * to
>> struct net_device * here. If you want to do that, please do that in a
>> separate patch, not it this one where only "the move" should happen.


Did you see the rest of my comments???



>>

^ permalink raw reply

* Re: [PATCH v4 0/5] vsock/virtio: optimizations to increase the throughput
From: Stefan Hajnoczi @ 2019-07-29 13:12 UTC (permalink / raw)
  To: Stefano Garzarella
  Cc: Stefan Hajnoczi, kvm, Michael S. Tsirkin, netdev, linux-kernel,
	virtualization, David S. Miller
In-Reply-To: <20190722091434.tzf7lxw3tvrs5w5v@steredhat>

[-- Attachment #1: Type: text/plain, Size: 1683 bytes --]

On Mon, Jul 22, 2019 at 11:14:34AM +0200, Stefano Garzarella wrote:
> On Mon, Jul 22, 2019 at 10:08:35AM +0100, Stefan Hajnoczi wrote:
> > On Wed, Jul 17, 2019 at 01:30:25PM +0200, Stefano Garzarella wrote:
> > > This series tries to increase the throughput of virtio-vsock with slight
> > > changes.
> > > While I was testing the v2 of this series I discovered an huge use of memory,
> > > so I added patch 1 to mitigate this issue. I put it in this series in order
> > > to better track the performance trends.
> > > 
> > > v4:
> > > - rebased all patches on current master (conflicts is Patch 4)
> > > - Patch 1: added Stefan's R-b
> > > - Patch 3: removed lock when buf_alloc is written [David];
> > >            moved this patch after "vsock/virtio: reduce credit update messages"
> > >            to make it clearer
> > > - Patch 4: vhost_exceeds_weight() is recently introduced, so I've solved some
> > >            conflicts
> > 
> > Stefano: Do you want to continue experimenting before we merge this
> > patch series?  The code looks functionally correct and the performance
> > increases, so I'm happy for it to be merged.
> 
> I think we can merge this series.
> 
> I'll continue to do other experiments (e.g. removing TX workers, allocating
> pages, etc.) but I think these changes are prerequisites for the other patches,
> so we can merge them.
> 
> Thank you very much for the reviews!

All patches have been reviewed by now.  Have an Ack for good measure:

Acked-by: Stefan Hajnoczi <stefanha@redhat.com>

The topics discussed in sub-threads relate to longer-term optimization
work that doesn't block this series.  Please merge.

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* Re: [PATCH] tcp: add new tcp_mtu_probe_floor sysctl
From: Neal Cardwell @ 2019-07-29 13:12 UTC (permalink / raw)
  To: Josh Hunt; +Cc: Eric Dumazet, netdev, David Miller
In-Reply-To: <5a054ca5-4077-5e91-69d5-f1add8dc8bfa@akamai.com>

On Sun, Jul 28, 2019 at 5:14 PM Josh Hunt <johunt@akamai.com> wrote:
>
> On 7/28/19 6:54 AM, Eric Dumazet wrote:
> > On Sun, Jul 28, 2019 at 1:21 AM Josh Hunt <johunt@akamai.com> wrote:
> >>
> >> On 7/27/19 12:05 AM, Eric Dumazet wrote:
> >>> On Sat, Jul 27, 2019 at 4:23 AM Josh Hunt <johunt@akamai.com> wrote:
> >>>>
> >>>> The current implementation of TCP MTU probing can considerably
> >>>> underestimate the MTU on lossy connections allowing the MSS to get down to
> >>>> 48. We have found that in almost all of these cases on our networks these
> >>>> paths can handle much larger MTUs meaning the connections are being
> >>>> artificially limited. Even though TCP MTU probing can raise the MSS back up
> >>>> we have seen this not to be the case causing connections to be "stuck" with
> >>>> an MSS of 48 when heavy loss is present.
> >>>>
> >>>> Prior to pushing out this change we could not keep TCP MTU probing enabled
> >>>> b/c of the above reasons. Now with a reasonble floor set we've had it
> >>>> enabled for the past 6 months.
> >>>
> >>> And what reasonable value have you used ???
> >>
> >> Reasonable for some may not be reasonable for others hence the new
> >> sysctl :) We're currently running with a fairly high value based off of
> >> the v6 min MTU minus headers and options, etc. We went conservative with
> >> our setting initially as it seemed a reasonable first step when
> >> re-enabling TCP MTU probing since with no configurable floor we saw a #
> >> of cases where connections were using severely reduced mss b/c of loss
> >> and not b/c of actual path restriction. I plan to reevaluate the setting
> >> at some point, but since the probing method is still the same it means
> >> the same clients who got stuck with mss of 48 before will land at
> >> whatever floor we set. Looking forward we are interested in trying to
> >> improve TCP MTU probing so it does not penalize clients like this.
> >>
> >> A suggestion for a more reasonable floor default would be 512, which is
> >> the same as the min_pmtu. Given both mechanisms are trying to achieve
> >> the same goal it seems like they should have a similar min/floor.
> >>
> >>>
> >>>>
> >>>> The new sysctl will still default to TCP_MIN_SND_MSS (48), but gives
> >>>> administrators the ability to control the floor of MSS probing.
> >>>>
> >>>> Signed-off-by: Josh Hunt <johunt@akamai.com>
> >>>> ---
> >>>>    Documentation/networking/ip-sysctl.txt | 6 ++++++
> >>>>    include/net/netns/ipv4.h               | 1 +
> >>>>    net/ipv4/sysctl_net_ipv4.c             | 9 +++++++++
> >>>>    net/ipv4/tcp_ipv4.c                    | 1 +
> >>>>    net/ipv4/tcp_timer.c                   | 2 +-
> >>>>    5 files changed, 18 insertions(+), 1 deletion(-)
> >>>>
> >>>> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> >>>> index df33674799b5..49e95f438ed7 100644
> >>>> --- a/Documentation/networking/ip-sysctl.txt
> >>>> +++ b/Documentation/networking/ip-sysctl.txt
> >>>> @@ -256,6 +256,12 @@ tcp_base_mss - INTEGER
> >>>>           Path MTU discovery (MTU probing).  If MTU probing is enabled,
> >>>>           this is the initial MSS used by the connection.
> >>>>
> >>>> +tcp_mtu_probe_floor - INTEGER
> >>>> +       If MTU probing is enabled this caps the minimum MSS used for search_low
> >>>> +       for the connection.
> >>>> +
> >>>> +       Default : 48
> >>>> +
> >>>>    tcp_min_snd_mss - INTEGER
> >>>>           TCP SYN and SYNACK messages usually advertise an ADVMSS option,
> >>>>           as described in RFC 1122 and RFC 6691.
> >>>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> >>>> index bc24a8ec1ce5..c0c0791b1912 100644
> >>>> --- a/include/net/netns/ipv4.h
> >>>> +++ b/include/net/netns/ipv4.h
> >>>> @@ -116,6 +116,7 @@ struct netns_ipv4 {
> >>>>           int sysctl_tcp_l3mdev_accept;
> >>>>    #endif
> >>>>           int sysctl_tcp_mtu_probing;
> >>>> +       int sysctl_tcp_mtu_probe_floor;
> >>>>           int sysctl_tcp_base_mss;
> >>>>           int sysctl_tcp_min_snd_mss;
> >>>>           int sysctl_tcp_probe_threshold;
> >>>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> >>>> index 0b980e841927..59ded25acd04 100644
> >>>> --- a/net/ipv4/sysctl_net_ipv4.c
> >>>> +++ b/net/ipv4/sysctl_net_ipv4.c
> >>>> @@ -820,6 +820,15 @@ static struct ctl_table ipv4_net_table[] = {
> >>>>                   .extra2         = &tcp_min_snd_mss_max,
> >>>>           },
> >>>>           {
> >>>> +               .procname       = "tcp_mtu_probe_floor",
> >>>> +               .data           = &init_net.ipv4.sysctl_tcp_mtu_probe_floor,
> >>>> +               .maxlen         = sizeof(int),
> >>>> +               .mode           = 0644,
> >>>> +               .proc_handler   = proc_dointvec_minmax,
> >>>> +               .extra1         = &tcp_min_snd_mss_min,
> >>>> +               .extra2         = &tcp_min_snd_mss_max,
> >>>> +       },
> >>>> +       {
> >>>>                   .procname       = "tcp_probe_threshold",
> >>>>                   .data           = &init_net.ipv4.sysctl_tcp_probe_threshold,
> >>>>                   .maxlen         = sizeof(int),
> >>>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> >>>> index d57641cb3477..e0a372676329 100644
> >>>> --- a/net/ipv4/tcp_ipv4.c
> >>>> +++ b/net/ipv4/tcp_ipv4.c
> >>>> @@ -2637,6 +2637,7 @@ static int __net_init tcp_sk_init(struct net *net)
> >>>>           net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
> >>>>           net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
> >>>>           net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
> >>>> +       net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
> >>>>
> >>>>           net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
> >>>>           net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
> >>>> diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
> >>>> index c801cd37cc2a..dbd9d2d0ee63 100644
> >>>> --- a/net/ipv4/tcp_timer.c
> >>>> +++ b/net/ipv4/tcp_timer.c
> >>>> @@ -154,7 +154,7 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
> >>>>           } else {
> >>>>                   mss = tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low) >> 1;
> >>>>                   mss = min(net->ipv4.sysctl_tcp_base_mss, mss);
> >>>> -               mss = max(mss, 68 - tcp_sk(sk)->tcp_header_len);
> >>>> +               mss = max(mss, net->ipv4.sysctl_tcp_mtu_probe_floor);
> >>>>                   mss = max(mss, net->ipv4.sysctl_tcp_min_snd_mss);
> >>>>                   icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
> >>>>           }
> >>>
> >>>
> >>> Existing sysctl should be enough ?
> >>
> >> I don't think so. Changing tcp_min_snd_mss could impact clients that
> >> really want/need a small mss. When you added the new sysctl I tried to
> >> analyze the mss values we're seeing to understand what we could possibly
> >> raise it to. While not a huge amount, we see more clients than I
> >> expected announcing mss values in the 180-512 range. Given that I would
> >> not feel comfortable setting tcp_min_snd_mss to say 512 as I suggested
> >> above.
> >
> > If these clients need mss values in 180-512 ranges, how MTU probing
> > would work for them,
> > if you set a floor to 512 ?
>
> First, we already seem to be fine with ignoring these paths with ICMP
> based PMTU discovery b/c of our min_pmtu default of 512 and that is
> configurable. Second by adding this sysctl we're giving administrators
> the choice to decide if they'd like to attempt to support these very
> very small # of paths which may be below 512 (MSS <= 512 does not mean
> MTU <= 512) or cover themselves by being able to raise the floor to not
> penalize clients who may be on very lossy networks.
>
> >
> > Are we sure the intent of tcp_base_mss was not to act as a floor ?
>
> My understanding is that tcp_base_mss is meant to be the initial value
> of search_low (as per Docs). Then in RFC 4821 [1] Sections 7.2, shows
> search_low should be configurable, and 7.7 we see that in response to
> successive black hole detection search_low should be halved. So I don't
> think it was meant to be a floor, but just the initial search_low param.

That matches my reading of the RFC and code as well. But in that case
IMHO an additional commit should fix this comment to reflect the fact
that TCP_BASE_MSS is the initial value, rather than a floor:

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 42728239cdbe..05575ac70333 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -75,7 +75,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 /* Minimal accepted MSS. It is (60+60+8) - (20+20). */
 #define TCP_MIN_MSS            88U

-/* The least MTU to use for probing */
+/* The initial MTU to use for probing */
 #define TCP_BASE_MSS           1024

 /* probing interval, default to 10 minutes as per RFC4821 */

neal

^ permalink raw reply related

* [PATCH ethtool] gitignore: ignore vim swapfiles and patches
From: Michal Kubecek @ 2019-07-29 13:10 UTC (permalink / raw)
  To: John W. Linville; +Cc: netdev

The .*.swp files are created by vim to hold the undo/redo log. Add them to
.gitignore to prevent "git status" or "git gui" from showing them whenever
some file is open in editor.

Add also *.patch to hide patches created by e.g. "git format-patch".

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index f1165a2c9037..c4df588c37ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,6 @@ autom4te.cache
 .deps
 test-*.log
 test-*.trs
+
+.*.swp
+*.patch
-- 
2.22.0

^ permalink raw reply related

* Re: [PATCH] net: stmmac: manage errors returned by of_get_mac_address()
From: Neil Armstrong @ 2019-07-29 13:03 UTC (permalink / raw)
  To: Martin Blumenstingl, peppe.cavallaro, alexandre.torgue, joabreu,
	davem, netdev
  Cc: linux-amlogic, linux-kernel, linux-arm-kernel
In-Reply-To: <20190727192137.27881-1-martin.blumenstingl@googlemail.com>

On 27/07/2019 21:21, Martin Blumenstingl wrote:
> Commit d01f449c008a ("of_net: add NVMEM support to of_get_mac_address")
> added support for reading the MAC address from an nvmem-cell. This
> required changing the logic to return an error pointer upon failure.
> 
> If stmmac is loaded before the nvmem provider driver then
> of_get_mac_address() returns an error pointer with -EPROBE_DEFER.
> 
> Propagate this error so the stmmac driver will be probed again after the
> nvmem provider driver is loaded.
> Default to a random generated MAC address in case of any other error,
> instead of using the error pointer as MAC address.
> 
> Fixes: d01f449c008a ("of_net: add NVMEM support to of_get_mac_address")
> Signed-off-by: Martin Blumenstingl <martin.blumenstingl@googlemail.com>
> ---
>  drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c | 7 +++++++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
> index 73fc2524372e..154daf4d1072 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c
> @@ -370,6 +370,13 @@ stmmac_probe_config_dt(struct platform_device *pdev, const char **mac)
>  		return ERR_PTR(-ENOMEM);
>  
>  	*mac = of_get_mac_address(np);
> +	if (IS_ERR(*mac)) {
> +		if (PTR_ERR(*mac) == -EPROBE_DEFER)
> +			return ERR_CAST(*mac);
> +
> +		*mac = NULL;
> +	}
> +
>  	plat->interface = of_get_phy_mode(np);
>  
>  	/* Some wrapper drivers still rely on phy_node. Let's save it while
> 

Reviewed-by: Neil Armstrong <narmstrong@baylibre.com>

^ permalink raw reply

* RE: [PATCH v6 rdma-next 1/6] RDMA/core: Create mmap database and cookie helper functions
From: Michal Kalderon @ 2019-07-29 12:58 UTC (permalink / raw)
  To: Jason Gunthorpe, galpress@amazon.com
  Cc: Ariel Elior, dledford@redhat.com, galpress@amazon.com,
	linux-rdma@vger.kernel.org, davem@davemloft.net,
	netdev@vger.kernel.org
In-Reply-To: <20190725175540.GA18757@ziepe.ca>

> From: linux-rdma-owner@vger.kernel.org <linux-rdma-
> owner@vger.kernel.org> On Behalf Of Jason Gunthorpe
> 
> > +	xa_lock(&ucontext->mmap_xa);
> > +	if (check_add_overflow(ucontext->mmap_xa_page,
> > +			       (u32)(length >> PAGE_SHIFT),
> > +			       &next_mmap_page))
> > +		goto err_unlock;
> 
> I still don't like that this algorithm latches into a permanent failure when the
> xa_page wraps.
> 
> It seems worth spending a bit more time here to tidy this.. Keep using the
> mmap_xa_page scheme, but instead do something like
> 
> alloc_cyclic_range():
> 
> while () {
>    // Find first empty element in a cyclic way
>    xa_page_first = mmap_xa_page;
>    xa_find(xa, &xa_page_first, U32_MAX, XA_FREE_MARK)
> 
>    // Is there enough room to have the range?
>    if (check_add_overflow(xa_page_first, npages, &xa_page_end)) {
>       mmap_xa_page = 0;
>       continue;
>    }
> 
>    // See if the element before intersects
>    elm = xa_find(xa, &zero, xa_page_end, 0);
>    if (elm && intersects(xa_page_first, xa_page_last, elm->first, elm->last)) {
>       mmap_xa_page = elm->last + 1;
>       continue
>    }
> 
>    // xa_page_first -> xa_page_end should now be free
>    xa_insert(xa, xa_page_start, entry);
>    mmap_xa_page = xa_page_end + 1;
>    return xa_page_start;
> }
> 
> Approximately, please check it.
Gal & Jason, 

Coming back to the mmap_xa_page algorithm. I couldn't find some background on this. 
Why do you need the length to be represented in the mmap_xa_page ?  
Why not simply use xa_alloc_cyclic ( like in siw ) 
This is simply a key to a mmap object... 

Thanks,
Michal


^ permalink raw reply

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Nikolay Aleksandrov @ 2019-07-29 12:50 UTC (permalink / raw)
  To: Allan W. Nielsen
  Cc: Horatiu Vultur, roopa, davem, bridge, netdev, linux-kernel
In-Reply-To: <95315f9e-0d31-2d34-ba50-11e1bbc1465c@cumulusnetworks.com>

On 29/07/2019 15:22, Nikolay Aleksandrov wrote:
> Hi Allan,
> On 29/07/2019 15:14, Allan W. Nielsen wrote:
>> Hi Nikolay,
>>
>> First of all, as mentioned further down in this thread, I realized that our
>> implementation of the multicast floodmasks does not align with the existing SW
>> implementation. We will change this, such that all multicast packets goes to the
>> SW bridge.
>>
>> This changes things a bit, not that much.
>>
>> I actually think you summarized the issue we have (after changing to multicast
>> flood-masks) right here:
>>
>> The 07/26/2019 12:26, Nikolay Aleksandrov wrote:
>>>>> Actually you mentioned non-IP traffic, so the querier stuff is not a problem. This
>>>>> traffic will always be flooded by the bridge (and also a copy will be locally sent up).
>>>>> Thus only the flooding may need to be controlled.
>>
>> This seems to be exactly what we need.
>>
>> Assuming we have a SW bridge (br0) with 4 slave interfaces (eth0-3). We use this
>> on a network where we want to limit the flooding of frames with dmac
>> 01:21:6C:00:00:01 (which is non IP traffic) to eth0 and eth1.
>>
>> One way of doing this could potentially be to support the following command:
>>
>> bridge fdb add    01:21:6C:00:00:01 port eth0
>> bridge fdb append 01:21:6C:00:00:01 port eth1
>>

And the fdbs become linked lists ? So we'll increase the complexity for something
that is already supported by ACLs (e.g. tc) and also bridge per-port multicast
flood flag ?

I'm sorry but that doesn't sound good to me for a case which is very rare and
there are existing ways to solve without incurring performance hits or increasing
code complexity.

>> On 25/07/2019 16:06, Nikolay Aleksandrov wrote:
>>>>>>>>>  In general NLM_F_APPEND is only used in vxlan, the bridge does not
>>>>>>>>>  handle that flag at all.  FDB is only for *unicast*, nothing is joined
>>>>>>>>>  and no multicast should be used with fdbs. MDB is used for multicast
>>>>>>>>>  handling, but both of these are used for forwarding.
>> This is true, and this should have been addressed in the patch, we were too
>> focused on setting up the offload patch in the driver, and forgot to do the SW
>> implementation.
>>
>> Do you see any issues in supporting this flag, and updating the SW
>> forwarding in br_handle_frame_finish such that it can support/allow a FDB entry
>> to be a multicast?
>>
> 
> Yes, all of the multicast code is handled differently, it doesn't go through the fdb
> lookup or code at all. I don't see how you'll do a lookup in the fdb table with a
> multicast mac address, take a look at br_handle_frame_finish() and you'll notice
> that when a multicast dmac is detected then we use the bridge mcast code for lookups
> and forwarding. If you're trying to achieve Rx only on the bridge of these then
> why not just use Ido's tc suggestion or even the ip maddr add offload for each port ?
> 
> If you add a multicast mac in the fdb (currently allowed, but has no effect) and you
> use dev_mc_add() as suggested that'd just be a hack to pass it down and it is already
> possible to achieve via other methods, no need to go through the bridge.
> 
>> /Allan
>>
> 


^ permalink raw reply

* Re: [PATCH net-next v4 1/3] flow_offload: move tc indirect block to flow offload
From: wenxu @ 2019-07-29 12:47 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: pablo, fw, jakub.kicinski, netfilter-devel, netdev
In-Reply-To: <20190729111350.GE2211@nanopsycho>


在 2019/7/29 19:13, Jiri Pirko 写道:
> Sun, Jul 28, 2019 at 08:52:47AM CEST, wenxu@ucloud.cn wrote:
>> From: wenxu <wenxu@ucloud.cn>
>>
>> move tc indirect block to flow_offload and rename
>> it to flow indirect block. The nf_tables can use the
>> indr block architecture.
>>
>> Signed-off-by: wenxu <wenxu@ucloud.cn>
>> ---
>> v3: subsys_initcall for init_flow_indr_rhashtable
>> v4: no change
>>
> [...]
>
>
>> diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
>> index 00b9aab..66f89bc 100644
>> --- a/include/net/flow_offload.h
>> +++ b/include/net/flow_offload.h
>> @@ -4,6 +4,7 @@
>> #include <linux/kernel.h>
>> #include <linux/list.h>
>> #include <net/flow_dissector.h>
>> +#include <linux/rhashtable.h>
>>
>> struct flow_match {
>> 	struct flow_dissector	*dissector;
>> @@ -366,4 +367,42 @@ static inline void flow_block_init(struct flow_block *flow_block)
>> 	INIT_LIST_HEAD(&flow_block->cb_list);
>> }
>>
>> +typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv,
>> +				      enum tc_setup_type type, void *type_data);
>> +
>> +struct flow_indr_block_cb {
>> +	struct list_head list;
>> +	void *cb_priv;
>> +	flow_indr_block_bind_cb_t *cb;
>> +	void *cb_ident;
>> +};
> I don't understand why are you pushing this struct out of the c file to
> the header. Please don't.
>
>
>> +
>> +typedef void flow_indr_block_ing_cmd_t(struct net_device *dev,
>> +				       struct flow_block *flow_block,
>> +				       struct flow_indr_block_cb *indr_block_cb,
>> +				       enum flow_block_command command);
>> +
>> +struct flow_indr_block_dev {
>> +	struct rhash_head ht_node;
>> +	struct net_device *dev;
>> +	unsigned int refcnt;
>> +	struct list_head cb_list;
>> +	flow_indr_block_ing_cmd_t *ing_cmd_cb;
>> +	struct flow_block *flow_block;
> I don't understand why are you pushing this struct out of the c file to
> the header. Please don't.

the flow_indr_block_dev and indr_block_cb in the h file used for the function

tc_indr_block_ing_cmd in cls_api.c

>> -static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
>> -				  struct tc_indr_block_cb *indr_block_cb,
>> +static void tc_indr_block_ing_cmd(struct net_device *dev,
> I don't understand why you change struct tc_indr_block_dev * to
> struct net_device * here. If you want to do that, please do that in a
> separate patch, not in this one where only "the move" should happen.
>

^ permalink raw reply

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Allan W. Nielsen @ 2019-07-29 12:43 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: Andrew Lunn, Horatiu Vultur, Nikolay Aleksandrov, roopa, davem,
	bridge, netdev, linux-kernel
In-Reply-To: <20190729060923.GA16938@splinter>

Hi Ido,

The 07/29/2019 09:09, Ido Schimmel wrote:
> External E-Mail
> 
> 
> On Sun, Jul 28, 2019 at 09:15:59PM +0200, Allan W. Nielsen wrote:
> > If we assume that the SwitchDev driver implemented such that all multicast
> > traffic goes to the CPU, then we should really have a way to install a HW
> > offload path in the silicon, such that these packets does not go to the CPU (as
> > they are known not to be use full, and a frame every 3 us is a significant load
> > on small DMA connections and CPU resources).
> > 
> > If we assume that the SwitchDev driver implemented such that only "needed"
> > multicast packets goes to the CPU, then we need a way to get these packets in
> > case we want to implement the DLR protocol.
> 
> I'm not familiar with the HW you're working with, so the below might not
> be relevant.
> 
> In case you don't want to send all multicast traffic to the CPU (I'll
> refer to it later), you can install an ingress tc filter that traps to
> the CPU the packets you do want to receive. Something like:
> 
> # tc qdisc add dev swp1 clsact
> # tc filter add dev swp1 pref 1 ingress flower skip_sw dst_mac \
> 	01:21:6C:00:00:01 action trap
I have actually been looking at this, and it may be an idea to go down this road.
But so far we have chosen not to for the following reasons:
- It is not only about trapping traffic to the CPU, we also need the capability
  to limit the flooding on the front ports.
- In our case (the silicon), this feature really belongs to the MAC-table, which
  is why we did prefer to do it via the FDB entries.
  - But the HW does have TCAM resources, and we are planning on exposing these
    resources via the tc-flower interface. It is just that we have more MAC
    table resources than TCAM resources, which is another argument for using the
    MAC table.

> If your HW supports sharing the same filter among multiple ports, then
> you can install your filter in a tc shared block and bind multiple ports
> to it.
It does, thanks for making us aware of this optimization option.

> Another option is to always send a *copy* of multicast packets to the
> CPU, but make sure the HW uses a policer that prevents the CPU from
> being overwhelmed. To avoid packets being forwarded twice (by HW and
> SW), you will need to mark such packets in your driver with
> 'skb->offload_fwd_mark = 1'.
Understood

> Now, in case user wants to allow the CPU to receive certain packets at a
> higher rate, a tc filter can be used. It will be identical to the filter
> I mentioned earlier, but with a 'police' action chained before 'trap'.
I see.

> I don't think this is currently supported by any driver, but I believe
> it's the right way to go: By default the CPU receives all the traffic it
> should receive and user can fine-tune it using ACLs.
If all the frames goes to the CPU, then how can I fine-tune frames not to go to
the CPU?? I can do a TRAP (to get it to the CPU) a DROP (to drop it before
forwarding), but how can I forward a multicast packet, but prevent it from going
to the CPU?

I have seen that the mirror command can do re-direction, but not to a list of
ports...

All in all, thanks a lot for the suggestions, but to begin with I think we will
explore the MAC table option a bit more. But we will get back to TC to support
the ACL functions.

/Allan



^ permalink raw reply

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Nikolay Aleksandrov @ 2019-07-29 12:22 UTC (permalink / raw)
  To: Allan W. Nielsen
  Cc: Horatiu Vultur, roopa, davem, bridge, netdev, linux-kernel
In-Reply-To: <20190729121409.wa47uelw5f6l4vs4@lx-anielsen.microsemi.net>

Hi Allan,
On 29/07/2019 15:14, Allan W. Nielsen wrote:
> Hi Nikolay,
> 
> First of all, as mentioned further down in this thread, I realized that our
> implementation of the multicast floodmasks does not align with the existing SW
> implementation. We will change this, such that all multicast packets goes to the
> SW bridge.
> 
> This changes things a bit, not that much.
> 
> I actually think you summarized the issue we have (after changing to multicast
> flood-masks) right here:
> 
> The 07/26/2019 12:26, Nikolay Aleksandrov wrote:
>>>> Actually you mentioned non-IP traffic, so the querier stuff is not a problem. This
>>>> traffic will always be flooded by the bridge (and also a copy will be locally sent up).
>>>> Thus only the flooding may need to be controlled.
> 
> This seems to be exactly what we need.
> 
> Assuming we have a SW bridge (br0) with 4 slave interfaces (eth0-3). We use this
> on a network where we want to limit the flooding of frames with dmac
> 01:21:6C:00:00:01 (which is non IP traffic) to eth0 and eth1.
> 
> One way of doing this could potentially be to support the following command:
> 
> bridge fdb add    01:21:6C:00:00:01 port eth0
> bridge fdb append 01:21:6C:00:00:01 port eth1
> 
> On 25/07/2019 16:06, Nikolay Aleksandrov wrote:
>>>>>>>>  In general NLM_F_APPEND is only used in vxlan, the bridge does not
>>>>>>>>  handle that flag at all.  FDB is only for *unicast*, nothing is joined
>>>>>>>>  and no multicast should be used with fdbs. MDB is used for multicast
>>>>>>>>  handling, but both of these are used for forwarding.
> This is true, and this should have been addressed in the patch, we were too
> focused on setting up the offload patch in the driver, and forgot to do the SW
> implementation.
> 
> Do you see any issues in supporting this flag, and updating the SW
> forwarding in br_handle_frame_finish such that it can support/allow a FDB entry
> to be a multicast?
> 

Yes, all of the multicast code is handled differently, it doesn't go through the fdb
lookup or code at all. I don't see how you'll do a lookup in the fdb table with a
multicast mac address, take a look at br_handle_frame_finish() and you'll notice
that when a multicast dmac is detected then we use the bridge mcast code for lookups
and forwarding. If you're trying to achieve Rx only on the bridge of these then
why not just use Ido's tc suggestion or even the ip maddr add offload for each port ?

If you add a multicast mac in the fdb (currently allowed, but has no effect) and you
use dev_mc_add() as suggested that'd just be a hack to pass it down and it is already
possible to achieve via other methods, no need to go through the bridge.

> /Allan
> 

^ permalink raw reply

* [PATCH net-next] MAINTAINERS: Remove mailing-list entry for XDP (eXpress Data Path)
From: Jesper Dangaard Brouer @ 2019-07-29 12:16 UTC (permalink / raw)
  Cc: Jesper Dangaard Brouer, xdp-newbies, netdev, bpf, ast, daniel,
	davem, jakub.kicinski, john.fastabend

This removes the mailing list xdp-newbies@vger.kernel.org from the XDP
kernel maintainers entry.

Being in the kernel MAINTAINERS file successfully caused the list to
receive kbuild bot warnings, syzbot reports and sometimes developer
patches. The level of details in these messages, doesn't match the
target audience of the XDP-newbies list. This is based on a survey on
the mailing list, where 73% voted for removal from MAINTAINERS file.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 MAINTAINERS |    1 -
 1 file changed, 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9cc156c58f0c..45cb4237eddc 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -17560,7 +17560,6 @@ M:	Jakub Kicinski <jakub.kicinski@netronome.com>
 M:	Jesper Dangaard Brouer <hawk@kernel.org>
 M:	John Fastabend <john.fastabend@gmail.com>
 L:	netdev@vger.kernel.org
-L:	xdp-newbies@vger.kernel.org
 L:	bpf@vger.kernel.org
 S:	Supported
 F:	net/core/xdp.c

^ permalink raw reply related

* Re: [PATCH] net: bridge: Allow bridge to joing multicast groups
From: Allan W. Nielsen @ 2019-07-29 12:14 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: Horatiu Vultur, roopa, davem, bridge, netdev, linux-kernel
In-Reply-To: <b755f613-e6d8-a2e6-16cd-6f13ec0a6ddc@cumulusnetworks.com>

Hi Nikolay,

First of all, as mentioned further down in this thread, I realized that our
implementation of the multicast floodmasks does not align with the existing SW
implementation. We will change this, such that all multicast packets goes to the
SW bridge.

This changes things a bit, not that much.

I actually think you summarized the issue we have (after changing to multicast
flood-masks) right here:

The 07/26/2019 12:26, Nikolay Aleksandrov wrote:
> >> Actually you mentioned non-IP traffic, so the querier stuff is not a problem. This
> >> traffic will always be flooded by the bridge (and also a copy will be locally sent up).
> >> Thus only the flooding may need to be controlled.

This seems to be exactly what we need.

Assuming we have a SW bridge (br0) with 4 slave interfaces (eth0-3). We use this
on a network where we want to limit the flooding of frames with dmac
01:21:6C:00:00:01 (which is non IP traffic) to eth0 and eth1.

One way of doing this could potentially be to support the following command:

bridge fdb add    01:21:6C:00:00:01 port eth0
bridge fdb append 01:21:6C:00:00:01 port eth1

On 25/07/2019 16:06, Nikolay Aleksandrov wrote:
> >>>>>>  In general NLM_F_APPEND is only used in vxlan, the bridge does not
> >>>>>>  handle that flag at all.  FDB is only for *unicast*, nothing is joined
> >>>>>>  and no multicast should be used with fdbs. MDB is used for multicast
> >>>>>>  handling, but both of these are used for forwarding.
This is true, and this should have been addressed in the patch, we were too
focused on setting up the offload patch in the driver, and forgot to do the SW
implementation.

Do you see any issues in supporting this flag, and updating the SW
forwarding in br_handle_frame_finish such that it can support/allow a FDB entry
to be a multicast?

/Allan

^ permalink raw reply

* Re: [PATCH net-next 3/3] net: stmmac: Introducing support for Page Pool
From: Robin Murphy @ 2019-07-29 11:52 UTC (permalink / raw)
  To: Jose Abreu, Jon Hunter, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, linux-stm32@st-md-mailman.stormreply.com,
	linux-arm-kernel@lists.infradead.org, Catalin Marinas,
	Will Deacon
  Cc: Joao Pinto, Alexandre Torgue, Maxime Ripard, Chen-Yu Tsai,
	Maxime Coquelin, linux-tegra, Giuseppe Cavallaro,
	David S . Miller
In-Reply-To: <MN2PR12MB327997BDF2EA5CEE00F45AC3D3DD0@MN2PR12MB3279.namprd12.prod.outlook.com>

On 29/07/2019 12:29, Jose Abreu wrote:
> ++ Catalin, Will (ARM64 Maintainers)
> 
> From: Jon Hunter <jonathanh@nvidia.com>
> Date: Jul/29/2019, 11:55:18 (UTC+00:00)
> 
>>
>> On 29/07/2019 09:16, Jose Abreu wrote:
>>> From: Jose Abreu <joabreu@synopsys.com>
>>> Date: Jul/27/2019, 16:56:37 (UTC+00:00)
>>>
>>>> From: Jon Hunter <jonathanh@nvidia.com>
>>>> Date: Jul/26/2019, 15:11:00 (UTC+00:00)
>>>>
>>>>>
>>>>> On 25/07/2019 16:12, Jose Abreu wrote:
>>>>>> From: Jon Hunter <jonathanh@nvidia.com>
>>>>>> Date: Jul/25/2019, 15:25:59 (UTC+00:00)
>>>>>>
>>>>>>>
>>>>>>> On 25/07/2019 14:26, Jose Abreu wrote:
>>>>>>>
>>>>>>> ...
>>>>>>>
>>>>>>>> Well, I wasn't expecting that :/
>>>>>>>>
>>>>>>>> Per documentation of barriers I think we should set descriptor fields
>>>>>>>> and then barrier and finally ownership to HW so that remaining fields
>>>>>>>> are coherent before owner is set.
>>>>>>>>
>>>>>>>> Anyway, can you also add a dma_rmb() after the call to
>>>>>>>> stmmac_rx_status() ?
>>>>>>>
>>>>>>> Yes. I removed the debug print added the barrier, but that did not help.
>>>>>>
>>>>>> So, I was finally able to setup NFS using your replicated setup and I
>>>>>> can't see the issue :(
>>>>>>
>>>>>> The only difference I have from yours is that I'm using TCP in NFS
>>>>>> whilst you (I believe from the logs), use UDP.
>>>>>
>>>>> So I tried TCP by setting the kernel boot params to 'nfsvers=3' and
>>>>> 'proto=tcp' and this does appear to be more stable, but not 100% stable.
>>>>> It still appears to fail in the same place about 50% of the time.
>>>>>
>>>>>> You do have flow control active right ? And your HW FIFO size is >= 4k ?
>>>>>
>>>>> How can I verify if flow control is active?
>>>>
>>>> You can check it by dumping register MTL_RxQ_Operation_Mode (0xd30).
>>
>> Where would be the appropriate place to dump this? After probe? Maybe
>> best if you can share a code snippet of where to dump this.
>>
>>>> Can you also add IOMMU debug in file "drivers/iommu/iommu.c" ?
>>
>> You can find a boot log here:
>>
>> https://urldefense.proofpoint.com/v2/url?u=https-3A__paste.ubuntu.com_p_qtRqtYKHGF_&d=DwICaQ&c=DPL6_X_6JkXFx7AXWqB0tg&r=WHDsc6kcWAl4i96Vm5hJ_19IJiuxx_p_Rzo2g-uHDKw&m=NrxsR2etpZHGb7HkN4XdgaGmKM1XYyldihNPL6qVSv0&s=CMATEcHVoqZw4sIrNOXc7SFE_kV_5CO5EU21-yJez6c&e=
>>
>>> And, please try attached debug patch.
>>
>> With this patch it appears to boot fine. So far no issues seen.
> 
> Thank you for testing.
> 
> Hi Catalin and Will,
> 
> Sorry to add you in such a long thread but we are seeing a DMA issue
> with stmmac driver in an ARM64 platform with IOMMU enabled.
> 
> The issue seems to be solved when buffers allocation for DMA based
> transfers are *not* mapped with the DMA_ATTR_SKIP_CPU_SYNC flag *OR*
> when IOMMU is disabled.
> 
> Notice that after transfer is done we do use
> dma_sync_single_for_{cpu,device} and then we reuse *the same* page for
> another transfer.
> 
> Can you please comment on whether DMA_ATTR_SKIP_CPU_SYNC can not be used
> in ARM64 platforms with IOMMU ?

In terms of what they do, there should be no difference on arm64 between:

dma_map_page(..., dir);
...
dma_unmap_page(..., dir);

and:

dma_map_page_attrs(..., dir, DMA_ATTR_SKIP_CPU_SYNC);
dma_sync_single_for_device(..., dir);
...
dma_sync_single_for_cpu(..., dir);
dma_unmap_page_attrs(..., dir, DMA_ATTR_SKIP_CPU_SYNC);

provided that the first sync covers the whole buffer and any subsequent 
ones cover at least the parts of the buffer which may have changed. Plus 
for coherent hardware it's entirely moot either way.

Given Jon's previous findings, I would lean towards the idea that 
performing the extra (redundant) cache maintenance plus barrier in 
dma_unmap is mostly just perturbing timing in the same way as the debug 
print which also made things seem OK.

Robin.

^ permalink raw reply

* [PATCH] arcnet: arc-rimi: Mark expected switch fall-throughs
From: Gustavo A. R. Silva @ 2019-07-29 11:15 UTC (permalink / raw)
  To: Michael Grzeschik, David S. Miller
  Cc: netdev, linux-kernel, Gustavo A. R. Silva, Stephen Rothwell,
	Kees Cook

Mark switch cases where we are expecting to fall through.

This patch fixes the following warnings (Building: powerpc allyesconfig):

drivers/net/arcnet/arc-rimi.c: In function 'arcrimi_setup':
include/linux/printk.h:304:2: warning: this statement may fall through [-Wimplicit-fallthrough=]
  printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/arcnet/arc-rimi.c:365:3: note: in expansion of macro 'pr_err'
   pr_err("Too many arguments\n");
   ^~~~~~
drivers/net/arcnet/arc-rimi.c:366:2: note: here
  case 3:  /* Node ID */
  ^~~~
drivers/net/arcnet/arc-rimi.c:367:8: warning: this statement may fall through [-Wimplicit-fallthrough=]
   node = ints[3];
   ~~~~~^~~~~~~~~
drivers/net/arcnet/arc-rimi.c:368:2: note: here
  case 2:  /* IRQ */
  ^~~~
drivers/net/arcnet/arc-rimi.c:369:7: warning: this statement may fall through [-Wimplicit-fallthrough=]
   irq = ints[2];
   ~~~~^~~~~~~~~
drivers/net/arcnet/arc-rimi.c:370:2: note: here
  case 1:  /* IO address */
  ^~~~

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/arcnet/arc-rimi.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/arcnet/arc-rimi.c b/drivers/net/arcnet/arc-rimi.c
index 11c5bad95226..14a5fb378145 100644
--- a/drivers/net/arcnet/arc-rimi.c
+++ b/drivers/net/arcnet/arc-rimi.c
@@ -363,10 +363,13 @@ static int __init arcrimi_setup(char *s)
 	switch (ints[0]) {
 	default:		/* ERROR */
 		pr_err("Too many arguments\n");
+		/* Fall through */
 	case 3:		/* Node ID */
 		node = ints[3];
+		/* Fall through */
 	case 2:		/* IRQ */
 		irq = ints[2];
+		/* Fall through */
 	case 1:		/* IO address */
 		io = ints[1];
 	}
-- 
2.22.0


^ permalink raw reply related

* [PATCH] arcnet: com90io: Mark expected switch fall-throughs
From: Gustavo A. R. Silva @ 2019-07-29 11:13 UTC (permalink / raw)
  To: Michael Grzeschik, David S. Miller
  Cc: netdev, linux-kernel, Gustavo A. R. Silva, Stephen Rothwell,
	Kees Cook

Mark switch cases where we are expecting to fall through.

This patch fixes the following warnings (Building: powerpc allyesconfig):

drivers/net/arcnet/com90io.c: In function 'com90io_setup':
include/linux/printk.h:304:2: warning: this statement may fall through [-Wimplicit-fallthrough=]
  printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/arcnet/com90io.c:365:3: note: in expansion of macro 'pr_err'
   pr_err("Too many arguments\n");
   ^~~~~~
drivers/net/arcnet/com90io.c:366:2: note: here
  case 2:  /* IRQ */
  ^~~~
drivers/net/arcnet/com90io.c:367:7: warning: this statement may fall through [-Wimplicit-fallthrough=]
   irq = ints[2];
   ~~~~^~~~~~~~~
drivers/net/arcnet/com90io.c:368:2: note: here
  case 1:  /* IO address */
  ^~~~

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/arcnet/com90io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/arcnet/com90io.c b/drivers/net/arcnet/com90io.c
index 2c546013a980..186bbf87bc84 100644
--- a/drivers/net/arcnet/com90io.c
+++ b/drivers/net/arcnet/com90io.c
@@ -363,8 +363,10 @@ static int __init com90io_setup(char *s)
 	switch (ints[0]) {
 	default:		/* ERROR */
 		pr_err("Too many arguments\n");
+		/* Fall through */
 	case 2:		/* IRQ */
 		irq = ints[2];
+		/* Fall through */
 	case 1:		/* IO address */
 		io = ints[1];
 	}
-- 
2.22.0


^ permalink raw reply related

* [PATCH] arcnet: com90xx: Mark expected switch fall-throughs
From: Gustavo A. R. Silva @ 2019-07-29 11:09 UTC (permalink / raw)
  To: Michael Grzeschik, David S. Miller
  Cc: netdev, linux-kernel, Gustavo A. R. Silva, Stephen Rothwell,
	Kees Cook

Mark switch cases where we are expecting to fall through.

This patch fixes the following warnings (Building: powerpc allyesconfig):

drivers/net/arcnet/com90xx.c: In function 'com90xx_setup':
include/linux/printk.h:304:2: warning: this statement may fall through [-Wimplicit-fallthrough=]
  printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/arcnet/com90xx.c:695:3: note: in expansion of macro 'pr_err'
   pr_err("Too many arguments\n");
   ^~~~~~
drivers/net/arcnet/com90xx.c:696:2: note: here
  case 3:  /* Mem address */
  ^~~~
drivers/net/arcnet/com90xx.c:697:9: warning: this statement may fall through [-Wimplicit-fallthrough=]
   shmem = ints[3];
   ~~~~~~^~~~~~~~~
drivers/net/arcnet/com90xx.c:698:2: note: here
  case 2:  /* IRQ */
  ^~~~
drivers/net/arcnet/com90xx.c:699:7: warning: this statement may fall through [-Wimplicit-fallthrough=]
   irq = ints[2];
   ~~~~^~~~~~~~~
drivers/net/arcnet/com90xx.c:700:2: note: here
  case 1:  /* IO address */
  ^~~~

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Gustavo A. R. Silva <gustavo@embeddedor.com>
---
 drivers/net/arcnet/com90xx.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/arcnet/com90xx.c b/drivers/net/arcnet/com90xx.c
index ca4a57c30bf8..bd75d06ad7df 100644
--- a/drivers/net/arcnet/com90xx.c
+++ b/drivers/net/arcnet/com90xx.c
@@ -693,10 +693,13 @@ static int __init com90xx_setup(char *s)
 	switch (ints[0]) {
 	default:		/* ERROR */
 		pr_err("Too many arguments\n");
+		/* Fall through */
 	case 3:		/* Mem address */
 		shmem = ints[3];
+		/* Fall through */
 	case 2:		/* IRQ */
 		irq = ints[2];
+		/* Fall through */
 	case 1:		/* IO address */
 		io = ints[1];
 	}
-- 
2.22.0


^ permalink raw reply related

* RE: [PATCH net-next 3/3] net: stmmac: Introducing support for Page Pool
From: Jose Abreu @ 2019-07-29 11:29 UTC (permalink / raw)
  To: Jon Hunter, Jose Abreu, linux-kernel@vger.kernel.org,
	netdev@vger.kernel.org, linux-stm32@st-md-mailman.stormreply.com,
	linux-arm-kernel@lists.infradead.org, Catalin Marinas,
	Will Deacon
  Cc: Joao Pinto, Alexandre Torgue, Maxime Ripard, Chen-Yu Tsai,
	Maxime Coquelin, linux-tegra, Giuseppe Cavallaro, Robin Murphy,
	David S . Miller
In-Reply-To: <b99b1e49-0cbc-2c66-6325-50fa6f263d91@nvidia.com>

++ Catalin, Will (ARM64 Maintainers)

From: Jon Hunter <jonathanh@nvidia.com>
Date: Jul/29/2019, 11:55:18 (UTC+00:00)

> 
> On 29/07/2019 09:16, Jose Abreu wrote:
> > From: Jose Abreu <joabreu@synopsys.com>
> > Date: Jul/27/2019, 16:56:37 (UTC+00:00)
> > 
> >> From: Jon Hunter <jonathanh@nvidia.com>
> >> Date: Jul/26/2019, 15:11:00 (UTC+00:00)
> >>
> >>>
> >>> On 25/07/2019 16:12, Jose Abreu wrote:
> >>>> From: Jon Hunter <jonathanh@nvidia.com>
> >>>> Date: Jul/25/2019, 15:25:59 (UTC+00:00)
> >>>>
> >>>>>
> >>>>> On 25/07/2019 14:26, Jose Abreu wrote:
> >>>>>
> >>>>> ...
> >>>>>
> >>>>>> Well, I wasn't expecting that :/
> >>>>>>
> >>>>>> Per documentation of barriers I think we should set descriptor fields 
> >>>>>> and then barrier and finally ownership to HW so that remaining fields 
> >>>>>> are coherent before owner is set.
> >>>>>>
> >>>>>> Anyway, can you also add a dma_rmb() after the call to 
> >>>>>> stmmac_rx_status() ?
> >>>>>
> >>>>> Yes. I removed the debug print and added the barrier, but that did not help.
> >>>>
> >>>> So, I was finally able to setup NFS using your replicated setup and I 
> >>>> can't see the issue :(
> >>>>
> >>>> The only difference I have from yours is that I'm using TCP in NFS 
> >>>> whilst you (I believe from the logs), use UDP.
> >>>
> >>> So I tried TCP by setting the kernel boot params to 'nfsvers=3' and
> >>> 'proto=tcp' and this does appear to be more stable, but not 100% stable.
> >>> It still appears to fail in the same place about 50% of the time.
> >>>
> >>>> You do have flow control active right ? And your HW FIFO size is >= 4k ?
> >>>
> >>> How can I verify if flow control is active?
> >>
> >> You can check it by dumping register MTL_RxQ_Operation_Mode (0xd30).
> 
> Where would be the appropriate place to dump this? After probe? Maybe
> best if you can share a code snippet of where to dump this.
> 
> >> Can you also add IOMMU debug in file "drivers/iommu/iommu.c" ?
> 
> You can find a boot log here:
> 
> https://urldefense.proofpoint.com/v2/url?u=https-3A__paste.ubuntu.com_p_qtRqtYKHGF_&d=DwICaQ&c=DPL6_X_6JkXFx7AXWqB0tg&r=WHDsc6kcWAl4i96Vm5hJ_19IJiuxx_p_Rzo2g-uHDKw&m=NrxsR2etpZHGb7HkN4XdgaGmKM1XYyldihNPL6qVSv0&s=CMATEcHVoqZw4sIrNOXc7SFE_kV_5CO5EU21-yJez6c&e= 
> 
> > And, please try attached debug patch.
> 
> With this patch it appears to boot fine. So far no issues seen.

Thank you for testing.

Hi Catalin and Will,

Sorry to add you to such a long thread, but we are seeing a DMA issue 
with the stmmac driver on an ARM64 platform with the IOMMU enabled.

The issue seems to be solved when the buffers allocated for DMA-based 
transfers are *not* mapped with the DMA_ATTR_SKIP_CPU_SYNC flag *OR* 
when the IOMMU is disabled.

Note that after a transfer is done we do use 
dma_sync_single_for_{cpu,device} and then we reuse *the same* page for 
another transfer.

Can you please comment on whether DMA_ATTR_SKIP_CPU_SYNC cannot be used 
on ARM64 platforms with an IOMMU?

---
Thanks,
Jose Miguel Abreu

^ permalink raw reply

* tcan4x5x on a Raspberry Pi
From: FIXED-TERM Buecheler Konstantin (ETAS-SEC/ECT-Mu) @ 2019-07-29 11:19 UTC (permalink / raw)
  To: linux-can@vger.kernel.org, netdev@vger.kernel.org; +Cc: dmurphy@ti.com

Hi all, 

I am currently working on a project where I am trying to use the tcan4550 chip with a Raspberry PI 3B.
I am struggling to create a working device tree overlay file for the Raspberry Pi.
Has anyone here tried this already? I would appreciate any help. 

Thanks,
Konstantin

^ permalink raw reply

* Re: [PATCH net-next v4 1/3] flow_offload: move tc indirect block to flow offload
From: Jiri Pirko @ 2019-07-29 11:15 UTC (permalink / raw)
  To: wenxu; +Cc: pablo, fw, jakub.kicinski, netfilter-devel, netdev
In-Reply-To: <1564296769-32294-2-git-send-email-wenxu@ucloud.cn>

Sun, Jul 28, 2019 at 08:52:47AM CEST, wenxu@ucloud.cn wrote:
>From: wenxu <wenxu@ucloud.cn>
>
>move tc indirect block to flow_offload and rename

A sentence should start with a capital letter.


>it to flow indirect block.The nf_tables can use the

There should be a space between "." and the first letter of the next
sentence.


>indr block architecture.
>

[...]

^ permalink raw reply

* Re: [PATCH net-next v4 1/3] flow_offload: move tc indirect block to flow offload
From: Jiri Pirko @ 2019-07-29 11:13 UTC (permalink / raw)
  To: wenxu; +Cc: pablo, fw, jakub.kicinski, netfilter-devel, netdev
In-Reply-To: <1564296769-32294-2-git-send-email-wenxu@ucloud.cn>

Sun, Jul 28, 2019 at 08:52:47AM CEST, wenxu@ucloud.cn wrote:
>From: wenxu <wenxu@ucloud.cn>
>
>move tc indirect block to flow_offload and rename
>it to flow indirect block.The nf_tables can use the
>indr block architecture.
>
>Signed-off-by: wenxu <wenxu@ucloud.cn>
>---
>v3: subsys_initcall for init_flow_indr_rhashtable
>v4: no change
>

[...]


>diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h
>index 00b9aab..66f89bc 100644
>--- a/include/net/flow_offload.h
>+++ b/include/net/flow_offload.h
>@@ -4,6 +4,7 @@
> #include <linux/kernel.h>
> #include <linux/list.h>
> #include <net/flow_dissector.h>
>+#include <linux/rhashtable.h>
> 
> struct flow_match {
> 	struct flow_dissector	*dissector;
>@@ -366,4 +367,42 @@ static inline void flow_block_init(struct flow_block *flow_block)
> 	INIT_LIST_HEAD(&flow_block->cb_list);
> }
> 
>+typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv,
>+				      enum tc_setup_type type, void *type_data);
>+
>+struct flow_indr_block_cb {
>+	struct list_head list;
>+	void *cb_priv;
>+	flow_indr_block_bind_cb_t *cb;
>+	void *cb_ident;
>+};

I don't understand why you are pushing this struct out of the c file to
the header. Please don't.


>+
>+typedef void flow_indr_block_ing_cmd_t(struct net_device *dev,
>+				       struct flow_block *flow_block,
>+				       struct flow_indr_block_cb *indr_block_cb,
>+				       enum flow_block_command command);
>+
>+struct flow_indr_block_dev {
>+	struct rhash_head ht_node;
>+	struct net_device *dev;
>+	unsigned int refcnt;
>+	struct list_head cb_list;
>+	flow_indr_block_ing_cmd_t *ing_cmd_cb;
>+	struct flow_block *flow_block;

I don't understand why you are pushing this struct out of the c file to
the header. Please don't.


>+};
>+
>+struct flow_indr_block_dev *flow_indr_block_dev_lookup(struct net_device *dev);
>+
>+int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
>+				  flow_indr_block_bind_cb_t *cb, void *cb_ident);
>+
>+void __flow_indr_block_cb_unregister(struct net_device *dev,
>+				     flow_indr_block_bind_cb_t *cb, void *cb_ident);
>+
>+int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
>+				flow_indr_block_bind_cb_t *cb, void *cb_ident);
>+
>+void flow_indr_block_cb_unregister(struct net_device *dev,
>+				   flow_indr_block_bind_cb_t *cb, void *cb_ident);
>+
	
[...]

	
>+
>+int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv,
>+				  flow_indr_block_bind_cb_t *cb,
>+				  void *cb_ident)
>+{
>+	struct flow_indr_block_cb *indr_block_cb;
>+	struct flow_indr_block_dev *indr_dev;
>+	int err;
>+
>+	indr_dev = flow_indr_block_dev_get(dev);
>+	if (!indr_dev)
>+		return -ENOMEM;
>+
>+	indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident);
>+	err = PTR_ERR_OR_ZERO(indr_block_cb);
>+	if (err)
>+		goto err_dev_put;
>+
>+	if (indr_dev->ing_cmd_cb)
>+		indr_dev->ing_cmd_cb(indr_dev->dev, indr_dev->flow_block, indr_block_cb,

This line is over 80cols. Please run checkpatch script for your patch
and obey the warnings.


>+				     FLOW_BLOCK_BIND);
>+
>+	return 0;
>+
>+err_dev_put:
>+	flow_indr_block_dev_put(indr_dev);
>+	return err;
>+}
>+EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register);

[...]


> 
>-static void tc_indr_block_ing_cmd(struct tc_indr_block_dev *indr_dev,
>-				  struct tc_indr_block_cb *indr_block_cb,
>+static void tc_indr_block_ing_cmd(struct net_device *dev,

I don't understand why you change struct tc_indr_block_dev * to
struct net_device * here. If you want to do that, please do that in a
separate patch, not in this one where only "the move" should happen.


>+				  struct flow_block *flow_block,
>+				  struct flow_indr_block_cb *indr_block_cb,
> 				  enum flow_block_command command)
> {
>+	struct tcf_block *block = flow_block ?
>+				  container_of(flow_block,
>+					       struct tcf_block,
>+					       flow_block) : NULL;
> 	struct flow_block_offload bo = {
> 		.command	= command,
> 		.binder_type	= FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS,
>-		.net		= dev_net(indr_dev->dev),
>-		.block_shared	= tcf_block_non_null_shared(indr_dev->block),
>+		.net		= dev_net(dev),
>+		.block_shared	= tcf_block_non_null_shared(block),
> 	};
> 	INIT_LIST_HEAD(&bo.cb_list);
> 

[...]

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox