Netdev List
 help / color / mirror / Atom feed
* [PATCH AUTOSEL 4.20 037/105] selftests: forwarding: Add a test for VLAN deletion
From: Sasha Levin @ 2019-02-13  2:32 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Ido Schimmel, David S . Miller, Sasha Levin, netdev,
	linux-kselftest
In-Reply-To: <20190213023336.19019-1-sashal@kernel.org>

From: Ido Schimmel <idosch@mellanox.com>

[ Upstream commit 4fabf3bf93a194c7fa5288da3e0af37e4b943cf3 ]

Add a VLAN on a bridge port, delete it and make sure the PVID VLAN is
not affected.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 .../selftests/net/forwarding/bridge_vlan_aware.sh | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
index d8313d0438b7..04c6431b2bd8 100755
--- a/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
+++ b/tools/testing/selftests/net/forwarding/bridge_vlan_aware.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 # SPDX-License-Identifier: GPL-2.0
 
-ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding"
+ALL_TESTS="ping_ipv4 ping_ipv6 learning flooding vlan_deletion"
 NUM_NETIFS=4
 CHECK_TC="yes"
 source lib.sh
@@ -96,6 +96,19 @@ flooding()
 	flood_test $swp2 $h1 $h2
 }
 
+vlan_deletion()
+{
+	# Test that the deletion of a VLAN on a bridge port does not affect
+	# the PVID VLAN
+	log_info "Add and delete a VLAN on bridge port $swp1"
+
+	bridge vlan add vid 10 dev $swp1
+	bridge vlan del vid 10 dev $swp1
+
+	ping_ipv4
+	ping_ipv6
+}
+
 trap cleanup EXIT
 
 setup_prepare
-- 
2.19.1


^ permalink raw reply related

* [PATCH AUTOSEL 4.20 036/105] mlxsw: spectrum: Add VXLAN dependency for spectrum
From: Sasha Levin @ 2019-02-13  2:32 UTC (permalink / raw)
  To: linux-kernel, stable; +Cc: Ido Schimmel, David S . Miller, Sasha Levin, netdev
In-Reply-To: <20190213023336.19019-1-sashal@kernel.org>

From: Ido Schimmel <idosch@mellanox.com>

[ Upstream commit 143a8e038ac599ca73c6354c8af6a8fdeee9fa7d ]

When VXLAN is a loadable module, MLXSW_SPECTRUM must not be built-in:

drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c:2547: undefined
reference to `vxlan_fdb_find_uc'

Add Kconfig dependency to enforce usable configurations.

Fixes: 1231e04f5bba ("mlxsw: spectrum_switchdev: Add support for VxLAN encapsulation")
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Reported-by: kbuild test robot <lkp@intel.com>
Reviewed-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/net/ethernet/mellanox/mlxsw/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/Kconfig b/drivers/net/ethernet/mellanox/mlxsw/Kconfig
index 8a291eb36c64..7338c9bac4e6 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/Kconfig
+++ b/drivers/net/ethernet/mellanox/mlxsw/Kconfig
@@ -78,6 +78,7 @@ config MLXSW_SPECTRUM
 	depends on IPV6 || IPV6=n
 	depends on NET_IPGRE || NET_IPGRE=n
 	depends on IPV6_GRE || IPV6_GRE=n
+	depends on VXLAN || VXLAN=n
 	select GENERIC_ALLOCATOR
 	select PARMAN
 	select MLXFW
-- 
2.19.1


^ permalink raw reply related

* [PATCH AUTOSEL 4.20 035/105] mlxsw: spectrum_acl: Add cleanup after C-TCAM update error condition
From: Sasha Levin @ 2019-02-13  2:32 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Nir Dotan, Ido Schimmel, David S . Miller, Sasha Levin, netdev
In-Reply-To: <20190213023336.19019-1-sashal@kernel.org>

From: Nir Dotan <nird@mellanox.com>

[ Upstream commit ff0db43cd6c530ff944773ccf48ece55d32d0c22 ]

When writing to C-TCAM, mlxsw driver uses cregion->ops->entry_insert().
In case of C-TCAM HW insertion error, the opposite action should take
place.
Add error handling case in which the C-TCAM region entry is removed, by
calling cregion->ops->entry_remove().

Fixes: a0a777b9409f ("mlxsw: spectrum_acl: Start using A-TCAM")
Signed-off-by: Nir Dotan <nird@mellanox.com>
Reviewed-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 .../net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c   | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c
index e3c6fe8b1d40..1dcf152b2813 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_acl_ctcam.c
@@ -75,7 +75,15 @@ mlxsw_sp_acl_ctcam_region_entry_insert(struct mlxsw_sp *mlxsw_sp,
 	act_set = mlxsw_afa_block_first_set(rulei->act_block);
 	mlxsw_reg_ptce2_flex_action_set_memcpy_to(ptce2_pl, act_set);
 
-	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptce2), ptce2_pl);
+	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ptce2), ptce2_pl);
+	if (err)
+		goto err_ptce2_write;
+
+	return 0;
+
+err_ptce2_write:
+	cregion->ops->entry_remove(cregion, centry);
+	return err;
 }
 
 static void
-- 
2.19.1


^ permalink raw reply related

* Re: [bpf-next 1/2] tcp: replace SOCK_DEBUG() with tcp_stats()
From: Yafang Shao @ 2019-02-13  3:04 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Eric Dumazet, Eric Dumazet, Daniel Borkmann, Alexei Starovoitov,
	Yonghong Song, Lawrence Brakmo, David Miller, netdev, LKML,
	shaoyafang
In-Reply-To: <CAADnVQKhEshkWSDeYJ92pWnJjqawGSScBH+LU4iXsqwwxYjO9g@mail.gmail.com>

On Wed, Feb 13, 2019 at 10:49 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Feb 12, 2019 at 6:15 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > Do not add more debugging stuff unless you can demonstrate
> > they actually allowed you to find a real bug and that you sent a
> > public fix for it.
> >
> > Just adding "cool stuff" in TCP stack does not please me, it is only
> > more complexity for unproven gain.
>
> I agree.
> I don't see why this debugging of 'abnormal TCP' cannot be done
> with kprobes and tracepoints.

kprobe is not suitable, because we have to use the line number, that's
a trouble cross all kernel versions.
Regarding  tracepoints, I don't think it is a good idea to introduce
more and more tcp tracepoints and make them crowed as well.

> Instrumenting every tcp counter increment is overkill.

^ permalink raw reply

* [PATCH AUTOSEL 4.20 022/105] soc/fsl/qe: fix err handling of ucc_of_parse_tdm
From: Sasha Levin @ 2019-02-13  2:32 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Wen Yang, Julia Lawall, Zhao Qiang, David S . Miller, netdev,
	linuxppc-dev, Sasha Levin
In-Reply-To: <20190213023336.19019-1-sashal@kernel.org>

From: Wen Yang <wen.yang99@zte.com.cn>

[ Upstream commit 8d68100ab4ad92560a16a68b72e068613ac4d573 ]

Currently there are some issues with the ucc_of_parse_tdm function:
1, a possible null pointer dereference in ucc_of_parse_tdm,
detected by the semantic patch deref_null.cocci,
with the following warning:
drivers/soc/fsl/qe/qe_tdm.c:177:21-24: ERROR: pdev is NULL but dereferenced.
2, dev gets modified, so in any case that devm_iounmap() will fail
even when the new pdev is valid, because the iomap was done with a
 different pdev.
3, there is no driver bind with the "fsl,t1040-qe-si" or
"fsl,t1040-qe-siram" device. So allocating resources using devm_*()
with these devices won't provide a cleanup path for these resources
when the caller fails.

This patch fixes them.

Suggested-by: Li Yang <leoyang.li@nxp.com>
Suggested-by: Christophe LEROY <christophe.leroy@c-s.fr>
Signed-off-by: Wen Yang <wen.yang99@zte.com.cn>
Reviewed-by: Peng Hao <peng.hao2@zte.com.cn>
CC: Julia Lawall <julia.lawall@lip6.fr>
CC: Zhao Qiang <qiang.zhao@nxp.com>
CC: David S. Miller <davem@davemloft.net>
CC: netdev@vger.kernel.org
CC: linuxppc-dev@lists.ozlabs.org
CC: linux-kernel@vger.kernel.org
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/net/wan/fsl_ucc_hdlc.c | 62 +++++++++++++++++++++++++++++++++-
 drivers/soc/fsl/qe/qe_tdm.c    | 55 ------------------------------
 2 files changed, 61 insertions(+), 56 deletions(-)

diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index 4d6409605207..af13d8cf94ad 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -1049,6 +1049,54 @@ static const struct net_device_ops uhdlc_ops = {
 	.ndo_tx_timeout	= uhdlc_tx_timeout,
 };
 
+static int hdlc_map_iomem(char *name, int init_flag, void __iomem **ptr)
+{
+	struct device_node *np;
+	struct platform_device *pdev;
+	struct resource *res;
+	static int siram_init_flag;
+	int ret = 0;
+
+	np = of_find_compatible_node(NULL, NULL, name);
+	if (!np)
+		return -EINVAL;
+
+	pdev = of_find_device_by_node(np);
+	if (!pdev) {
+		pr_err("%pOFn: failed to lookup pdev\n", np);
+		of_node_put(np);
+		return -EINVAL;
+	}
+
+	of_node_put(np);
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		ret = -EINVAL;
+		goto error_put_device;
+	}
+	*ptr = ioremap(res->start, resource_size(res));
+	if (!*ptr) {
+		ret = -ENOMEM;
+		goto error_put_device;
+	}
+
+	/* We've remapped the addresses, and we don't need the device any
+	 * more, so we should release it.
+	 */
+	put_device(&pdev->dev);
+
+	if (init_flag && siram_init_flag == 0) {
+		memset_io(*ptr, 0, resource_size(res));
+		siram_init_flag = 1;
+	}
+	return  0;
+
+error_put_device:
+	put_device(&pdev->dev);
+
+	return ret;
+}
+
 static int ucc_hdlc_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
@@ -1143,6 +1191,15 @@ static int ucc_hdlc_probe(struct platform_device *pdev)
 		ret = ucc_of_parse_tdm(np, utdm, ut_info);
 		if (ret)
 			goto free_utdm;
+
+		ret = hdlc_map_iomem("fsl,t1040-qe-si", 0,
+				     (void __iomem **)&utdm->si_regs);
+		if (ret)
+			goto free_utdm;
+		ret = hdlc_map_iomem("fsl,t1040-qe-siram", 1,
+				     (void __iomem **)&utdm->siram);
+		if (ret)
+			goto unmap_si_regs;
 	}
 
 	if (of_property_read_u16(np, "fsl,hmask", &uhdlc_priv->hmask))
@@ -1151,7 +1208,7 @@ static int ucc_hdlc_probe(struct platform_device *pdev)
 	ret = uhdlc_init(uhdlc_priv);
 	if (ret) {
 		dev_err(&pdev->dev, "Failed to init uhdlc\n");
-		goto free_utdm;
+		goto undo_uhdlc_init;
 	}
 
 	dev = alloc_hdlcdev(uhdlc_priv);
@@ -1181,6 +1238,9 @@ static int ucc_hdlc_probe(struct platform_device *pdev)
 free_dev:
 	free_netdev(dev);
 undo_uhdlc_init:
+	iounmap(utdm->siram);
+unmap_si_regs:
+	iounmap(utdm->si_regs);
 free_utdm:
 	if (uhdlc_priv->tsa)
 		kfree(utdm);
diff --git a/drivers/soc/fsl/qe/qe_tdm.c b/drivers/soc/fsl/qe/qe_tdm.c
index f78c34647ca2..76480df195a8 100644
--- a/drivers/soc/fsl/qe/qe_tdm.c
+++ b/drivers/soc/fsl/qe/qe_tdm.c
@@ -44,10 +44,6 @@ int ucc_of_parse_tdm(struct device_node *np, struct ucc_tdm *utdm,
 	const char *sprop;
 	int ret = 0;
 	u32 val;
-	struct resource *res;
-	struct device_node *np2;
-	static int siram_init_flag;
-	struct platform_device *pdev;
 
 	sprop = of_get_property(np, "fsl,rx-sync-clock", NULL);
 	if (sprop) {
@@ -124,57 +120,6 @@ int ucc_of_parse_tdm(struct device_node *np, struct ucc_tdm *utdm,
 	utdm->siram_entry_id = val;
 
 	set_si_param(utdm, ut_info);
-
-	np2 = of_find_compatible_node(NULL, NULL, "fsl,t1040-qe-si");
-	if (!np2)
-		return -EINVAL;
-
-	pdev = of_find_device_by_node(np2);
-	if (!pdev) {
-		pr_err("%pOFn: failed to lookup pdev\n", np2);
-		of_node_put(np2);
-		return -EINVAL;
-	}
-
-	of_node_put(np2);
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	utdm->si_regs = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(utdm->si_regs)) {
-		ret = PTR_ERR(utdm->si_regs);
-		goto err_miss_siram_property;
-	}
-
-	np2 = of_find_compatible_node(NULL, NULL, "fsl,t1040-qe-siram");
-	if (!np2) {
-		ret = -EINVAL;
-		goto err_miss_siram_property;
-	}
-
-	pdev = of_find_device_by_node(np2);
-	if (!pdev) {
-		ret = -EINVAL;
-		pr_err("%pOFn: failed to lookup pdev\n", np2);
-		of_node_put(np2);
-		goto err_miss_siram_property;
-	}
-
-	of_node_put(np2);
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	utdm->siram = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(utdm->siram)) {
-		ret = PTR_ERR(utdm->siram);
-		goto err_miss_siram_property;
-	}
-
-	if (siram_init_flag == 0) {
-		memset_io(utdm->siram, 0,  resource_size(res));
-		siram_init_flag = 1;
-	}
-
-	return ret;
-
-err_miss_siram_property:
-	devm_iounmap(&pdev->dev, utdm->si_regs);
 	return ret;
 }
 EXPORT_SYMBOL(ucc_of_parse_tdm);
-- 
2.19.1


^ permalink raw reply related

* Re: mmotm 2019-02-12-15-37 uploaded (net/ipv4/devinit.c)
From: Randy Dunlap @ 2019-02-13  3:09 UTC (permalink / raw)
  To: akpm, broonie, mhocko, sfr, linux-next, linux-fsdevel, linux-mm,
	linux-kernel, mm-commits, netdev@vger.kernel.org
In-Reply-To: <20190212233743.mGzbg%akpm@linux-foundation.org>

On 2/12/19 3:37 PM, akpm@linux-foundation.org wrote:
> The mm-of-the-moment snapshot 2019-02-12-15-37 has been uploaded to
> 
>    http://www.ozlabs.org/~akpm/mmotm/
> 
> mmotm-readme.txt says
> 
> README for mm-of-the-moment:
> 
> http://www.ozlabs.org/~akpm/mmotm/
> 
> This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
> more than once a week.

on x86_64:

when CONFIG_SYSCTL is not enabled:

ld: net/ipv4/devinet.o: in function `devinet_init_net':
devinet.c:(.text+0x3ad): undefined reference to `sysctl_devconf_inherit_init_net'



-- 
~Randy

^ permalink raw reply

* RE: [PATCH] net: stmmac: Add SMC support for EMAC System Manager register
From: Ooi, Joyce @ 2019-02-13  3:22 UTC (permalink / raw)
  To: thor.thayer@linux.intel.com, Giuseppe Cavallaro, Alexandre Torgue,
	Jose Abreu, David S. Miller, Maxime Coquelin
  Cc: netdev@vger.kernel.org, linux-stm32@st-md-mailman.stormreply.com,
	linux-arm-kernel@lists.infradead.org,
	linux-kernel@vger.kernel.org, See, Chin Liang
In-Reply-To: <0c93846a-48ab-5cde-6d27-291606dd8c31@linux.intel.com>

> -----Original Message-----
> From: Thor Thayer [mailto:thor.thayer@linux.intel.com]
> Sent: Wednesday, February 13, 2019 4:53 AM
> To: Ooi, Joyce <joyce.ooi@intel.com>; Giuseppe Cavallaro
> <peppe.cavallaro@st.com>; Alexandre Torgue <alexandre.torgue@st.com>;
> Jose Abreu <joabreu@synopsys.com>; David S. Miller <davem@davemloft.net>;
> Maxime Coquelin <mcoquelin.stm32@gmail.com>
> Cc: netdev@vger.kernel.org; linux-stm32@st-md-mailman.stormreply.com;
> linux-arm-kernel@lists.infradead.org; linux-kernel@vger.kernel.org; See, Chin
> Liang <chin.liang.see@intel.com>
> Subject: Re: [PATCH] net: stmmac: Add SMC support for EMAC System Manager
> register
> 
> Hi Joyce,
> 
> On 2/12/19 10:24 AM, Ooi, Joyce wrote:
> > As there is restriction to access to EMAC System Manager registers in
> > the kernel for Intel Stratix10, the use of SMC calls are required and
> > added in dwmac-socfpga driver.
> >
> > Signed-off-by: Ooi, Joyce <joyce.ooi@intel.com>
> 
> <snip>
> 
> I have a pending patchset[1] that addresses this but can be used by other drivers
> as well.
> 
> [1] https://patchwork.kernel.org/cover/10612891/[] 
Oh, didn't realize your patchset. I guess my patch can be ignored then. Thanks!


^ permalink raw reply

* [PATCH net-next 0/2] uapi: Add a new header for time types
From: Deepa Dinamani @ 2019-02-13  3:26 UTC (permalink / raw)
  To: davem, linux-kernel; +Cc: netdev, willemb, tglx, arnd, y2038

The series aims at adding a new time header: time_types.h.  This header
is what will eventually hold all the uapi time types that we plan to
leave across the interfaces after the y2038 cleanup.

The series was discussed with Arnd Bergmann.

The second patch fixes the errqueue.h header, which has a dependency on
these types.

Note that there may be a trivial merge conflict with linux-next
c70a772fda11 ("y2038: remove struct definition redirects").

Deepa Dinamani (2):
  time: Add time_types.h
  errqueue.h: Include time_types.h

 include/uapi/linux/errqueue.h   |  1 +
 include/uapi/linux/time.h       | 36 +----------------------------
 include/uapi/linux/time_types.h | 40 +++++++++++++++++++++++++++++++++
 3 files changed, 42 insertions(+), 35 deletions(-)
 create mode 100644 include/uapi/linux/time_types.h

-- 
2.17.1


^ permalink raw reply

* [PATCH net-next 2/2] errqueue.h: Include time_types.h
From: Deepa Dinamani @ 2019-02-13  3:26 UTC (permalink / raw)
  To: davem, linux-kernel; +Cc: netdev, willemb, tglx, arnd, y2038
In-Reply-To: <20190213032604.2655-1-deepa.kernel@gmail.com>

Now that we have a separate header for struct __kernel_timespec,
include it directly without relying on userspace to do it.

Reported-by: Ran Rozenstein <ranro@mellanox.com>
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
---
 include/uapi/linux/errqueue.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h
index d955b9e32288..28491dac074b 100644
--- a/include/uapi/linux/errqueue.h
+++ b/include/uapi/linux/errqueue.h
@@ -3,6 +3,7 @@
 #define _UAPI_LINUX_ERRQUEUE_H
 
 #include <linux/types.h>
+#include <linux/time_types.h>
 
 struct sock_extended_err {
 	__u32	ee_errno;	
-- 
2.17.1


^ permalink raw reply related

* [PATCH net-next 1/2] time: Add time_types.h
From: Deepa Dinamani @ 2019-02-13  3:26 UTC (permalink / raw)
  To: davem, linux-kernel; +Cc: netdev, willemb, tglx, arnd, y2038
In-Reply-To: <20190213032604.2655-1-deepa.kernel@gmail.com>

sys/time.h is the mandated include for many time related
defines. However, linux/time.h overlaps sys/time.h
significantly and this makes including both from userspace
or one from the other impossible.

This also means that userspace can get away with including
sys/time.h whenever it needs linux/time.h and this is what's
been happening in the user world usually.

But, we have new data types that we plan to use in the uapi time
interfaces also defined in the linux/time.h. But, we are unable
to use these types when sys/time.h is included.

Hence, move the new types to a new header, time_types.h.
We intend to eventually have all the uapi defines that the kernel
uses defined in this header.
Note that the plan is to replace uapi interfaces with timeval to
use __kernel_old_timeval, timespec to use __kernel_old_timespec etc.

Reported-by: Ran Rozenstein <ranro@mellanox.com>
Fixes: 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW")
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
---
 include/uapi/linux/time.h       | 36 +----------------------------
 include/uapi/linux/time_types.h | 40 +++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+), 35 deletions(-)
 create mode 100644 include/uapi/linux/time_types.h

diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h
index b8ad1b86b942..958932effc5e 100644
--- a/include/uapi/linux/time.h
+++ b/include/uapi/linux/time.h
@@ -3,7 +3,7 @@
 #define _UAPI_LINUX_TIME_H
 
 #include <linux/types.h>
-
+#include <linux/time_types.h>
 
 #ifndef _STRUCT_TIMESPEC
 #define _STRUCT_TIMESPEC
@@ -23,7 +23,6 @@ struct timezone {
 	int	tz_dsttime;	/* type of dst correction */
 };
 
-
 /*
  * Names of the interval timers, and structure
  * defining a timer setting:
@@ -42,39 +41,6 @@ struct itimerval {
 	struct timeval it_value;	/* current value */
 };
 
-#ifndef __kernel_timespec
-struct __kernel_timespec {
-	__kernel_time64_t       tv_sec;                 /* seconds */
-	long long               tv_nsec;                /* nanoseconds */
-};
-#endif
-
-#ifndef __kernel_itimerspec
-struct __kernel_itimerspec {
-	struct __kernel_timespec it_interval;    /* timer period */
-	struct __kernel_timespec it_value;       /* timer expiration */
-};
-#endif
-
-/*
- * legacy timeval structure, only embedded in structures that
- * traditionally used 'timeval' to pass time intervals (not absolute
- * times). Do not add new users. If user space fails to compile
- * here, this is probably because it is not y2038 safe and needs to
- * be changed to use another interface.
- */
-#ifndef __kernel_old_timeval
-struct __kernel_old_timeval {
-	__kernel_long_t tv_sec;
-	__kernel_long_t tv_usec;
-};
-#endif
-
-struct __kernel_sock_timeval {
-	__s64 tv_sec;
-	__s64 tv_usec;
-};
-
 /*
  * The IDs of the various system clocks (for POSIX.1b interval timers):
  */
diff --git a/include/uapi/linux/time_types.h b/include/uapi/linux/time_types.h
new file mode 100644
index 000000000000..4a51ca37305b
--- /dev/null
+++ b/include/uapi/linux/time_types.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_TIME_TYPES_H
+#define _UAPI_LINUX_TIME_TYPES_H
+
+#include <linux/types.h>
+
+#ifndef __kernel_timespec
+struct __kernel_timespec {
+	__kernel_time64_t       tv_sec;                 /* seconds */
+	long long               tv_nsec;                /* nanoseconds */
+};
+#endif
+
+#ifndef __kernel_itimerspec 
+struct __kernel_itimerspec {
+	struct __kernel_timespec it_interval;    /* timer period */
+	struct __kernel_timespec it_value;       /* timer expiration */
+};
+#endif
+
+/*
+ * legacy timeval structure, only embedded in structures that
+ * traditionally used 'timeval' to pass time intervals (not absolute
+ * times). Do not add new users. If user space fails to compile
+ * here, this is probably because it is not y2038 safe and needs to
+ * be changed to use another interface.
+ */
+#ifndef __kernel_old_timeval
+struct __kernel_old_timeval {
+	__kernel_long_t tv_sec;
+	__kernel_long_t tv_usec;
+};
+#endif
+
+struct __kernel_sock_timeval {
+	__s64 tv_sec;
+	__s64 tv_usec;
+};
+
+#endif /* _UAPI_LINUX_TIME_TYPES_H */
-- 
2.17.1


^ permalink raw reply related

* Re: [PATCH v1 net-next 2/4] net: dsa: microchip: add MIB counter reading support
From: Andrew Lunn @ 2019-02-13  3:28 UTC (permalink / raw)
  To: Tristram.Ha; +Cc: sergio.paracuellos, f.fainelli, pavel, UNGLinuxDriver, netdev
In-Reply-To: <SN1PR11MB04465D50382C3F7438DA0FADEC660@SN1PR11MB0446.namprd11.prod.outlook.com>

> All of the MIB counters, except some that may be marked by driver,
> do not get updated when the link is down, so it is a waste of time
> to read them.

Hi Tristram

O.K, so make this clear in the code. Maybe rather than having this
link_just_down, have the adjust link callback update the cached
values for all counters? 

> My intention is the driver eventually reads the MIB counters at
> least every second or faster so that the ethtool API called to show
> MIB counters gets them from memory rather than starting a read
> operation.

The user expects to see the current counters, not some cached values.
For me it is O.K. to frequently read the counters to prevent wrap
around, but each ethtool call should update the counters before
returning them to user space.

> For simple switches that do not need to do anything special the MIB
> read operation does not cause any issue except CPU load, for more
> complicate switches that need to do some background operations too
> many read operation can affect some critical functions.

Sounds like a bad design of the switch, if reading statistics from it
can upset its operation. You might want to consider rate limiting the
ethtool call.

	Andrew

^ permalink raw reply

* [PATCH net-next] sock: consistent handling of extreme SO_SNDBUF/SO_RCVBUF values
From: Guillaume Nault @ 2019-02-13  3:30 UTC (permalink / raw)
  To: netdev; +Cc: linux-api, Eric Dumazet, Marcelo Leitner, Paolo Abeni

SO_SNDBUF and SO_RCVBUF (and their *BUFFORCE version) may overflow or
underflow their input value. This patch aims at providing explicit
handling of these extreme cases, to get a clear behaviour even with
values bigger than INT_MAX / 2 or lower than INT_MIN / 2.

For simplicity, only SO_SNDBUF and SO_SNDBUFFORCE are described here,
but the same explanation and fix apply to SO_RCVBUF and SO_RCVBUFFORCE
(with 'SNDBUF' replaced by 'RCVBUF' and 'wmem_max' by 'rmem_max').

Overflow of positive values
===========================

When handling SO_SNDBUF or SO_SNDBUFFORCE, if 'val' exceeds
INT_MAX / 2, the buffer size is set to its minimum value because
'val * 2' overflows, and max_t() considers that it's smaller than
SOCK_MIN_SNDBUF. For SO_SNDBUF, this can only happen with
net.core.wmem_max > INT_MAX / 2.

SO_SNDBUF and SO_SNDBUFFORCE are actually designed to let users probe
for the maximum buffer size by setting an arbitrary large number that
gets capped to the maximum allowed/possible size. Having the upper
half of the positive integer space to potentially reduce the buffer
size to its minimum value defeats this purpose.

This patch caps the base value to INT_MAX / 2, so that bigger values
don't overflow and keep setting the buffer size to its maximum.

Underflow of negative values
============================

For negative numbers, SO_SNDBUF always considers them bigger than
net.core.wmem_max, which is bounded by [SOCK_MIN_SNDBUF, INT_MAX].
Therefore such values are set to net.core.wmem_max and we're back to
the behaviour of positive integers described above (return maximum
buffer size if wmem_max <= INT_MAX / 2, return SOCK_MIN_SNDBUF
otherwise).

However, SO_SNDBUFFORCE behaves differently. The user value is
directly multiplied by two and compared with SOCK_MIN_SNDBUF. If
'val * 2' doesn't underflow or if it underflows to a value smaller
than SOCK_MIN_SNDBUF then buffer size is set to its minimum value.
Otherwise the buffer size is set to the underflowed value.

This patch treats negative values passed to SO_SNDBUFFORCE as null, to
prevent underflows. Therefore negative values now always set the buffer
size to its minimum value.

Even though SO_SNDBUF behaves inconsistently by setting buffer size to
the maximum value when passed a negative number, no attempt is made to
modify this behaviour. There may exist some programs that rely on using
negative numbers to set the maximum buffer size. Avoiding overflows
because of extreme net.core.wmem_max values is the most we can do here.

Summary of altered behaviours
=============================

val      : user-space value passed to setsockopt()
val_uf   : the underflowed value resulting from doubling val when
           val < INT_MIN / 2
wmem_max : short for net.core.wmem_max
val_cap  : min(val, wmem_max)
min_len  : minimal buffer length (that is, SOCK_MIN_SNDBUF)
max_len  : maximal possible buffer length, regardless of wmem_max (that
           is, INT_MAX - 1)
^^^^     : altered behaviour

SO_SNDBUF:
+-------------------------+-------------+------------+----------------+
|       CONDITION         | OLD RESULT  | NEW RESULT |    COMMENT     |
+-------------------------+-------------+------------+----------------+
| val < 0 &&              |             |            | No overflow,   |
| wmem_max <= INT_MAX/2   | wmem_max*2  | wmem_max*2 | keep original  |
|                         |             |            | behaviour      |
+-------------------------+-------------+------------+----------------+
| val < 0 &&              |             |            | Cap wmem_max   |
| INT_MAX/2 < wmem_max    | min_len     | max_len    | to prevent     |
|                         |             | ^^^^^^^    | overflow       |
+-------------------------+-------------+------------+----------------+
| 0 <= val <= min_len/2   | min_len     | min_len    | Ordinary case  |
+-------------------------+-------------+------------+----------------+
| min_len/2 < val &&      | val_cap*2   | val_cap*2  | Ordinary case  |
| val_cap <= INT_MAX/2    |             |            |                |
+-------------------------+-------------+------------+----------------+
| min_len < val &&        |             |            | Cap val_cap    |
| INT_MAX/2 < val_cap     | min_len     | max_len    | again to       |
| (implies that           |             | ^^^^^^^    | prevent        |
| INT_MAX/2 < wmem_max)   |             |            | overflow       |
+-------------------------+-------------+------------+----------------+

SO_SNDBUFFORCE:
+------------------------------+---------+---------+------------------+
|          CONDITION           | BEFORE  | AFTER   |     COMMENT      |
|                              | PATCH   | PATCH   |                  |
+------------------------------+---------+---------+------------------+
| val < INT_MIN/2 &&           | min_len | min_len | Underflow with   |
| val_uf <= min_len            |         |         | no consequence   |
+------------------------------+---------+---------+------------------+
| val < INT_MIN/2 &&           | val_uf  | min_len | Set val to 0 to  |
| val_uf > min_len             |         | ^^^^^^^ | avoid underflow  |
+------------------------------+---------+---------+------------------+
| INT_MIN/2 <= val < 0         | min_len | min_len | No underflow     |
+------------------------------+---------+---------+------------------+
| 0 <= val <= min_len/2        | min_len | min_len | Ordinary case    |
+------------------------------+---------+---------+------------------+
| min_len/2 < val <= INT_MAX/2 | val*2   | val*2   | Ordinary case    |
+------------------------------+---------+---------+------------------+
| INT_MAX/2 < val              | min_len | max_len | Cap val to       |
|                              |         | ^^^^^^^ | prevent overflow |
+------------------------------+---------+---------+------------------+

Signed-off-by: Guillaume Nault <gnault@redhat.com>
---
 net/core/sock.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/net/core/sock.c b/net/core/sock.c
index 6aa2e7e0b4fb..5de90ed35968 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -713,6 +713,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		 */
 		val = min_t(u32, val, sysctl_wmem_max);
 set_sndbuf:
+		/* Ensure val * 2 fits into an int, to prevent max_t()
+		 * from treating it as a negative value.
+		 */
+		val = min_t(int, val, INT_MAX / 2);
 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 		/* Wake up sending tasks if we upped the value. */
@@ -724,6 +728,12 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			ret = -EPERM;
 			break;
 		}
+
+		/* No negative values (to prevent underflow, as val will be
+		 * multiplied by 2).
+		 */
+		if (val < 0)
+			val = 0;
 		goto set_sndbuf;
 
 	case SO_RCVBUF:
@@ -734,6 +744,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		 */
 		val = min_t(u32, val, sysctl_rmem_max);
 set_rcvbuf:
+		/* Ensure val * 2 fits into an int, to prevent max_t()
+		 * from treating it as a negative value.
+		 */
+		val = min_t(int, val, INT_MAX / 2);
 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 		/*
 		 * We double it on the way in to account for
@@ -758,6 +772,12 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 			ret = -EPERM;
 			break;
 		}
+
+		/* No negative values (to prevent underflow, as val will be
+		 * multiplied by 2).
+		 */
+		if (val < 0)
+			val = 0;
 		goto set_rcvbuf;
 
 	case SO_KEEPALIVE:
-- 
2.20.1


^ permalink raw reply related

* Re: [RFC 03/19] net/ice: Add support for ice peer devices and drivers
From: Jason Gunthorpe @ 2019-02-13  3:41 UTC (permalink / raw)
  To: Shiraz Saleem
  Cc: dledford, davem, linux-rdma, netdev, mustafa.ismail,
	jeffrey.t.kirsher, Anirudh Venkataramanan
In-Reply-To: <20190212214402.23284-4-shiraz.saleem@intel.com>

On Tue, Feb 12, 2019 at 03:43:46PM -0600, Shiraz Saleem wrote:
> From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com>
> 
> The E800 series of Ethernet devices has multiple hardware blocks, of
> which RDMA is one. The RDMA block isn't interfaced directly to PCI
> or any other bus. The RDMA driver (irdma) thus depends on the ice
> driver to provide access to the RDMA hardware block.
> 
> The ice driver first creates a pseudo bus and then creates and attaches
> a new device to the pseudo bus using device_register(). This new device
> is referred to as a "peer device" and the associated driver (i.e. irdma)
> is a "peer driver" to ice. Once the peer driver loads, it can call
> ice driver functions exposed to it via struct ice_ops. Similarly, ice can
> call peer driver functions exposed to it via struct ice_peer_ops.

This seems quite big for this straightforward description..
 
I was going to say I like the idea of using the driver model to
connect the drivers, but if it takes so much code ...

> +	/* check for reset in progress before proceeding */
> +	pf = pci_get_drvdata(peer_dev->pdev);
> +	for (i = 0; i < ICE_MAX_RESET_WAIT; i++) {
> +		if (!ice_is_reset_in_progress(pf->state))
> +			break;
> +		msleep(100);
> +	}

Use proper locking, not loops with sleeps.

Jason

^ permalink raw reply

* RE: [PATCH v1 net-next 2/4] net: dsa: microchip: add MIB counter reading support
From: Florian Fainelli @ 2019-02-13  3:51 UTC (permalink / raw)
  To: Tristram.Ha, andrew; +Cc: sergio.paracuellos, pavel, UNGLinuxDriver, netdev
In-Reply-To: <SN1PR11MB04465D50382C3F7438DA0FADEC660@SN1PR11MB0446.namprd11.prod.outlook.com>



On February 12, 2019 6:39:49 PM PST, Tristram.Ha@microchip.com wrote:
>> > +static void ksz9477_freeze_mib(struct ksz_device *dev, int port,
>bool
>> freeze)
>> > +{
>> > +	struct ksz_port *p = &dev->ports[port];
>> > +	u32 val = freeze ? MIB_COUNTER_FLUSH_FREEZE : 0;
>> 
>> Reverse Christmas tree.
>
>There was a checkpatch.pl patch in 2016 that tried to check this, but
>it was never accepted?
> 
>> > +		/* read only dropped counters when link is not up */
>> > +		if (p->link_just_down)
>> > +			p->link_just_down = 0;
>> > +		else if (!p->phydev.link)
>> > +			mib->cnt_ptr = dev->reg_mib_cnt;
>> 
>> This link_just_down stuff is not clear at all. Why can the drop
>> counters not be read when the link is up?
>
>All of the MIB counters, except some that may be marked by driver, do
>not get updated when the link is down, so it is a waste of time to read
>them.

Can you use netif_running() to determine that condition? Maintaining your own set of variables when the PHY state machine should already determine the link state sounds redundant if not error prone.

>
>My intention is the driver eventually reads the MIB counters at least
>every second or faster so that the ethtool API called to show MIB
>counters gets them from memory rather than starting a read operation. 
>For now that API is called from user space with the ethtool utility, so
>it may not be called too often and too fast.  But theoretically that
>API can be called from a program continually.
>
>For simple switches that do not need to do anything special the MIB
>read operation does not cause any issue except CPU load, for more
>complicate switches that need to do some background operations too many
>read operation can affect some critical functions.

Some switches have a MIB autocast feature taking a snapshot which AFAIR is internally implemented as a fast read register with no contention on other registers internally, do you have something similar?
-- 
Florian

^ permalink raw reply

* Re: [PATCH v2] rhashtable: make walk safe from softirq context
From: Herbert Xu @ 2019-02-13  4:35 UTC (permalink / raw)
  To: David Miller; +Cc: johannes, linux-wireless, netdev, j, tgraf, johannes.berg
In-Reply-To: <20190212.104339.1794719792249723582.davem@davemloft.net>

On Tue, Feb 12, 2019 at 10:43:39AM -0800, David Miller wrote:
>
> Herbert and Johannes, I need some guidance.
> 
> It seems Herbert wants the softirq usage of rhashtables removed, but
> since things have been like this for so long that's not the most
> reasonable requirement if we can fix it more simply with Johannes's
> patch especially for -stable.

I agree that we should provide a fix simple enough to backport.
However, I think the simplest fix is to simply use a linked list
for the walk.

I can prepare a patch for this if everybody else agrees with this
approach.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* Re: KASAN: use-after-free Read in sctp_outq_tail
From: Xin Long @ 2019-02-13  4:35 UTC (permalink / raw)
  To: syzbot
  Cc: davem, LKML, linux-sctp, Marcelo Ricardo Leitner, network dev,
	Neil Horman, syzkaller-bugs, Vlad Yasevich
In-Reply-To: <0000000000002015db0581b71858@google.com>

On Wed, Feb 13, 2019 at 4:00 AM syzbot
<syzbot+7823fa3f3e2d69341ea8@syzkaller.appspotmail.com> wrote:
>
> Hello,
>
> syzbot found the following crash on:
>
> HEAD commit:    d4104460aec1 Add linux-next specific files for 20190211
> git tree:       linux-next
> console output: https://syzkaller.appspot.com/x/log.txt?x=14140124c00000
> kernel config:  https://syzkaller.appspot.com/x/.config?x=c8a112d3b0d6719b
> dashboard link: https://syzkaller.appspot.com/bug?extid=7823fa3f3e2d69341ea8
> compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
>
> Unfortunately, I don't have any reproducer for this crash yet.
>
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+7823fa3f3e2d69341ea8@syzkaller.appspotmail.com
>
> ==================================================================
> BUG: KASAN: use-after-free in list_add_tail include/linux/list.h:93 [inline]
> BUG: KASAN: use-after-free in sctp_outq_tail_data net/sctp/outqueue.c:105
> [inline]
> BUG: KASAN: use-after-free in sctp_outq_tail+0x816/0x930
> net/sctp/outqueue.c:313
> Read of size 8 at addr ffff88807b19a7b8 by task syz-executor.0/30745
I think https://patchwork.ozlabs.org/patch/1040500/ will fix this.


>
> CPU: 1 PID: 30745 Comm: syz-executor.0 Not tainted 5.0.0-rc5-next-20190211
> #32
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
> Google 01/01/2011
> Call Trace:
>   __dump_stack lib/dump_stack.c:77 [inline]
>   dump_stack+0x172/0x1f0 lib/dump_stack.c:113
>   print_address_description.cold+0x7c/0x20d mm/kasan/report.c:187
>   kasan_report.cold+0x1b/0x40 mm/kasan/report.c:317
>   __asan_report_load8_noabort+0x14/0x20 mm/kasan/generic_report.c:132
>   list_add_tail include/linux/list.h:93 [inline]
>   sctp_outq_tail_data net/sctp/outqueue.c:105 [inline]
>   sctp_outq_tail+0x816/0x930 net/sctp/outqueue.c:313
>   sctp_cmd_send_msg net/sctp/sm_sideeffect.c:1109 [inline]
>   sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1784 [inline]
>   sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
>   sctp_do_sm+0x68e/0x5380 net/sctp/sm_sideeffect.c:1191
>   sctp_primitive_SEND+0xa0/0xd0 net/sctp/primitive.c:178
>   sctp_sendmsg_to_asoc+0xa63/0x17b0 net/sctp/socket.c:1955
>   sctp_sendmsg+0x10a9/0x17e0 net/sctp/socket.c:2113
>   inet_sendmsg+0x147/0x5d0 net/ipv4/af_inet.c:798
>   sock_sendmsg_nosec net/socket.c:621 [inline]
>   sock_sendmsg+0xdd/0x130 net/socket.c:631
>   ___sys_sendmsg+0x806/0x930 net/socket.c:2136
>   __sys_sendmsg+0x105/0x1d0 net/socket.c:2174
>   __do_sys_sendmsg net/socket.c:2183 [inline]
>   __se_sys_sendmsg net/socket.c:2181 [inline]
>   __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2181
>   do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x457e39
> Code: ad b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7
> 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
> ff 0f 83 7b b8 fb ff c3 66 2e 0f 1f 84 00 00 00 00
> RSP: 002b:00007fa9b8630c78 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
> RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000000457e39
> RDX: 0000000000000000 RSI: 0000000020000140 RDI: 0000000000000003
> RBP: 000000000073bf00 R08: 0000000000000000 R09: 0000000000000000
> R10: 0000000000000000 R11: 0000000000000246 R12: 00007fa9b86316d4
> R13: 00000000004c4e2b R14: 00000000004d8ab8 R15: 00000000ffffffff
>
> Allocated by task 30745:
>   save_stack+0x45/0xd0 mm/kasan/common.c:75
>   set_track mm/kasan/common.c:87 [inline]
>   __kasan_kmalloc mm/kasan/common.c:498 [inline]
>   __kasan_kmalloc.constprop.0+0xcf/0xe0 mm/kasan/common.c:471
>   kasan_kmalloc+0x9/0x10 mm/kasan/common.c:506
>   kmem_cache_alloc_trace+0x151/0x760 mm/slab.c:3615
>   kmalloc include/linux/slab.h:548 [inline]
>   kzalloc include/linux/slab.h:743 [inline]
>   sctp_stream_init_ext+0x51/0x110 net/sctp/stream.c:172
>   sctp_sendmsg_to_asoc+0x1273/0x17b0 net/sctp/socket.c:1896
>   sctp_sendmsg+0x10a9/0x17e0 net/sctp/socket.c:2113
>   inet_sendmsg+0x147/0x5d0 net/ipv4/af_inet.c:798
>   sock_sendmsg_nosec net/socket.c:621 [inline]
>   sock_sendmsg+0xdd/0x130 net/socket.c:631
>   ___sys_sendmsg+0x806/0x930 net/socket.c:2136
>   __sys_sendmsg+0x105/0x1d0 net/socket.c:2174
>   __do_sys_sendmsg net/socket.c:2183 [inline]
>   __se_sys_sendmsg net/socket.c:2181 [inline]
>   __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2181
>   do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> Freed by task 30745:
>   save_stack+0x45/0xd0 mm/kasan/common.c:75
>   set_track mm/kasan/common.c:87 [inline]
>   __kasan_slab_free+0x102/0x150 mm/kasan/common.c:460
>   kasan_slab_free+0xe/0x10 mm/kasan/common.c:468
>   __cache_free mm/slab.c:3491 [inline]
>   kfree+0xcf/0x230 mm/slab.c:3816
>   sctp_stream_outq_migrate+0x3e6/0x540 net/sctp/stream.c:88
>   sctp_stream_init+0xbc/0x410 net/sctp/stream.c:139
>   sctp_process_init+0x21c3/0x2b20 net/sctp/sm_make_chunk.c:2466
>   sctp_cmd_process_init net/sctp/sm_sideeffect.c:682 [inline]
>   sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1410 [inline]
>   sctp_side_effects net/sctp/sm_sideeffect.c:1220 [inline]
>   sctp_do_sm+0x3145/0x5380 net/sctp/sm_sideeffect.c:1191
>   sctp_assoc_bh_rcv+0x343/0x660 net/sctp/associola.c:1074
>   sctp_inq_push+0x1ea/0x290 net/sctp/inqueue.c:95
>   sctp_backlog_rcv+0x196/0xbe0 net/sctp/input.c:354
>   sk_backlog_rcv include/net/sock.h:937 [inline]
>   __release_sock+0x12e/0x3a0 net/core/sock.c:2379
>   release_sock+0x59/0x1c0 net/core/sock.c:2895
>   sctp_wait_for_connect+0x316/0x540 net/sctp/socket.c:8998
>   sctp_sendmsg_to_asoc+0x13e3/0x17b0 net/sctp/socket.c:1967
>   sctp_sendmsg+0x10a9/0x17e0 net/sctp/socket.c:2113
>   inet_sendmsg+0x147/0x5d0 net/ipv4/af_inet.c:798
>   sock_sendmsg_nosec net/socket.c:621 [inline]
>   sock_sendmsg+0xdd/0x130 net/socket.c:631
>   ___sys_sendmsg+0x806/0x930 net/socket.c:2136
>   __sys_sendmsg+0x105/0x1d0 net/socket.c:2174
>   __do_sys_sendmsg net/socket.c:2183 [inline]
>   __se_sys_sendmsg net/socket.c:2181 [inline]
>   __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2181
>   do_syscall_64+0x103/0x610 arch/x86/entry/common.c:290
>   entry_SYSCALL_64_after_hwframe+0x49/0xbe
>
> The buggy address belongs to the object at ffff88807b19a780
>   which belongs to the cache kmalloc-96 of size 96
> The buggy address is located 56 bytes inside of
>   96-byte region [ffff88807b19a780, ffff88807b19a7e0)
> The buggy address belongs to the page:
> page:ffffea0001ec6680 count:1 mapcount:0 mapping:ffff88812c3f04c0
> index:0xffff88807b19a800
> flags: 0x1fffc0000000200(slab)
> raw: 01fffc0000000200 ffffea000262acc8 ffffea0001448348 ffff88812c3f04c0
> raw: ffff88807b19a800 ffff88807b19a000 000000010000001d 0000000000000000
> page dumped because: kasan: bad access detected
>
> Memory state around the buggy address:
>   ffff88807b19a680: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
>   ffff88807b19a700: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
> > ffff88807b19a780: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
>                                          ^
>   ffff88807b19a800: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
>   ffff88807b19a880: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc
> ==================================================================
>
>
> ---
> This bug is generated by a bot. It may contain errors.
> See https://goo.gl/tpsmEJ for more information about syzbot.
> syzbot engineers can be reached at syzkaller@googlegroups.com.
>
> syzbot will keep track of this bug report. See:
> https://goo.gl/tpsmEJ#bug-status-tracking for how to communicate with
> syzbot.

^ permalink raw reply

* [PATCH 0/2] mac80211: Fix incorrect usage of rhashtable walk API
From: Herbert Xu @ 2019-02-13  5:05 UTC (permalink / raw)
  To: David Miller, johannes, linux-wireless, netdev, j, tgraf,
	johannes.berg

Hi:

This patch fixes a number of issues with the use of the rhashtable API
in mac80211.  First of all it converts the use of rashtable walks over
to a simple linked list.  This is because an rhashtable walk is
inherently unstable and not meant for uses that require stability,
e.g., when you're trying to lookup an object to delete.

It also fixes a potential memory leak when the rhashtable insertion
fails (which can occur due to OOM).

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply

* [PATCH net-next 0/2] cxgb4/cxgb4vf:Support for SGE doorbell queue timer
From: Vishal Kulkarni @ 2019-02-13  5:14 UTC (permalink / raw)
  To: netdev, davem; +Cc: nirranjan, indranil, dt, Vishal Kulkarni

This series of patchs add SGE doorbell queue timer for faster DMA completions.

Patch 1 Implements SGE doorbell queue timer

Patch 2 Adds ethtool capability to set/get SGE doorbell queue timer tick


Vishal Kulkarni (2):
  cxgb4/cxgb4vf: Add support for SGE doorbell queue timer
  cxgb4: Add capability to get/set SGE Doorbell Queue Timer Tick

 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h         |  11 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 187 ++++++++++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |  29 +-
 drivers/net/ethernet/chelsio/cxgb4/sge.c           | 308 +++++++++++++++++----
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c         |  41 +++
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.h         |   1 +
 drivers/net/ethernet/chelsio/cxgb4/t4_values.h     |   6 +
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h      |  24 +-
 drivers/net/ethernet/chelsio/cxgb4vf/sge.c         |  23 +-
 9 files changed, 560 insertions(+), 70 deletions(-)

-- 
1.8.3.1


^ permalink raw reply

* [PATCH net-next 1/2] cxgb4/cxgb4vf: Add support for SGE doorbell queue timer
From: Vishal Kulkarni @ 2019-02-13  5:14 UTC (permalink / raw)
  To: netdev, davem; +Cc: nirranjan, indranil, dt, Vishal Kulkarni
In-Reply-To: <1550034844-10850-1-git-send-email-vishal@chelsio.com>

T6 introduced a Timer Mechanism in SGE called the
SGE Doorbell Queue Timer. With this we can now configure
TX Queues to get CIDX Updates when:

    Time(CIDX == PIDX) >= Timer

Previously we rely on TX Queue Status Page updates by hardware
for DMA completions. This will make Hardware/Firmware actually
deliver the CIDX Updates as Ingress Queue messages with
commensurate Interrupts.

So we now have a new RX Path component for processing CIDX Updates
and reclaiming TX Descriptors faster.

Original work by: Casey Leedom <leedom@chelsio.com>

Signed-off-by: Vishal Kulkarni <vishal@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h      |  10 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c |  19 +-
 drivers/net/ethernet/chelsio/cxgb4/sge.c        | 308 ++++++++++++++++++++----
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c      |  41 ++++
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.h      |   1 +
 drivers/net/ethernet/chelsio/cxgb4/t4_values.h  |   6 +
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h   |  24 +-
 drivers/net/ethernet/chelsio/cxgb4vf/sge.c      |  23 +-
 8 files changed, 366 insertions(+), 66 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 568715a..68d0d45 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -617,6 +617,7 @@ enum {                                 /* adapter flags */
 	FW_OFLD_CONN       = (1 << 9),
 	ROOT_NO_RELAXED_ORDERING = (1 << 10),
 	SHUTTING_DOWN	   = (1 << 11),
+	SGE_DBQ_TIMER      = (1 << 12),
 };
 
 enum {
@@ -756,6 +757,8 @@ struct sge_eth_txq {                /* state for an SGE Ethernet Tx queue */
 #ifdef CONFIG_CHELSIO_T4_DCB
 	u8 dcb_prio;		    /* DCB Priority bound to queue */
 #endif
+	u8 dbqt;                    /* SGE Doorbell Queue Timer in use */
+	unsigned int dbqtimerix;    /* SGE Doorbell Queue Timer Index */
 	unsigned long tso;          /* # of TSO requests */
 	unsigned long tx_cso;       /* # of Tx checksum offloads */
 	unsigned long vlan_ins;     /* # of Tx VLAN insertions */
@@ -816,6 +819,7 @@ struct sge {
 	u16 nqs_per_uld;	    /* # of Rx queues per ULD */
 	u16 timer_val[SGE_NTIMERS];
 	u8 counter_val[SGE_NCOUNTERS];
+	u16 dbqtimer_val[SGE_NDBQTIMERS];
 	u32 fl_pg_order;            /* large page allocation size */
 	u32 stat_len;               /* length of status page at ring end */
 	u32 pktshift;               /* padding between CPL & packet data */
@@ -1402,7 +1406,7 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
 		     rspq_flush_handler_t flush_handler, int cong);
 int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
 			 struct net_device *dev, struct netdev_queue *netdevq,
-			 unsigned int iqid);
+			 unsigned int iqid, u8 dbqt);
 int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
 			  struct net_device *dev, unsigned int iqid,
 			  unsigned int cmplqid);
@@ -1415,6 +1419,8 @@ int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq,
 int t4_sge_init(struct adapter *adap);
 void t4_sge_start(struct adapter *adap);
 void t4_sge_stop(struct adapter *adap);
+int t4_sge_eth_txq_egress_update(struct adapter *adap, struct sge_eth_txq *q,
+				 int maxreclaim);
 void cxgb4_set_ethtool_ops(struct net_device *netdev);
 int cxgb4_write_rss(const struct port_info *pi, const u16 *queues);
 enum cpl_tx_tnl_lso_type cxgb_encap_offload_supported(struct sk_buff *skb);
@@ -1821,6 +1827,8 @@ int t4_ctrl_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
 int t4_ofld_eq_free(struct adapter *adap, unsigned int mbox, unsigned int pf,
 		    unsigned int vf, unsigned int eqid);
 int t4_sge_ctxt_flush(struct adapter *adap, unsigned int mbox, int ctxt_type);
+int t4_read_sge_dbqtimers(struct adapter *adap, unsigned int ndbqtimers,
+			  u16 *dbqtimers);
 void t4_handle_get_port_info(struct port_info *pi, const __be64 *rpl);
 int t4_update_port_info(struct port_info *pi);
 int t4_get_link_params(struct port_info *pi, unsigned int *link_okp,
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index adf75d1..bdd11a6 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -575,7 +575,7 @@ static int fwevtq_handler(struct sge_rspq *q, const __be64 *rsp,
 			struct sge_eth_txq *eq;
 
 			eq = container_of(txq, struct sge_eth_txq, q);
-			netif_tx_wake_queue(eq->txq);
+			t4_sge_eth_txq_egress_update(q->adap, eq, -1);
 		} else {
 			struct sge_uld_txq *oq;
 
@@ -933,10 +933,13 @@ static int setup_sge_queues(struct adapter *adap)
 			q->rspq.idx = j;
 			memset(&q->stats, 0, sizeof(q->stats));
 		}
-		for (j = 0; j < pi->nqsets; j++, t++) {
+
+		q = &s->ethrxq[pi->first_qset];
+		for (j = 0; j < pi->nqsets; j++, t++, q++) {
 			err = t4_sge_alloc_eth_txq(adap, t, dev,
 					netdev_get_tx_queue(dev, j),
-					s->fw_evtq.cntxt_id);
+					q->rspq.cntxt_id,
+					!!(adap->flags & SGE_DBQ_TIMER));
 			if (err)
 				goto freeout;
 		}
@@ -958,7 +961,7 @@ static int setup_sge_queues(struct adapter *adap)
 	if (!is_t4(adap->params.chip)) {
 		err = t4_sge_alloc_eth_txq(adap, &s->ptptxq, adap->port[0],
 					   netdev_get_tx_queue(adap->port[0], 0)
-					   , s->fw_evtq.cntxt_id);
+					   , s->fw_evtq.cntxt_id, false);
 		if (err)
 			goto freeout;
 	}
@@ -4325,6 +4328,14 @@ static int adap_init0(struct adapter *adap)
 	if (ret < 0)
 		goto bye;
 
+	/* Grab the SGE Doorbell Queue Timer values.  If successful, that
+	 * indicates that the Firmware and Hardware support this.
+	 */
+	ret = t4_read_sge_dbqtimers(adap, ARRAY_SIZE(adap->sge.dbqtimer_val),
+				    adap->sge.dbqtimer_val);
+	if (!ret)
+		adap->flags |= SGE_DBQ_TIMER;
+
 	if (is_bypass_device(adap->pdev->device))
 		adap->params.bypass = 1;
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c
index fc0bc64..ee6e4b2 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c
@@ -80,9 +80,10 @@
  * Max number of Tx descriptors we clean up at a time.  Should be modest as
  * freeing skbs isn't cheap and it happens while holding locks.  We just need
  * to free packets faster than they arrive, we eventually catch up and keep
- * the amortized cost reasonable.  Must be >= 2 * TXQ_STOP_THRES.
+ * the amortized cost reasonable.  Must be >= 2 * TXQ_STOP_THRES.  It should
+ * also match the CIDX Flush Threshold.
  */
-#define MAX_TX_RECLAIM 16
+#define MAX_TX_RECLAIM 32
 
 /*
  * Max number of Rx buffers we replenish at a time.  Again keep this modest,
@@ -401,31 +402,52 @@ static inline int reclaimable(const struct sge_txq *q)
 }
 
 /**
- *	cxgb4_reclaim_completed_tx - reclaims completed Tx descriptors
+ *	reclaim_completed_tx - reclaims completed TX Descriptors
  *	@adap: the adapter
  *	@q: the Tx queue to reclaim completed descriptors from
+ *	@maxreclaim: the maximum number of TX Descriptors to reclaim or -1
  *	@unmap: whether the buffers should be unmapped for DMA
  *
- *	Reclaims Tx descriptors that the SGE has indicated it has processed,
- *	and frees the associated buffers if possible.  Called with the Tx
- *	queue locked.
+ *	Reclaims Tx Descriptors that the SGE has indicated it has processed,
+ *	and frees the associated buffers if possible.  If @max == -1, then
+ *	we'll use a defaiult maximum.  Called with the TX Queue locked.
  */
-inline void cxgb4_reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
-					bool unmap)
+static inline int reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
+				       int maxreclaim, bool unmap)
 {
-	int avail = reclaimable(q);
+	int reclaim = reclaimable(q);
 
-	if (avail) {
+	if (reclaim) {
 		/*
 		 * Limit the amount of clean up work we do at a time to keep
 		 * the Tx lock hold time O(1).
 		 */
-		if (avail > MAX_TX_RECLAIM)
-			avail = MAX_TX_RECLAIM;
+		if (maxreclaim < 0)
+			maxreclaim = MAX_TX_RECLAIM;
+		if (reclaim > maxreclaim)
+			reclaim = maxreclaim;
 
-		free_tx_desc(adap, q, avail, unmap);
-		q->in_use -= avail;
+		free_tx_desc(adap, q, reclaim, unmap);
+		q->in_use -= reclaim;
 	}
+
+	return reclaim;
+}
+
+/**
+ *	cxgb4_reclaim_completed_tx - reclaims completed Tx descriptors
+ *	@adap: the adapter
+ *	@q: the Tx queue to reclaim completed descriptors from
+ *	@unmap: whether the buffers should be unmapped for DMA
+ *
+ *	Reclaims Tx descriptors that the SGE has indicated it has processed,
+ *	and frees the associated buffers if possible.  Called with the Tx
+ *	queue locked.
+ */
+void cxgb4_reclaim_completed_tx(struct adapter *adap, struct sge_txq *q,
+				bool unmap)
+{
+	(void)reclaim_completed_tx(adap, q, -1, unmap);
 }
 EXPORT_SYMBOL(cxgb4_reclaim_completed_tx);
 
@@ -1288,6 +1310,44 @@ static inline void t6_fill_tnl_lso(struct sk_buff *skb,
 }
 
 /**
+ *	t4_sge_eth_txq_egress_update - handle Ethernet TX Queue update
+ *	@adap: the adapter
+ *	@eq: the Ethernet TX Queue
+ *	@maxreclaim: the maximum number of TX Descriptors to reclaim or -1
+ *
+ *	We're typically called here to update the state of an Ethernet TX
+ *	Queue with respect to the hardware's progress in consuming the TX
+ *	Work Requests that we've put on that Egress Queue.  This happens
+ *	when we get Egress Queue Update messages and also prophylactically
+ *	in regular timer-based Ethernet TX Queue maintenance.
+ */
+int t4_sge_eth_txq_egress_update(struct adapter *adap, struct sge_eth_txq *eq,
+				 int maxreclaim)
+{
+	struct sge_txq *q = &eq->q;
+	unsigned int reclaimed;
+
+	if (!q->in_use || !__netif_tx_trylock(eq->txq))
+		return 0;
+
+	/* Reclaim pending completed TX Descriptors. */
+	reclaimed = reclaim_completed_tx(adap, &eq->q, maxreclaim, true);
+
+	/* If the TX Queue is currently stopped and there's now more than half
+	 * the queue available, restart it.  Otherwise bail out since the rest
+	 * of what we want do here is with the possibility of shipping any
+	 * currently buffered Coalesced TX Work Request.
+	 */
+	if (netif_tx_queue_stopped(eq->txq) && txq_avail(q) > (q->size / 2)) {
+		netif_tx_wake_queue(eq->txq);
+		eq->q.restarts++;
+	}
+
+	__netif_tx_unlock(eq->txq);
+	return reclaimed;
+}
+
+/**
  *	cxgb4_eth_xmit - add a packet to an Ethernet Tx queue
  *	@skb: the packet
  *	@dev: the egress net device
@@ -1357,7 +1417,7 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 	skb_tx_timestamp(skb);
 
-	cxgb4_reclaim_completed_tx(adap, &q->q, true);
+	reclaim_completed_tx(adap, &q->q, -1, true);
 	cntrl = TXPKT_L4CSUM_DIS_F | TXPKT_IPCSUM_DIS_F;
 
 #ifdef CONFIG_CHELSIO_T4_FCOE
@@ -1400,8 +1460,25 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	wr_mid = FW_WR_LEN16_V(DIV_ROUND_UP(flits, 2));
 	if (unlikely(credits < ETHTXQ_STOP_THRES)) {
+		/* After we're done injecting the Work Request for this
+		 * packet, we'll be below our "stop threshold" so stop the TX
+		 * Queue now and schedule a request for an SGE Egress Queue
+		 * Update message. The queue will get started later on when
+		 * the firmware processes this Work Request and sends us an
+		 * Egress Queue Status Update message indicating that space
+		 * has opened up.
+		 */
 		eth_txq_stop(q);
-		wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
+
+		/* If we're using the SGE Doorbell Queue Timer facility, we
+		 * don't need to ask the Firmware to send us Egress Queue CIDX
+		 * Updates: the Hardware will do this automatically.  And
+		 * since we send the Ingress Queue CIDX Updates to the
+		 * corresponding Ethernet Response Queue, we'll get them very
+		 * quickly.
+		 */
+		if (!q->dbqt)
+			wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
 	}
 
 	wr = (void *)&q->q.desc[q->q.pidx];
@@ -1671,7 +1748,7 @@ static netdev_tx_t cxgb4_vf_eth_xmit(struct sk_buff *skb,
 	/* Take this opportunity to reclaim any TX Descriptors whose DMA
 	 * transfers have completed.
 	 */
-	cxgb4_reclaim_completed_tx(adapter, &txq->q, true);
+	reclaim_completed_tx(adapter, &txq->q, -1, true);
 
 	/* Calculate the number of flits and TX Descriptors we're going to
 	 * need along with how many TX Descriptors will be left over after
@@ -1715,7 +1792,16 @@ static netdev_tx_t cxgb4_vf_eth_xmit(struct sk_buff *skb,
 		 * has opened up.
 		 */
 		eth_txq_stop(txq);
-		wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
+
+		/* If we're using the SGE Doorbell Queue Timer facility, we
+		 * don't need to ask the Firmware to send us Egress Queue CIDX
+		 * Updates: the Hardware will do this automatically.  And
+		 * since we send the Ingress Queue CIDX Updates to the
+		 * corresponding Ethernet Response Queue, we'll get them very
+		 * quickly.
+		 */
+		if (!txq->dbqt)
+			wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F;
 	}
 
 	/* Start filling in our Work Request.  Note that we do _not_ handle
@@ -2794,6 +2880,74 @@ static int t4_tx_hststamp(struct adapter *adapter, struct sk_buff *skb,
 }
 
 /**
+ *	t4_tx_completion_handler - handle CPL_SGE_EGR_UPDATE messages
+ *	@rspq: Ethernet RX Response Queue associated with Ethernet TX Queue
+ *	@rsp: Response Entry pointer into Response Queue
+ *	@gl: Gather List pointer
+ *
+ *	For adapters which support the SGE Doorbell Queue Timer facility,
+ *	we configure the Ethernet TX Queues to send CIDX Updates to the
+ *	Associated Ethernet RX Response Queue with CPL_SGE_EGR_UPDATE
+ *	messages.  This adds a small load to PCIe Link RX bandwidth and,
+ *	potentially, higher CPU Interrupt load, but allows us to respond
+ *	much more quickly to the CIDX Updates.  This is important for
+ *	Upper Layer Software which isn't willing to have a large amount
+ *	of TX Data outstanding before receiving DMA Completions.
+ */
+static void t4_tx_completion_handler(struct sge_rspq *rspq,
+				     const __be64 *rsp,
+				     const struct pkt_gl *gl)
+{
+	struct adapter *adapter = rspq->adap;
+	struct port_info *pi = netdev_priv(rspq->netdev);
+	struct sge *s = &adapter->sge;
+	struct sge_eth_txq *txq;
+	u8 opcode = ((const struct rss_header *)rsp)->opcode;
+
+	/* skip RSS header */
+	rsp++;
+
+	/* FW can send EGR_UPDATEs encapsulated in a CPL_FW4_MSG.
+	 */
+	if (unlikely(opcode == CPL_FW4_MSG &&
+		     ((const struct cpl_fw4_msg *)rsp)->type ==
+							FW_TYPE_RSSCPL)) {
+		rsp++;
+		opcode = ((const struct rss_header *)rsp)->opcode;
+		rsp++;
+	}
+
+	if (unlikely(opcode != CPL_SGE_EGR_UPDATE)) {
+		pr_info("%s: unexpected FW4/CPL %#x on Rx queue\n",
+			__func__, opcode);
+		return;
+	}
+
+	txq = &s->ethtxq[pi->first_qset + rspq->idx];
+
+	/* We've got the Hardware Consumer Index Update in the Egress Update
+	 * message.  If we're using the SGE Doorbell Queue Timer mechanism,
+	 * these Egress Update messages will be our sole CIDX Updates we get
+	 * since we don't want to chew up PCIe bandwidth for both Ingress
+	 * Messages and Status Page writes.  However, The code which manages
+	 * reclaiming successfully DMA'ed TX Work Requests uses the CIDX value
+	 * stored in the Status Page at the end of the TX Queue.  It's easiest
+	 * to simply copy the CIDX Update value from the Egress Update message
+	 * to the Status Page.  Also note that no Endian issues need to be
+	 * considered here since both are Big Endian and we're just copying
+	 * bytes consistently ...
+	 */
+	if (txq->dbqt) {
+		struct cpl_sge_egr_update *egr;
+
+		egr = (struct cpl_sge_egr_update *)rsp;
+		WRITE_ONCE(txq->q.stat->cidx, egr->cidx);
+	}
+
+	t4_sge_eth_txq_egress_update(adapter, txq, -1);
+}
+
+/**
  *	t4_ethrx_handler - process an ingress ethernet packet
  *	@q: the response queue that received the packet
  *	@rsp: the response queue descriptor holding the RX_PKT message
@@ -2816,6 +2970,15 @@ int t4_ethrx_handler(struct sge_rspq *q, const __be64 *rsp,
 	struct port_info *pi;
 	int ret = 0;
 
+	/* If we're looking at TX Queue CIDX Update, handle that separately
+	 * and return.
+	 */
+	if (unlikely((*(u8 *)rsp == CPL_FW4_MSG) ||
+		     (*(u8 *)rsp == CPL_SGE_EGR_UPDATE))) {
+		t4_tx_completion_handler(q, rsp, si);
+		return 0;
+	}
+
 	if (unlikely(*(u8 *)rsp == cpl_trace_pkt))
 		return handle_trace_pkt(q->adap, si);
 
@@ -3289,7 +3452,7 @@ static void sge_rx_timer_cb(struct timer_list *t)
 
 static void sge_tx_timer_cb(struct timer_list *t)
 {
-	unsigned long m;
+	unsigned long m, period;
 	unsigned int i, budget;
 	struct adapter *adap = from_timer(adap, t, sge.tx_timer);
 	struct sge *s = &adap->sge;
@@ -3320,29 +3483,29 @@ static void sge_tx_timer_cb(struct timer_list *t)
 	budget = MAX_TIMER_TX_RECLAIM;
 	i = s->ethtxq_rover;
 	do {
-		struct sge_eth_txq *q = &s->ethtxq[i];
-
-		if (q->q.in_use &&
-		    time_after_eq(jiffies, q->txq->trans_start + HZ / 100) &&
-		    __netif_tx_trylock(q->txq)) {
-			int avail = reclaimable(&q->q);
-
-			if (avail) {
-				if (avail > budget)
-					avail = budget;
-
-				free_tx_desc(adap, &q->q, avail, true);
-				q->q.in_use -= avail;
-				budget -= avail;
-			}
-			__netif_tx_unlock(q->txq);
-		}
+		budget -= t4_sge_eth_txq_egress_update(adap, &s->ethtxq[i],
+						       budget);
+		if (!budget)
+			break;
 
 		if (++i >= s->ethqsets)
 			i = 0;
-	} while (budget && i != s->ethtxq_rover);
+	} while (i != s->ethtxq_rover);
 	s->ethtxq_rover = i;
-	mod_timer(&s->tx_timer, jiffies + (budget ? TX_QCHECK_PERIOD : 2));
+
+	if (budget == 0) {
+		/* If we found too many reclaimable packets schedule a timer
+		 * in the near future to continue where we left off.
+		 */
+		period = 2;
+	} else {
+		/* We reclaimed all reclaimable TX Descriptors, so reschedule
+		 * at the normal period.
+		 */
+		period = TX_QCHECK_PERIOD;
+	}
+
+	mod_timer(&s->tx_timer, jiffies + period);
 }
 
 /**
@@ -3421,7 +3584,8 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
 							:  FW_IQ_IQTYPE_OFLD));
 
 	if (fl) {
-		enum chip_type chip = CHELSIO_CHIP_VERSION(adap->params.chip);
+		unsigned int chip_ver =
+			CHELSIO_CHIP_VERSION(adap->params.chip);
 
 		/* Allocate the ring for the hardware free list (with space
 		 * for its status page) along with the associated software
@@ -3459,10 +3623,10 @@ int t4_sge_alloc_rxq(struct adapter *adap, struct sge_rspq *iq, bool fwevtq,
 		 * the smaller 64-byte value there).
 		 */
 		c.fl0dcaen_to_fl0cidxfthresh =
-			htons(FW_IQ_CMD_FL0FBMIN_V(chip <= CHELSIO_T5 ?
+			htons(FW_IQ_CMD_FL0FBMIN_V(chip_ver <= CHELSIO_T5 ?
 						   FETCHBURSTMIN_128B_X :
-						   FETCHBURSTMIN_64B_X) |
-			      FW_IQ_CMD_FL0FBMAX_V((chip <= CHELSIO_T5) ?
+						   FETCHBURSTMIN_64B_T6_X) |
+			      FW_IQ_CMD_FL0FBMAX_V((chip_ver <= CHELSIO_T5) ?
 						   FETCHBURSTMAX_512B_X :
 						   FETCHBURSTMAX_256B_X));
 		c.fl0size = htons(flsz);
@@ -3584,14 +3748,24 @@ static void init_txq(struct adapter *adap, struct sge_txq *q, unsigned int id)
 	adap->sge.egr_map[id - adap->sge.egr_start] = q;
 }
 
+/**
+ *	t4_sge_alloc_eth_txq - allocate an Ethernet TX Queue
+ *	@adap: the adapter
+ *	@txq: the SGE Ethernet TX Queue to initialize
+ *	@dev: the Linux Network Device
+ *	@netdevq: the corresponding Linux TX Queue
+ *	@iqid: the Ingress Queue to which to deliver CIDX Update messages
+ *	@dbqt: whether this TX Queue will use the SGE Doorbell Queue Timers
+ */
 int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
 			 struct net_device *dev, struct netdev_queue *netdevq,
-			 unsigned int iqid)
+			 unsigned int iqid, u8 dbqt)
 {
 	int ret, nentries;
 	struct fw_eq_eth_cmd c;
 	struct sge *s = &adap->sge;
 	struct port_info *pi = netdev_priv(dev);
+	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
 
 	/* Add status entries */
 	nentries = txq->q.size + s->stat_len / sizeof(struct tx_desc);
@@ -3610,19 +3784,47 @@ int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
 			    FW_EQ_ETH_CMD_VFN_V(0));
 	c.alloc_to_len16 = htonl(FW_EQ_ETH_CMD_ALLOC_F |
 				 FW_EQ_ETH_CMD_EQSTART_F | FW_LEN16(c));
-	c.viid_pkd = htonl(FW_EQ_ETH_CMD_AUTOEQUEQE_F |
-			   FW_EQ_ETH_CMD_VIID_V(pi->viid));
+
+	/* For TX Ethernet Queues using the SGE Doorbell Queue Timer
+	 * mechanism, we use Ingress Queue messages for Hardware Consumer
+	 * Index Updates on the TX Queue.  Otherwise we have the Hardware
+	 * write the CIDX Updates into the Status Page at the end of the
+	 * TX Queue.
+	 */
+	c.autoequiqe_to_viid = htonl((dbqt
+				      ? FW_EQ_ETH_CMD_AUTOEQUIQE_F
+				      : FW_EQ_ETH_CMD_AUTOEQUEQE_F) |
+				     FW_EQ_ETH_CMD_VIID_V(pi->viid));
+
 	c.fetchszm_to_iqid =
-		htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V(HOSTFCMODE_STATUS_PAGE_X) |
+		htonl(FW_EQ_ETH_CMD_HOSTFCMODE_V(dbqt
+						 ? HOSTFCMODE_INGRESS_QUEUE_X
+						 : HOSTFCMODE_STATUS_PAGE_X) |
 		      FW_EQ_ETH_CMD_PCIECHN_V(pi->tx_chan) |
 		      FW_EQ_ETH_CMD_FETCHRO_F | FW_EQ_ETH_CMD_IQID_V(iqid));
+
+	/* Note that the CIDX Flush Threshold should match MAX_TX_RECLAIM. */
 	c.dcaen_to_eqsize =
-		htonl(FW_EQ_ETH_CMD_FBMIN_V(FETCHBURSTMIN_64B_X) |
+		htonl(FW_EQ_ETH_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
+					    ? FETCHBURSTMIN_64B_X
+					    : FETCHBURSTMIN_64B_T6_X) |
 		      FW_EQ_ETH_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
 		      FW_EQ_ETH_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
 		      FW_EQ_ETH_CMD_EQSIZE_V(nentries));
+
 	c.eqaddr = cpu_to_be64(txq->q.phys_addr);
 
+	/* If we're using the SGE Doorbell Queue Timer mechanism, pass in the
+	 * currently configured Timer Index.  THis can be changed later via an
+	 * ethtool -C tx-usecs {Timer Val} command.  Note that the SGE
+	 * Doorbell Queue mode is currently automatically enabled in the
+	 * Firmware by setting either AUTOEQUEQE or AUTOEQUIQE ...
+	 */
+	if (dbqt)
+		c.timeren_timerix =
+			cpu_to_be32(FW_EQ_ETH_CMD_TIMEREN_F |
+				    FW_EQ_ETH_CMD_TIMERIX_V(txq->dbqtimerix));
+
 	ret = t4_wr_mbox(adap, adap->mbox, &c, sizeof(c), &c);
 	if (ret) {
 		kfree(txq->q.sdesc);
@@ -3639,6 +3841,8 @@ int t4_sge_alloc_eth_txq(struct adapter *adap, struct sge_eth_txq *txq,
 	txq->txq = netdevq;
 	txq->tso = txq->tx_cso = txq->vlan_ins = 0;
 	txq->mapping_err = 0;
+	txq->dbqt = dbqt;
+
 	return 0;
 }
 
@@ -3646,6 +3850,7 @@ int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
 			  struct net_device *dev, unsigned int iqid,
 			  unsigned int cmplqid)
 {
+	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
 	int ret, nentries;
 	struct fw_eq_ctrl_cmd c;
 	struct sge *s = &adap->sge;
@@ -3673,7 +3878,9 @@ int t4_sge_alloc_ctrl_txq(struct adapter *adap, struct sge_ctrl_txq *txq,
 		      FW_EQ_CTRL_CMD_PCIECHN_V(pi->tx_chan) |
 		      FW_EQ_CTRL_CMD_FETCHRO_F | FW_EQ_CTRL_CMD_IQID_V(iqid));
 	c.dcaen_to_eqsize =
-		htonl(FW_EQ_CTRL_CMD_FBMIN_V(FETCHBURSTMIN_64B_X) |
+		htonl(FW_EQ_CTRL_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
+					     ? FETCHBURSTMIN_64B_X
+					     : FETCHBURSTMIN_64B_T6_X) |
 		      FW_EQ_CTRL_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
 		      FW_EQ_CTRL_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
 		      FW_EQ_CTRL_CMD_EQSIZE_V(nentries));
@@ -3713,6 +3920,7 @@ int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq,
 			 struct net_device *dev, unsigned int iqid,
 			 unsigned int uld_type)
 {
+	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adap->params.chip);
 	int ret, nentries;
 	struct fw_eq_ofld_cmd c;
 	struct sge *s = &adap->sge;
@@ -3743,7 +3951,9 @@ int t4_sge_alloc_uld_txq(struct adapter *adap, struct sge_uld_txq *txq,
 		      FW_EQ_OFLD_CMD_PCIECHN_V(pi->tx_chan) |
 		      FW_EQ_OFLD_CMD_FETCHRO_F | FW_EQ_OFLD_CMD_IQID_V(iqid));
 	c.dcaen_to_eqsize =
-		htonl(FW_EQ_OFLD_CMD_FBMIN_V(FETCHBURSTMIN_64B_X) |
+		htonl(FW_EQ_OFLD_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
+					     ? FETCHBURSTMIN_64B_X
+					     : FETCHBURSTMIN_64B_T6_X) |
 		      FW_EQ_OFLD_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
 		      FW_EQ_OFLD_CMD_CIDXFTHRESH_V(CIDXFLUSHTHRESH_32_X) |
 		      FW_EQ_OFLD_CMD_EQSIZE_V(nentries));
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index c5e5466..27af347 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -6713,6 +6713,47 @@ int t4_sge_ctxt_flush(struct adapter *adap, unsigned int mbox, int ctxt_type)
 }
 
 /**
+ *	t4_read_sge_dbqtimers - reag SGE Doorbell Queue Timer values
+ *	@adap - the adapter
+ *	@ndbqtimers: size of the provided SGE Doorbell Queue Timer table
+ *	@dbqtimers: SGE Doorbell Queue Timer table
+ *
+ *	Reads the SGE Doorbell Queue Timer values into the provided table.
+ *	Returns 0 on success (Firmware and Hardware support this feature),
+ *	an error on failure.
+ */
+int t4_read_sge_dbqtimers(struct adapter *adap, unsigned int ndbqtimers,
+			  u16 *dbqtimers)
+{
+	int ret, dbqtimerix;
+
+	ret = 0;
+	dbqtimerix = 0;
+	while (dbqtimerix < ndbqtimers) {
+		int nparams, param;
+		u32 params[7], vals[7];
+
+		nparams = ndbqtimers - dbqtimerix;
+		if (nparams > ARRAY_SIZE(params))
+			nparams = ARRAY_SIZE(params);
+
+		for (param = 0; param < nparams; param++)
+			params[param] =
+			  (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
+			   FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_DBQ_TIMER) |
+			   FW_PARAMS_PARAM_Y_V(dbqtimerix + param));
+		ret = t4_query_params(adap, adap->mbox, adap->pf, 0,
+				      nparams, params, vals);
+		if (ret)
+			break;
+
+		for (param = 0; param < nparams; param++)
+			dbqtimers[dbqtimerix++] = vals[param];
+	}
+	return ret;
+}
+
+/**
  *      t4_fw_hello - establish communication with FW
  *      @adap: the adapter
  *      @mbox: mailbox to use for the FW command
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h
index 361d503..002fc62 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.h
@@ -91,6 +91,7 @@ enum {
 	SGE_CTXT_SIZE = 24,       /* size of SGE context */
 	SGE_NTIMERS = 6,          /* # of interrupt holdoff timer values */
 	SGE_NCOUNTERS = 4,        /* # of interrupt packet counter values */
+	SGE_NDBQTIMERS = 8,       /* # of Doorbell Queue Timer values */
 	SGE_MAX_IQ_SIZE = 65520,
 
 	SGE_TIMER_RSTRT_CNTR = 6, /* restart RX packet threshold counter */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_values.h b/drivers/net/ethernet/chelsio/cxgb4/t4_values.h
index f6558cb..eb1aa82 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_values.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_values.h
@@ -71,12 +71,18 @@
 #define FETCHBURSTMIN_64B_X		2
 #define FETCHBURSTMIN_128B_X		3
 
+/* T6 and later use a single-bit encoding for FetchBurstMin */
+#define FETCHBURSTMIN_64B_T6_X		0
+#define FETCHBURSTMIN_128B_T6_X		1
+
 #define FETCHBURSTMAX_256B_X		2
 #define FETCHBURSTMAX_512B_X		3
 
+#define HOSTFCMODE_INGRESS_QUEUE_X	1
 #define HOSTFCMODE_STATUS_PAGE_X	2
 
 #define CIDXFLUSHTHRESH_32_X		5
+#define CIDXFLUSHTHRESH_128_X		7
 
 #define UPDATEDELIVERY_INTERRUPT_X	1
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 1d9b3e1..631f166 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -1254,6 +1254,8 @@ enum fw_params_param_dev {
 	FW_PARAMS_PARAM_DEV_RDMA_WRITE_WITH_IMM = 0x21,
 	FW_PARAMS_PARAM_DEV_RI_WRITE_CMPL_WR    = 0x24,
 	FW_PARAMS_PARAM_DEV_OPAQUE_VIID_SMT_EXTN = 0x27,
+	FW_PARAMS_PARAM_DEV_DBQ_TIMER	= 0x29,
+	FW_PARAMS_PARAM_DEV_DBQ_TIMERTICK = 0x2A,
 };
 
 /*
@@ -1322,6 +1324,7 @@ enum fw_params_param_dmaq {
 	FW_PARAMS_PARAM_DMAQ_EQ_CMPLIQID_CTRL = 0x11,
 	FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH = 0x12,
 	FW_PARAMS_PARAM_DMAQ_EQ_DCBPRIO_ETH = 0x13,
+	FW_PARAMS_PARAM_DMAQ_EQ_TIMERIX	= 0x15,
 	FW_PARAMS_PARAM_DMAQ_CONM_CTXT = 0x20,
 };
 
@@ -1751,8 +1754,8 @@ struct fw_eq_eth_cmd {
 	__be32 fetchszm_to_iqid;
 	__be32 dcaen_to_eqsize;
 	__be64 eqaddr;
-	__be32 viid_pkd;
-	__be32 r8_lo;
+	__be32 autoequiqe_to_viid;
+	__be32 timeren_timerix;
 	__be64 r9;
 };
 
@@ -1847,6 +1850,10 @@ struct fw_eq_eth_cmd {
 #define FW_EQ_ETH_CMD_EQSIZE_S		0
 #define FW_EQ_ETH_CMD_EQSIZE_V(x)	((x) << FW_EQ_ETH_CMD_EQSIZE_S)
 
+#define FW_EQ_ETH_CMD_AUTOEQUIQE_S	31
+#define FW_EQ_ETH_CMD_AUTOEQUIQE_V(x)	((x) << FW_EQ_ETH_CMD_AUTOEQUIQE_S)
+#define FW_EQ_ETH_CMD_AUTOEQUIQE_F	FW_EQ_ETH_CMD_AUTOEQUIQE_V(1U)
+
 #define FW_EQ_ETH_CMD_AUTOEQUEQE_S	30
 #define FW_EQ_ETH_CMD_AUTOEQUEQE_V(x)	((x) << FW_EQ_ETH_CMD_AUTOEQUEQE_S)
 #define FW_EQ_ETH_CMD_AUTOEQUEQE_F	FW_EQ_ETH_CMD_AUTOEQUEQE_V(1U)
@@ -1854,6 +1861,19 @@ struct fw_eq_eth_cmd {
 #define FW_EQ_ETH_CMD_VIID_S	16
 #define FW_EQ_ETH_CMD_VIID_V(x)	((x) << FW_EQ_ETH_CMD_VIID_S)
 
+#define FW_EQ_ETH_CMD_TIMEREN_S		3
+#define FW_EQ_ETH_CMD_TIMEREN_M		0x1
+#define FW_EQ_ETH_CMD_TIMEREN_V(x)	((x) << FW_EQ_ETH_CMD_TIMEREN_S)
+#define FW_EQ_ETH_CMD_TIMEREN_G(x)	\
+    (((x) >> FW_EQ_ETH_CMD_TIMEREN_S) & FW_EQ_ETH_CMD_TIMEREN_M)
+#define FW_EQ_ETH_CMD_TIMEREN_F	FW_EQ_ETH_CMD_TIMEREN_V(1U)
+
+#define FW_EQ_ETH_CMD_TIMERIX_S		0
+#define FW_EQ_ETH_CMD_TIMERIX_M		0x7
+#define FW_EQ_ETH_CMD_TIMERIX_V(x)	((x) << FW_EQ_ETH_CMD_TIMERIX_S)
+#define FW_EQ_ETH_CMD_TIMERIX_G(x)	\
+    (((x) >> FW_EQ_ETH_CMD_TIMERIX_S) & FW_EQ_ETH_CMD_TIMERIX_M)
+
 struct fw_eq_ctrl_cmd {
 	__be32 op_to_vfn;
 	__be32 alloc_to_len16;
diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
index 1d534f0..bd0683b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
@@ -2268,7 +2268,7 @@ int t4vf_sge_alloc_rxq(struct adapter *adapter, struct sge_rspq *rspq,
 	cmd.iqaddr = cpu_to_be64(rspq->phys_addr);
 
 	if (fl) {
-		enum chip_type chip =
+		unsigned int chip_ver =
 			CHELSIO_CHIP_VERSION(adapter->params.chip);
 		/*
 		 * Allocate the ring for the hardware free list (with space
@@ -2319,10 +2319,10 @@ int t4vf_sge_alloc_rxq(struct adapter *adapter, struct sge_rspq *rspq,
 		 */
 		cmd.fl0dcaen_to_fl0cidxfthresh =
 			cpu_to_be16(
-				FW_IQ_CMD_FL0FBMIN_V(chip <= CHELSIO_T5 ?
-						     FETCHBURSTMIN_128B_X :
-						     FETCHBURSTMIN_64B_X) |
-				FW_IQ_CMD_FL0FBMAX_V((chip <= CHELSIO_T5) ?
+				FW_IQ_CMD_FL0FBMIN_V(chip_ver <= CHELSIO_T5
+						     ? FETCHBURSTMIN_128B_X
+						     : FETCHBURSTMIN_64B_T6_X) |
+				FW_IQ_CMD_FL0FBMAX_V((chip_ver <= CHELSIO_T5) ?
 						     FETCHBURSTMAX_512B_X :
 						     FETCHBURSTMAX_256B_X));
 		cmd.fl0size = cpu_to_be16(flsz);
@@ -2411,6 +2411,7 @@ int t4vf_sge_alloc_eth_txq(struct adapter *adapter, struct sge_eth_txq *txq,
 			   struct net_device *dev, struct netdev_queue *devq,
 			   unsigned int iqid)
 {
+	unsigned int chip_ver = CHELSIO_CHIP_VERSION(adapter->params.chip);
 	struct sge *s = &adapter->sge;
 	int ret, nentries;
 	struct fw_eq_eth_cmd cmd, rpl;
@@ -2448,17 +2449,19 @@ int t4vf_sge_alloc_eth_txq(struct adapter *adapter, struct sge_eth_txq *txq,
 	cmd.alloc_to_len16 = cpu_to_be32(FW_EQ_ETH_CMD_ALLOC_F |
 					 FW_EQ_ETH_CMD_EQSTART_F |
 					 FW_LEN16(cmd));
-	cmd.viid_pkd = cpu_to_be32(FW_EQ_ETH_CMD_AUTOEQUEQE_F |
-				   FW_EQ_ETH_CMD_VIID_V(pi->viid));
+	cmd.autoequiqe_to_viid = cpu_to_be32(FW_EQ_ETH_CMD_AUTOEQUEQE_F |
+					     FW_EQ_ETH_CMD_VIID_V(pi->viid));
 	cmd.fetchszm_to_iqid =
 		cpu_to_be32(FW_EQ_ETH_CMD_HOSTFCMODE_V(SGE_HOSTFCMODE_STPG) |
 			    FW_EQ_ETH_CMD_PCIECHN_V(pi->port_id) |
 			    FW_EQ_ETH_CMD_IQID_V(iqid));
 	cmd.dcaen_to_eqsize =
-		cpu_to_be32(FW_EQ_ETH_CMD_FBMIN_V(SGE_FETCHBURSTMIN_64B) |
-			    FW_EQ_ETH_CMD_FBMAX_V(SGE_FETCHBURSTMAX_512B) |
+		cpu_to_be32(FW_EQ_ETH_CMD_FBMIN_V(chip_ver <= CHELSIO_T5
+						  ? FETCHBURSTMIN_64B_X
+						  : FETCHBURSTMIN_64B_T6_X) |
+			    FW_EQ_ETH_CMD_FBMAX_V(FETCHBURSTMAX_512B_X) |
 			    FW_EQ_ETH_CMD_CIDXFTHRESH_V(
-						SGE_CIDXFLUSHTHRESH_32) |
+						CIDXFLUSHTHRESH_32_X) |
 			    FW_EQ_ETH_CMD_EQSIZE_V(nentries));
 	cmd.eqaddr = cpu_to_be64(txq->q.phys_addr);
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next 2/2] cxgb4: Add capability to get/set SGE Doorbell Queue Timer Tick
From: Vishal Kulkarni @ 2019-02-13  5:14 UTC (permalink / raw)
  To: netdev, davem; +Cc: nirranjan, indranil, dt, Vishal Kulkarni
In-Reply-To: <1550034844-10850-1-git-send-email-vishal@chelsio.com>

This patch gets/sets SGE Doorbell Queue timer ticks via ethtool

Original work by: Casey Leedom <leedom@chelsio.com>

Signed-off-by: Vishal Kulkarni <vishal@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h         |   1 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c | 187 ++++++++++++++++++++-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c    |  14 +-
 3 files changed, 196 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 68d0d45..b7b0eb1 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -819,6 +819,7 @@ struct sge {
 	u16 nqs_per_uld;	    /* # of Rx queues per ULD */
 	u16 timer_val[SGE_NTIMERS];
 	u8 counter_val[SGE_NCOUNTERS];
+	u16 dbqtimer_tick;
 	u16 dbqtimer_val[SGE_NDBQTIMERS];
 	u32 fl_pg_order;            /* large page allocation size */
 	u32 stat_len;               /* length of status page at ring end */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
index 7960435..c075f65 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_ethtool.c
@@ -932,11 +932,188 @@ static int get_adaptive_rx_setting(struct net_device *dev)
 	return q->rspq.adaptive_rx;
 }
 
-static int set_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
+/* Return the current global Adapter SGE Doorbell Queue Timer Tick for all
+ * Ethernet TX Queues.
+ */
+static int get_dbqtimer_tick(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+
+	if (!(adap->flags & SGE_DBQ_TIMER))
+		return 0;
+
+	return adap->sge.dbqtimer_tick;
+}
+
+/* Return the SGE Doorbell Queue Timer Value for the Ethernet TX Queues
+ * associated with a Network Device.
+ */
+static int get_dbqtimer(struct net_device *dev)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	struct sge_eth_txq *txq = &adap->sge.ethtxq[pi->first_qset];
+
+	if (!(adap->flags & SGE_DBQ_TIMER))
+		return 0;
+
+	/* all of the TX Queues use the same Timer Index */
+	return adap->sge.dbqtimer_val[txq->dbqtimerix];
+}
+
+/* Set the global Adapter SGE Doorbell Queue Timer Tick for all Ethernet TX
+ * Queues.  This is the fundamental "Tick" that sets the scale of values which
+ * can be used.  Individual Ethernet TX Queues index into a relatively small
+ * array of Tick Multipliers.  Changing the base Tick will thus change all of
+ * the resulting Timer Values associated with those multipliers for all
+ * Ethernet TX Queues.
+ */
+static int set_dbqtimer_tick(struct net_device *dev, int usecs)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	struct sge *s = &adap->sge;
+	u32 param, val;
+	int ret;
+
+	if (!(adap->flags & SGE_DBQ_TIMER))
+		return 0;
+
+	/* return early if it's the same Timer Tick we're already using */
+	if (s->dbqtimer_tick == usecs)
+		return 0;
+
+	/* attempt to set the new Timer Tick value */
+	param = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
+		 FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_DBQ_TIMERTICK));
+	val = usecs;
+	ret = t4_set_params(adap, adap->mbox, adap->pf, 0, 1, &param, &val);
+	if (ret)
+		return ret;
+	s->dbqtimer_tick = usecs;
+
+	/* if successful, reread resulting dependent Timer values */
+	ret = t4_read_sge_dbqtimers(adap, ARRAY_SIZE(s->dbqtimer_val),
+				    s->dbqtimer_val);
+	return ret;
+}
+
+/* Set the SGE Doorbell Queue Timer Value for the Ethernet TX Queues
+ * associated with a Network Device.  There is a relatively small array of
+ * possible Timer Values so we need to pick the closest value available.
+ */
+static int set_dbqtimer(struct net_device *dev, int usecs)
 {
-	set_adaptive_rx_setting(dev, c->use_adaptive_rx_coalesce);
-	return set_rx_intr_params(dev, c->rx_coalesce_usecs,
-				  c->rx_max_coalesced_frames);
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	struct sge *s = &adap->sge;
+	struct sge_eth_txq *txq;
+	u32 param, val;
+	int qix, timerix, min_timerix, delta, min_delta;
+	int ret;
+
+	if (!(adap->flags & SGE_DBQ_TIMER))
+		return 0;
+
+	/* Find the SGE Doorbell Timer Value that's closest to the requested
+	 * value.
+	 */
+	min_delta = INT_MAX;
+	min_timerix = 0;
+	for (timerix = 0; timerix < ARRAY_SIZE(s->dbqtimer_val); timerix++) {
+		delta = s->dbqtimer_val[timerix] - usecs;
+		if (delta < 0)
+			delta = -delta;
+		if (delta < min_delta) {
+			min_delta = delta;
+			min_timerix = timerix;
+		}
+	}
+
+	/* Return early if it's the same Timer Index we're already using.
+	 * We use the same Timer Index for all of the TX Queues for an
+	 * interface so it's only necessary to check the first one.
+	 */
+	txq = &s->ethtxq[pi->first_qset];
+	if (txq->dbqtimerix == min_timerix)
+		return 0;
+
+	for (qix = 0; qix < pi->nqsets; qix++, txq++) {
+		if (adap->flags & FULL_INIT_DONE) {
+			param =
+			 (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DMAQ) |
+			  FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DMAQ_EQ_TIMERIX) |
+			  FW_PARAMS_PARAM_YZ_V(txq->q.cntxt_id));
+			val = min_timerix;
+			ret = t4_set_params(adap, adap->mbox, adap->pf, 0,
+					    1, &param, &val);
+			if (ret)
+				return ret;
+		}
+		txq->dbqtimerix = min_timerix;
+	}
+	return 0;
+}
+
+/* Set the global Adapter SGE Doorbell Queue Timer Tick for all Ethernet TX
+ * Queues and the Timer Value for the Ethernet TX Queues associated with a
+ * Network Device.  Since changing the global Tick changes all of the
+ * available Timer Values, we need to do this first before selecting the
+ * resulting closest Timer Value.  Moreover, since the Tick is global,
+ * changing it affects the Timer Values for all Network Devices on the
+ * adapter.  So, before changing the Tick, we grab all of the current Timer
+ * Values for other Network Devices on this Adapter and then attempt to select
+ * new Timer Values which are close to the old values ...
+ */
+static int set_dbqtimer_tickval(struct net_device *dev,
+				int tick_usecs, int timer_usecs)
+{
+	struct port_info *pi = netdev_priv(dev);
+	struct adapter *adap = pi->adapter;
+	int timer[MAX_NPORTS];
+	unsigned int port;
+	int ret;
+
+	/* Grab the other adapter Network Interface current timers and fill in
+	 * the new one for this Network Interface.
+	 */
+	for_each_port(adap, port)
+		if (port == pi->port_id)
+			timer[port] = timer_usecs;
+		else
+			timer[port] = get_dbqtimer(adap->port[port]);
+
+	/* Change the global Tick first ... */
+	ret = set_dbqtimer_tick(dev, tick_usecs);
+	if (ret)
+		return ret;
+
+	/* ... and then set all of the Network Interface Timer Values ... */
+	for_each_port(adap, port) {
+		ret = set_dbqtimer(adap->port[port], timer[port]);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+static int set_coalesce(struct net_device *dev,
+			struct ethtool_coalesce *coalesce)
+{
+	int ret;
+
+	set_adaptive_rx_setting(dev, coalesce->use_adaptive_rx_coalesce);
+
+	ret = set_rx_intr_params(dev, coalesce->rx_coalesce_usecs,
+				 coalesce->rx_max_coalesced_frames);
+	if (ret)
+		return ret;
+
+	return set_dbqtimer_tickval(dev,
+				    coalesce->tx_coalesce_usecs_irq,
+				    coalesce->tx_coalesce_usecs);
 }
 
 static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
@@ -949,6 +1126,8 @@ static int get_coalesce(struct net_device *dev, struct ethtool_coalesce *c)
 	c->rx_max_coalesced_frames = (rq->intr_params & QINTR_CNT_EN_F) ?
 		adap->sge.counter_val[rq->pktcnt_idx] : 0;
 	c->use_adaptive_rx_coalesce = get_adaptive_rx_setting(dev);
+	c->tx_coalesce_usecs_irq = get_dbqtimer_tick(dev);
+	c->tx_coalesce_usecs = get_dbqtimer(dev);
 	return 0;
 }
 
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index bdd11a6..bcbac24 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -4331,8 +4331,18 @@ static int adap_init0(struct adapter *adap)
 	/* Grab the SGE Doorbell Queue Timer values.  If successful, that
 	 * indicates that the Firmware and Hardware support this.
 	 */
-	ret = t4_read_sge_dbqtimers(adap, ARRAY_SIZE(adap->sge.dbqtimer_val),
-				    adap->sge.dbqtimer_val);
+	params[0] = (FW_PARAMS_MNEM_V(FW_PARAMS_MNEM_DEV) |
+		    FW_PARAMS_PARAM_X_V(FW_PARAMS_PARAM_DEV_DBQ_TIMERTICK));
+	ret = t4_query_params(adap, adap->mbox, adap->pf, 0,
+			      1, params, val);
+
+	if (!ret) {
+		adap->sge.dbqtimer_tick = val[0];
+		ret = t4_read_sge_dbqtimers(adap,
+					    ARRAY_SIZE(adap->sge.dbqtimer_val),
+					    adap->sge.dbqtimer_val);
+	}
+
 	if (!ret)
 		adap->flags |= SGE_DBQ_TIMER;
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH 1/2] mac80211: Use linked list instead of rhashtable walk for mesh tables
From: Herbert Xu @ 2019-02-13  5:16 UTC (permalink / raw)
  To: David Miller, johannes, linux-wireless, netdev, j, tgraf,
	johannes.berg
In-Reply-To: <20190213050551.x3jffq3ipghw6g2m@gondor.apana.org.au>

The mesh table code walks over hash tables for two purposes.  First of
all it's used as part of a netlink dump process, but it is also used
for looking up entries to delete using criteria other than the hash
key.

The second purpose is directly contrary to the design specification
of rhashtable walks.  It is only meant for use by netlink dumps.

This is because rhashtable is resizable and you cannot obtain a
stable walk over it during a resize process.

In fact mesh's use of rhashtable for dumping is bogus too.  Rather
than using rhashtable walk's iterator to keep track of the current
position, it always converts the current position to an integer
which defeats the purpose of the iterator.

Therefore this patch converts all uses of rhashtable walk into a
simple linked list.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 net/mac80211/mesh.h         |    4 +
 net/mac80211/mesh_pathtbl.c |  119 +++++++-------------------------------------
 2 files changed, 24 insertions(+), 99 deletions(-)

diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index cad6592c52a1..d8e626d66055 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -70,6 +70,7 @@ enum mesh_deferred_task_flags {
  * @dst: mesh path destination mac address
  * @mpp: mesh proxy mac address
  * @rhash: rhashtable list pointer
+ * @walk_list: linked list containing all mesh_path objects.
  * @gate_list: list pointer for known gates list
  * @sdata: mesh subif
  * @next_hop: mesh neighbor to which frames for this destination will be
@@ -105,6 +106,7 @@ struct mesh_path {
 	u8 dst[ETH_ALEN];
 	u8 mpp[ETH_ALEN];	/* used for MPP or MAP */
 	struct rhash_head rhash;
+	struct hlist_node walk_list;
 	struct hlist_node gate_list;
 	struct ieee80211_sub_if_data *sdata;
 	struct sta_info __rcu *next_hop;
@@ -133,12 +135,14 @@ struct mesh_path {
  * gate's mpath may or may not be resolved and active.
  * @gates_lock: protects updates to known_gates
  * @rhead: the rhashtable containing struct mesh_paths, keyed by dest addr
+ * @walk_head: linked list containging all mesh_path objects
  * @entries: number of entries in the table
  */
 struct mesh_table {
 	struct hlist_head known_gates;
 	spinlock_t gates_lock;
 	struct rhashtable rhead;
+	struct hlist_head walk_head;
 	atomic_t entries;		/* Up to MAX_MESH_NEIGHBOURS */
 };
 
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index a5125624a76d..984ed433a8bc 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -249,28 +249,15 @@ mpp_path_lookup(struct ieee80211_sub_if_data *sdata, const u8 *dst)
 static struct mesh_path *
 __mesh_path_lookup_by_idx(struct mesh_table *tbl, int idx)
 {
-	int i = 0, ret;
-	struct mesh_path *mpath = NULL;
-	struct rhashtable_iter iter;
-
-	ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC);
-	if (ret)
-		return NULL;
-
-	rhashtable_walk_start(&iter);
+	int i = 0;
+	struct mesh_path *mpath;
 
-	while ((mpath = rhashtable_walk_next(&iter))) {
-		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
-			continue;
-		if (IS_ERR(mpath))
-			break;
+	hlist_for_each_entry(mpath, &tbl->walk_head, walk_list) {
 		if (i++ == idx)
 			break;
 	}
-	rhashtable_walk_stop(&iter);
-	rhashtable_walk_exit(&iter);
 
-	if (IS_ERR(mpath) || !mpath)
+	if (!mpath)
 		return NULL;
 
 	if (mpath_expired(mpath)) {
@@ -441,7 +428,8 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata,
 			mpath = rhashtable_lookup_fast(&tbl->rhead,
 						       dst,
 						       mesh_rht_params);
-
+		else if (!ret)
+			hlist_add_head(&new_mpath->walk_list, &tbl->walk_head);
 	} while (unlikely(ret == -EEXIST && !mpath));
 
 	if (ret && ret != -EEXIST)
@@ -483,6 +471,8 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata,
 	ret = rhashtable_lookup_insert_fast(&tbl->rhead,
 					    &new_mpath->rhash,
 					    mesh_rht_params);
+	if (!ret)
+		hlist_add_head(&new_mpath->walk_list, &tbl->walk_head);
 
 	sdata->u.mesh.mpp_paths_generation++;
 	return ret;
@@ -503,20 +493,8 @@ void mesh_plink_broken(struct sta_info *sta)
 	struct mesh_table *tbl = sdata->u.mesh.mesh_paths;
 	static const u8 bcast[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
 	struct mesh_path *mpath;
-	struct rhashtable_iter iter;
-	int ret;
-
-	ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC);
-	if (ret)
-		return;
-
-	rhashtable_walk_start(&iter);
 
-	while ((mpath = rhashtable_walk_next(&iter))) {
-		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
-			continue;
-		if (IS_ERR(mpath))
-			break;
+	hlist_for_each_entry(mpath, &tbl->walk_head, walk_list) {
 		if (rcu_access_pointer(mpath->next_hop) == sta &&
 		    mpath->flags & MESH_PATH_ACTIVE &&
 		    !(mpath->flags & MESH_PATH_FIXED)) {
@@ -530,8 +508,6 @@ void mesh_plink_broken(struct sta_info *sta)
 				WLAN_REASON_MESH_PATH_DEST_UNREACHABLE, bcast);
 		}
 	}
-	rhashtable_walk_stop(&iter);
-	rhashtable_walk_exit(&iter);
 }
 
 static void mesh_path_free_rcu(struct mesh_table *tbl,
@@ -551,6 +527,7 @@ static void mesh_path_free_rcu(struct mesh_table *tbl,
 
 static void __mesh_path_del(struct mesh_table *tbl, struct mesh_path *mpath)
 {
+	hlist_del(&mpath->walk_list);
 	rhashtable_remove_fast(&tbl->rhead, &mpath->rhash, mesh_rht_params);
 	mesh_path_free_rcu(tbl, mpath);
 }
@@ -571,27 +548,12 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta)
 	struct ieee80211_sub_if_data *sdata = sta->sdata;
 	struct mesh_table *tbl = sdata->u.mesh.mesh_paths;
 	struct mesh_path *mpath;
-	struct rhashtable_iter iter;
-	int ret;
-
-	ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC);
-	if (ret)
-		return;
-
-	rhashtable_walk_start(&iter);
-
-	while ((mpath = rhashtable_walk_next(&iter))) {
-		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
-			continue;
-		if (IS_ERR(mpath))
-			break;
+	struct hlist_node *n;
 
+	hlist_for_each_entry_safe(mpath, n, &tbl->walk_head, walk_list) {
 		if (rcu_access_pointer(mpath->next_hop) == sta)
 			__mesh_path_del(tbl, mpath);
 	}
-
-	rhashtable_walk_stop(&iter);
-	rhashtable_walk_exit(&iter);
 }
 
 static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata,
@@ -599,51 +561,22 @@ static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata,
 {
 	struct mesh_table *tbl = sdata->u.mesh.mpp_paths;
 	struct mesh_path *mpath;
-	struct rhashtable_iter iter;
-	int ret;
-
-	ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC);
-	if (ret)
-		return;
-
-	rhashtable_walk_start(&iter);
-
-	while ((mpath = rhashtable_walk_next(&iter))) {
-		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
-			continue;
-		if (IS_ERR(mpath))
-			break;
+	struct hlist_node *n;
 
+	hlist_for_each_entry_safe(mpath, n, &tbl->walk_head, walk_list) {
 		if (ether_addr_equal(mpath->mpp, proxy))
 			__mesh_path_del(tbl, mpath);
 	}
-
-	rhashtable_walk_stop(&iter);
-	rhashtable_walk_exit(&iter);
 }
 
 static void table_flush_by_iface(struct mesh_table *tbl)
 {
 	struct mesh_path *mpath;
-	struct rhashtable_iter iter;
-	int ret;
-
-	ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_ATOMIC);
-	if (ret)
-		return;
-
-	rhashtable_walk_start(&iter);
+	struct hlist_node *n;
 
-	while ((mpath = rhashtable_walk_next(&iter))) {
-		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
-			continue;
-		if (IS_ERR(mpath))
-			break;
+	hlist_for_each_entry_safe(mpath, n, &tbl->walk_head, walk_list) {
 		__mesh_path_del(tbl, mpath);
 	}
-
-	rhashtable_walk_stop(&iter);
-	rhashtable_walk_exit(&iter);
 }
 
 /**
@@ -838,6 +771,8 @@ int mesh_pathtbl_init(struct ieee80211_sub_if_data *sdata)
 
 	rhashtable_init(&tbl_path->rhead, &mesh_rht_params);
 	rhashtable_init(&tbl_mpp->rhead, &mesh_rht_params);
+	INIT_HLIST_HEAD(&tbl_path->walk_head);
+	INIT_HLIST_HEAD(&tbl_mpp->walk_head);
 
 	sdata->u.mesh.mesh_paths = tbl_path;
 	sdata->u.mesh.mpp_paths = tbl_mpp;
@@ -854,28 +789,14 @@ void mesh_path_tbl_expire(struct ieee80211_sub_if_data *sdata,
 			  struct mesh_table *tbl)
 {
 	struct mesh_path *mpath;
-	struct rhashtable_iter iter;
-	int ret;
-
-	ret = rhashtable_walk_init(&tbl->rhead, &iter, GFP_KERNEL);
-	if (ret)
-		return;
-
-	rhashtable_walk_start(&iter);
+	struct hlist_node *n;
 
-	while ((mpath = rhashtable_walk_next(&iter))) {
-		if (IS_ERR(mpath) && PTR_ERR(mpath) == -EAGAIN)
-			continue;
-		if (IS_ERR(mpath))
-			break;
+	hlist_for_each_entry_safe(mpath, n, &tbl->walk_head, walk_list) {
 		if ((!(mpath->flags & MESH_PATH_RESOLVING)) &&
 		    (!(mpath->flags & MESH_PATH_FIXED)) &&
 		     time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE))
 			__mesh_path_del(tbl, mpath);
 	}
-
-	rhashtable_walk_stop(&iter);
-	rhashtable_walk_exit(&iter);
 }
 
 void mesh_path_expire(struct ieee80211_sub_if_data *sdata)

^ permalink raw reply related

* [PATCH 2/2] mac80211: Free mpath object when rhashtable insertion fails
From: Herbert Xu @ 2019-02-13  5:16 UTC (permalink / raw)
  To: David Miller, johannes, linux-wireless, netdev, j, tgraf,
	johannes.berg
In-Reply-To: <20190213050551.x3jffq3ipghw6g2m@gondor.apana.org.au>

When rhashtable insertion fails the mesh table code doesn't free
the now-orphan mesh path object.  This patch fixes that.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 net/mac80211/mesh_pathtbl.c |   11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 984ed433a8bc..93fdd9a7d72c 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -432,17 +432,18 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata,
 			hlist_add_head(&new_mpath->walk_list, &tbl->walk_head);
 	} while (unlikely(ret == -EEXIST && !mpath));
 
-	if (ret && ret != -EEXIST)
+	if (ret)
+		kfree(new_mpath);
+
+	if (ret != -EEXIST)
 		return ERR_PTR(ret);
 
 	/* At this point either new_mpath was added, or we found a
 	 * matching entry already in the table; in the latter case
 	 * free the unnecessary new entry.
 	 */
-	if (ret == -EEXIST) {
-		kfree(new_mpath);
+	if (ret == -EEXIST)
 		new_mpath = mpath;
-	}
 	sdata->u.mesh.mesh_paths_generation++;
 	return new_mpath;
 }
@@ -473,6 +474,8 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata,
 					    mesh_rht_params);
 	if (!ret)
 		hlist_add_head(&new_mpath->walk_list, &tbl->walk_head);
+	else
+		kfree(new_mpath);
 
 	sdata->u.mesh.mpp_paths_generation++;
 	return ret;

^ permalink raw reply related

* [PATCH net-next] cxgb4vf: Few more link management changes.
From: Vishal Kulkarni @ 2019-02-13  5:18 UTC (permalink / raw)
  To: netdev, davem; +Cc: nirranjan, indranil, dt, Vishal Kulkarni

CR4_QSFP 10G Speed technology should be 10000baseKR_Full
And also report available FEC modes.

Signed-off-by: Vishal Kulkarni <vishal@chelsio.com>
---
 drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
index 84be331..3300b69 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/cxgb4vf_main.c
@@ -1408,7 +1408,7 @@ static void fw_caps_to_lmm(enum fw_port_type port_type,
 	case FW_PORT_TYPE_CR4_QSFP:
 		SET_LMM(FIBRE);
 		FW_CAPS_TO_LMM(SPEED_1G,  1000baseT_Full);
-		FW_CAPS_TO_LMM(SPEED_10G, 10000baseSR_Full);
+		FW_CAPS_TO_LMM(SPEED_10G, 10000baseKR_Full);
 		FW_CAPS_TO_LMM(SPEED_40G, 40000baseSR4_Full);
 		FW_CAPS_TO_LMM(SPEED_25G, 25000baseCR_Full);
 		FW_CAPS_TO_LMM(SPEED_50G, 50000baseCR2_Full);
@@ -1419,6 +1419,13 @@ static void fw_caps_to_lmm(enum fw_port_type port_type,
 		break;
 	}
 
+	if (fw_caps & FW_PORT_CAP32_FEC_V(FW_PORT_CAP32_FEC_M)) {
+		FW_CAPS_TO_LMM(FEC_RS, FEC_RS);
+		FW_CAPS_TO_LMM(FEC_BASER_RS, FEC_BASER);
+	} else {
+		SET_LMM(FEC_NONE);
+	}
+
 	FW_CAPS_TO_LMM(ANEG, Autoneg);
 	FW_CAPS_TO_LMM(802_3_PAUSE, Pause);
 	FW_CAPS_TO_LMM(802_3_ASM_DIR, Asym_Pause);
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH] mac80211: Use rhashtable_lookup_get_insert_fast instead of racy code
From: Herbert Xu @ 2019-02-13  5:25 UTC (permalink / raw)
  To: David Miller, johannes, linux-wireless, netdev, j, tgraf,
	johannes.berg
In-Reply-To: <20190213050551.x3jffq3ipghw6g2m@gondor.apana.org.au>

On Wed, Feb 13, 2019 at 01:05:51PM +0800, Herbert Xu wrote:
> Hi:
> 
> This patch fixes a number of issues with the use of the rhashtable API
> in mac80211.  First of all it converts the use of rashtable walks over
> to a simple linked list.  This is because an rhashtable walk is
> inherently unstable and not meant for uses that require stability,
> e.g., when you're trying to lookup an object to delete.
> 
> It also fixes a potential memory leak when the rhashtable insertion
> fails (which can occur due to OOM).

On top these two fixes here is one more clean-up patch to use an
existing rhashtable API instead of rolling its own racy version:

---8<---
The code in mesh_path_add tries to handle the case where a duplicate
entry is added to the rhashtable by doing a lookup after a failed
insertion.  It also tries to handle races by repeating the insertion
should the lookup fail.

This is now unnecessary as we have rhashtable API functions that can
directly return the mathcing object.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index 93fdd9a7d72c..b06f066bfa01 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -402,7 +402,6 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata,
 {
 	struct mesh_table *tbl;
 	struct mesh_path *mpath, *new_mpath;
-	int ret;
 
 	if (ether_addr_equal(dst, sdata->vif.addr))
 		/* never add ourselves as neighbours */
@@ -419,31 +418,21 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata,
 		return ERR_PTR(-ENOMEM);
 
 	tbl = sdata->u.mesh.mesh_paths;
-	do {
-		ret = rhashtable_lookup_insert_fast(&tbl->rhead,
-						    &new_mpath->rhash,
-						    mesh_rht_params);
-
-		if (ret == -EEXIST)
-			mpath = rhashtable_lookup_fast(&tbl->rhead,
-						       dst,
-						       mesh_rht_params);
-		else if (!ret)
-			hlist_add_head(&new_mpath->walk_list, &tbl->walk_head);
-	} while (unlikely(ret == -EEXIST && !mpath));
-
-	if (ret)
+	mpath = rhashtable_lookup_get_insert_fast(&tbl->rhead,
+						  &new_mpath->rhash,
+						  mesh_rht_params);
+	if (!mpath)
+		hlist_add_head(&new_mpath->walk_list, &tbl->walk_head);
+
+	if (mpath) {
 		kfree(new_mpath);
 
-	if (ret != -EEXIST)
-		return ERR_PTR(ret);
+		if (IS_ERR(mpath))
+			return mpath;
 
-	/* At this point either new_mpath was added, or we found a
-	 * matching entry already in the table; in the latter case
-	 * free the unnecessary new entry.
-	 */
-	if (ret == -EEXIST)
 		new_mpath = mpath;
+	}
+
 	sdata->u.mesh.mesh_paths_generation++;
 	return new_mpath;
 }
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related

* [PATCH] rhashtable: Remove obsolete rhashtable_walk_init function
From: Herbert Xu @ 2019-02-13  5:34 UTC (permalink / raw)
  To: David Miller, johannes, linux-wireless, netdev, j, tgraf,
	johannes.berg
In-Reply-To: <20190213050551.x3jffq3ipghw6g2m@gondor.apana.org.au>

Here is another patch on top the fixes to mac80211 to finally
remove the obsolete rhashtable_walk_init API.

---8<---
The rhashtable_walk_init function has been obsolete for more than
two years.  This patch finally converts its last users over to
rhashtable_walk_enter and removes it.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 20f9c6af7473..ae9c0f71f311 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -1113,14 +1113,6 @@ static inline int rhashtable_replace_fast(
 	return err;
 }
 
-/* Obsolete function, do not use in new code. */
-static inline int rhashtable_walk_init(struct rhashtable *ht,
-				       struct rhashtable_iter *iter, gfp_t gfp)
-{
-	rhashtable_walk_enter(ht, iter);
-	return 0;
-}
-
 /**
  * rhltable_walk_enter - Initialise an iterator
  * @hlt:	Table to walk over
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 852ffa5160f1..0a105d4af166 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -682,7 +682,7 @@ EXPORT_SYMBOL_GPL(rhashtable_walk_enter);
  * rhashtable_walk_exit - Free an iterator
  * @iter:	Hash table Iterator
  *
- * This function frees resources allocated by rhashtable_walk_init.
+ * This function frees resources allocated by rhashtable_walk_enter.
  */
 void rhashtable_walk_exit(struct rhashtable_iter *iter)
 {
diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 6a8ac7626797..d96962eea942 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -177,16 +177,11 @@ static int __init test_rht_lookup(struct rhashtable *ht, struct test_obj *array,
 
 static void test_bucket_stats(struct rhashtable *ht, unsigned int entries)
 {
-	unsigned int err, total = 0, chain_len = 0;
+	unsigned int total = 0, chain_len = 0;
 	struct rhashtable_iter hti;
 	struct rhash_head *pos;
 
-	err = rhashtable_walk_init(ht, &hti, GFP_KERNEL);
-	if (err) {
-		pr_warn("Test failed: allocation error");
-		return;
-	}
-
+	rhashtable_walk_enter(ht, &hti);
 	rhashtable_walk_start(&hti);
 
 	while ((pos = rhashtable_walk_next(&hti))) {
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 17c455ff69ff..ae6cd4cef8db 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -385,10 +385,7 @@ int ila_xlat_nl_cmd_flush(struct sk_buff *skb, struct genl_info *info)
 	spinlock_t *lock;
 	int ret;
 
-	ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter, GFP_KERNEL);
-	if (ret)
-		goto done;
-
+	rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter);
 	rhashtable_walk_start(&iter);
 
 	for (;;) {
@@ -509,23 +506,17 @@ int ila_xlat_nl_dump_start(struct netlink_callback *cb)
 	struct net *net = sock_net(cb->skb->sk);
 	struct ila_net *ilan = net_generic(net, ila_net_id);
 	struct ila_dump_iter *iter;
-	int ret;
 
 	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter)
 		return -ENOMEM;
 
-	ret = rhashtable_walk_init(&ilan->xlat.rhash_table, &iter->rhiter,
-				   GFP_KERNEL);
-	if (ret) {
-		kfree(iter);
-		return ret;
-	}
+	rhashtable_walk_enter(&ilan->xlat.rhash_table, &iter->rhiter);
 
 	iter->skip = 0;
 	cb->args[0] = (long)iter;
 
-	return ret;
+	return 0;
 }
 
 int ila_xlat_nl_dump_done(struct netlink_callback *cb)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 3c023d6120f6..f369cc751cea 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2541,15 +2541,7 @@ struct nl_seq_iter {
 
 static int netlink_walk_start(struct nl_seq_iter *iter)
 {
-	int err;
-
-	err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti,
-				   GFP_KERNEL);
-	if (err) {
-		iter->link = MAX_LINKS;
-		return err;
-	}
-
+	rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti);
 	rhashtable_walk_start(&iter->hti);
 
 	return 0;
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox