* [PATCH v3 net-next] bnx2x: ethtool -x support for rss_key
From: Eric Dumazet @ 2016-12-21 17:48 UTC (permalink / raw)
To: Mintz, Yuval; +Cc: David Miller, netdev, Ariel Elior
In-Reply-To: <BL2PR07MB2306F104F51FD4494FDC172E8D830@BL2PR07MB2306.namprd07.prod.outlook.com>
From: Eric Dumazet <edumazet@google.com>
Implement ethtool -x full support, so that rss key can be fetched
instead of assuming it matches /proc/sys/net/core/netdev_rss_key
content.
We might add "ethtool --rxfh" support later to set a different rss key.
Tested:
lpk51:~# ethtool --show-rxfh eth0 | grep -A1 'hash key'
RSS hash key:
f0:7e:81:59:c7:c9:5b:26:f7:14:7e:9e:29:01:50:99:b6:e9:8d:e1:3a:13:0c:54:71:cf:9d:72:f5:d6:c7:e0:fd:26:e2:7c:52:db:ea:e8
lpk51:~# cat /proc/sys/net/core/netdev_rss_key
f0:7e:81:59:c7:c9:5b:26:f7:14:7e:9e:29:01:50:99:b6:e9:8d:e1:3a:13:0c:54:71:cf:9d:72:f5:d6:c7:e0:fd:26:e2:7c:52:db:ea:e8:f0:89:dc:69:2f:fa:c7:71:30:a0:0c:7a
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ariel Elior <ariel.elior@qlogic.com>
Cc: Yuval Mintz <Yuval.Mintz@cavium.com>
---
net-next is closed, merge can wait !
v3: Addressed Yuval feedback given on v2 (I hope I understood well)
v2: support CONFIG_BNX2X_SRIOV=y
drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c | 45 ++++++----
drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c | 9 +-
drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h | 4
drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c | 2
4 files changed, 42 insertions(+), 18 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index 5f19427c7b278b99ee34a20b4d854e1d1293..5f31d0d52d3cdf02ded6670e032d335ac32c 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -3429,6 +3429,13 @@ static u32 bnx2x_get_rxfh_indir_size(struct net_device *dev)
return T_ETH_INDIRECTION_TABLE_SIZE;
}
+static u32 bnx2x_get_rxfh_key_size(struct net_device *dev)
+{
+ struct bnx2x *bp = netdev_priv(dev);
+
+ return (bp->port.pmf || !CHIP_IS_E1x(bp)) ? T_ETH_RSS_KEY * 4 : 0;
+}
+
static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
u8 *hfunc)
{
@@ -3438,23 +3445,28 @@ static int bnx2x_get_rxfh(struct net_device *dev, u32 *indir, u8 *key,
if (hfunc)
*hfunc = ETH_RSS_HASH_TOP;
- if (!indir)
- return 0;
- /* Get the current configuration of the RSS indirection table */
- bnx2x_get_rss_ind_table(&bp->rss_conf_obj, ind_table);
-
- /*
- * We can't use a memcpy() as an internal storage of an
- * indirection table is a u8 array while indir->ring_index
- * points to an array of u32.
- *
- * Indirection table contains the FW Client IDs, so we need to
- * align the returned table to the Client ID of the leading RSS
- * queue.
- */
- for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++)
- indir[i] = ind_table[i] - bp->fp->cl_id;
+ if (key) {
+ WARN_ON_ONCE(bnx2x_get_rxfh_key_size(dev) != T_ETH_RSS_KEY * 4);
+ bnx2x_get_rss_key(&bp->rss_conf_obj, key);
+ }
+
+ if (indir) {
+ /* Get the current configuration of the RSS indirection table */
+ bnx2x_get_rss_ind_table(&bp->rss_conf_obj, ind_table);
+
+ /*
+ * We can't use a memcpy() as an internal storage of an
+ * indirection table is a u8 array while indir->ring_index
+ * points to an array of u32.
+ *
+ * Indirection table contains the FW Client IDs, so we need to
+ * align the returned table to the Client ID of the leading RSS
+ * queue.
+ */
+ for (i = 0; i < T_ETH_INDIRECTION_TABLE_SIZE; i++)
+ indir[i] = ind_table[i] - bp->fp->cl_id;
+ }
return 0;
}
@@ -3636,6 +3648,7 @@ static const struct ethtool_ops bnx2x_ethtool_ops = {
.get_ethtool_stats = bnx2x_get_ethtool_stats,
.get_rxnfc = bnx2x_get_rxnfc,
.set_rxnfc = bnx2x_set_rxnfc,
+ .get_rxfh_key_size = bnx2x_get_rxfh_key_size,
.get_rxfh_indir_size = bnx2x_get_rxfh_indir_size,
.get_rxfh = bnx2x_get_rxfh,
.set_rxfh = bnx2x_set_rxfh,
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
index cea6bdcde33f0be220815378142ea9e8dfc7..08ba33ccef87ba1f0d29ae4bdd48261fb901 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c
@@ -4538,9 +4538,11 @@ static int bnx2x_setup_rss(struct bnx2x *bp,
/* RSS keys */
if (test_bit(BNX2X_RSS_SET_SRCH, &p->rss_flags)) {
u8 *dst = (u8 *)(data->rss_key) + sizeof(data->rss_key);
- const u8 *src = (const u8 *)p->rss_key;
+ const u8 *src = (const u8 *)o->rss_key;
int i;
+ /* Remember the last configuration */
+ memcpy(o->rss_key, p->rss_key, T_ETH_RSS_KEY * 4);
/* Apparently, bnx2x reads this array in reverse order
* We need to byte swap rss_key to comply with Toeplitz specs.
*/
@@ -4596,6 +4598,11 @@ void bnx2x_get_rss_ind_table(struct bnx2x_rss_config_obj *rss_obj,
memcpy(ind_table, rss_obj->ind_table, sizeof(rss_obj->ind_table));
}
+void bnx2x_get_rss_key(const struct bnx2x_rss_config_obj *rss_obj, u8 *key)
+{
+ memcpy(key, rss_obj->rss_key, T_ETH_RSS_KEY * 4);
+}
+
int bnx2x_config_rss(struct bnx2x *bp,
struct bnx2x_config_rss_params *p)
{
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
index 0bf2fd470819e64d2d7788b57caee6569b9b..2244107e1da103559c899b1774b815bcc0f9 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.h
@@ -760,6 +760,8 @@ struct bnx2x_rss_config_obj {
/* Last configured indirection table */
u8 ind_table[T_ETH_INDIRECTION_TABLE_SIZE];
+ u32 rss_key[T_ETH_RSS_KEY];
+
/* flags for enabling 4-tupple hash on UDP */
u8 udp_rss_v4;
u8 udp_rss_v6;
@@ -1530,6 +1532,8 @@ int bnx2x_config_rss(struct bnx2x *bp,
void bnx2x_get_rss_ind_table(struct bnx2x_rss_config_obj *rss_obj,
u8 *ind_table);
+void bnx2x_get_rss_key(const struct bnx2x_rss_config_obj *rss_obj, u8 *key);
+
#define PF_MAC_CREDIT_E2(bp, func_num) \
((MAX_MAC_CREDIT_E2 - GET_NUM_VFS_PER_PATH(bp) * VF_MAC_CREDIT_CNT) / \
func_num + GET_NUM_VFS_PER_PF(bp) * VF_MAC_CREDIT_CNT)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
index bfae300cf25ff881292dc36ad56e51e37132..79e4e1890e94e244a72d50d76d41963a4b23 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_vfpf.c
@@ -1985,7 +1985,7 @@ static void bnx2x_vf_mbx_update_rss(struct bnx2x *bp, struct bnx2x_virtf *vf,
/* set vfop params according to rss tlv */
memcpy(rss.ind_table, rss_tlv->ind_table,
T_ETH_INDIRECTION_TABLE_SIZE);
- memcpy(rss.rss_key, rss_tlv->rss_key, sizeof(rss_tlv->rss_key));
+ memcpy(&vf->rss_conf_obj.rss_key, rss_tlv->rss_key, sizeof(rss_tlv->rss_key));
rss.rss_obj = &vf->rss_conf_obj;
rss.rss_result_mask = rss_tlv->rss_result_mask;
^ permalink raw reply related
* Re: HalfSipHash Acceptable Usage
From: George Spelvin @ 2016-12-21 18:07 UTC (permalink / raw)
To: linux, torvalds
Cc: ak, davem, David.Laight, djb, ebiggers3, eric.dumazet, hannes,
Jason, jeanphilippe.aumasson, kernel-hardening, linux-crypto,
linux-kernel, luto, netdev, tom, tytso, vegard.nossum
In-Reply-To: <CA+55aFy8fNOxw3bnwkX1S46jKnW6i26mueaiuOsScyN3kFJp+A@mail.gmail.com>
Linus wrote:
>> How much does kernel_fpu_begin()/kernel_fpu_end() cost?
>
> It's now better than it used to be, but it's absolutely disastrous
> still. We're talking easily many hundreds of cycles. Under some loads,
> thousands.
I think I've been thoroughly dissuaded, but just to clarify one thing
that resembles a misunderstanding:
> In contrast, in reality, especially with things like "do it once or
> twice per incoming packet", you'll easily hit the absolute worst
> cases, where not only does it take a few hundred cycles to save the FP
> state, you'll then return to user space in between packets, which
> triggers the slow-path return code and reloads the FP state, which is
> another few hundred cycles plus.
Everything being discussed is per-TCP-connection overhead, *not* per
packet. (Twice for outgoing connections, because one is to generate
the ephemeral port number.)
I know you know this, but I don't want anyone spectating to be confused
about it.
^ permalink raw reply
* Re: [PATCH] stmmac: CSR clock configuration fix
From: David Miller @ 2016-12-21 18:21 UTC (permalink / raw)
To: Joao.Pinto
Cc: peppe.cavallaro, hock.leong.kweh, niklas.cassel, pavel,
linux-kernel, netdev
In-Reply-To: <6d4c6d15a60c93a8aef5e3e03b9cd64cdcf232c8.1482232420.git.jpinto@synopsys.com>
From: Joao Pinto <Joao.Pinto@synopsys.com>
Date: Tue, 20 Dec 2016 11:21:47 +0000
> When testing stmmac with my QoS reference design I checked a problem in the
> CSR clock configuration that was impossibilitating the phy discovery, since
> every read operation returned 0x0000ffff. This patch fixes the issue.
>
> Signed-off-by: Joao Pinto <jpinto@synopsys.com>
This isn't enough.
It looks like various parts of this driver set the mask field
differently.
dwmac1000_core.c and dwmac100_core.c set the mask to be the low bits.
But dwmac4_core.c uses GENMASK(11, 8) which means the mask is a value
which is shifted up already.
So your patch will break chips driven by dwmac4_core.c.
In order for your change to be correct you must consolidate all of
these various pieces to use the same convention.
^ permalink raw reply
* Re: [PATCH net] be2net: Increase skb headroom size to 256 bytes
From: David Miller @ 2016-12-21 18:23 UTC (permalink / raw)
To: suresh.reddy; +Cc: netdev, kalesh-anakkur.purayil
In-Reply-To: <20161220151430.11115-1-suresh.reddy@broadcom.com>
From: Suresh Reddy <suresh.reddy@broadcom.com>
Date: Tue, 20 Dec 2016 10:14:30 -0500
> From: Kalesh A P <kalesh-anakkur.purayil@broadcom.com>
>
> The driver currently allocates 128 bytes of skb headroom.
> This was found to be insufficient with some configurations
> like Geneve tunnels, which resulted in skb head reallocations.
>
> Increase the headroom to 256 bytes to fix this.
>
> Signed-off-by: Kalesh A P <kalesh-anakkur.purayil@broadcom.com>
> Signed-off-by: Suresh Reddy <suresh.reddy@broadcom.com>
Applied.
^ permalink raw reply
* Re: [PATCH net 3/3] cpsw/netcp: work around reverse cpts dependency
From: Grygorii Strashko @ 2016-12-21 18:23 UTC (permalink / raw)
To: Arnd Bergmann, David S. Miller
Cc: Thomas Gleixner, Nicolas Pitre, Uwe Kleine-König,
Kwok, WingMan, netdev@vger.kernel.org,
linux-kernel@vger.kernel.org
In-Reply-To: <20161216092017.2560717-3-arnd@arndb.de>
On 12/16/2016 03:19 AM, Arnd Bergmann wrote:
> The dependency is reversed: cpsw and netcp call into cpts,
> but cpts depends on the other two in Kconfig. This can lead
> to cpts being a loadable module and its callers built-in:
>
> drivers/net/ethernet/ti/cpsw.o: In function `cpsw_remove':
> cpsw.c:(.text.cpsw_remove+0xd0): undefined reference to `cpts_release'
> drivers/net/ethernet/ti/cpsw.o: In function `cpsw_rx_handler':
> cpsw.c:(.text.cpsw_rx_handler+0x2dc): undefined reference to `cpts_rx_timestamp'
> drivers/net/ethernet/ti/cpsw.o: In function `cpsw_tx_handler':
> cpsw.c:(.text.cpsw_tx_handler+0x7c): undefined reference to `cpts_tx_timestamp'
> drivers/net/ethernet/ti/cpsw.o: In function `cpsw_ndo_stop':
>
> As a workaround, I'm introducing another Kconfig symbol to
> control the compilation of cpts, while making the actual
> module controlled by a silent symbol that is =y when necessary.
>
> Fixes: 6246168b4a38 ("net: ethernet: ti: netcp: add support of cpts")
> Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Thanks for the fix.
I've tried to test as much combination as possible,
but seems not of them:(
Reviewed-by: Grygorii Strashko <grygorii.strashko@ti.com>
> ---
> drivers/net/ethernet/ti/Kconfig | 10 ++++++++--
> 1 file changed, 8 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig
> index 366e29ff8605..c114efcd1575 100644
> --- a/drivers/net/ethernet/ti/Kconfig
> +++ b/drivers/net/ethernet/ti/Kconfig
> @@ -73,8 +73,8 @@ config TI_CPSW
> To compile this driver as a module, choose M here: the module
> will be called cpsw.
>
> -config TI_CPTS
> - tristate "TI Common Platform Time Sync (CPTS) Support"
> +config TI_CPTS_ENABLE
> + bool "TI Common Platform Time Sync (CPTS) Support"
> depends on TI_CPSW || TI_KEYSTONE_NETCP
> depends on POSIX_TIMERS
> select PTP_1588_CLOCK
> @@ -84,6 +84,12 @@ config TI_CPTS
> The unit can time stamp PTP UDP/IPv4 and Layer 2 packets, and the
> driver offers a PTP Hardware Clock.
>
> +config TI_CPTS
> + tristate
> + depends on TI_CPTS_ENABLE
> + default y if TI_CPSW=y || TI_KEYSTONE_NETCP=y
> + default m
> +
> config TI_KEYSTONE_NETCP
> tristate "TI Keystone NETCP Core Support"
> select TI_CPSW_ALE
>
--
regards,
-grygorii
^ permalink raw reply
* Re: [PATCH] net: ethernet: stmmac: dwmac-rk: make clk enablement first in powerup
From: David Miller @ 2016-12-21 18:25 UTC (permalink / raw)
To: heiko
Cc: netdev, peppe.cavallaro, alexandre.torgue, wxt, briannorris,
dianders, linux-rockchip
In-Reply-To: <20161220161706.26505-1-heiko@sntech.de>
From: Heiko Stuebner <heiko@sntech.de>
Date: Tue, 20 Dec 2016 17:17:06 +0100
> Right now the dwmac-rk tries to set up the GRF-specific speed and link
> options before enabling clocks, phys etc and on previous socs this works
> because the GRF is supplied on the whole by one clock.
>
> On the rk3399 however the GRF (General Register Files) clock-supply
> has been split into multiple clocks and while there is no specific
> grf-gmac clock like for other sub-blocks, it seems the mac-specific
> portions are actually supplied by the general mac clock.
>
> This results in hangs on rk3399 boards if the driver is build as module.
> When built in te problem of course doesn't surface, as the clocks
> are of course still on at the stage before clock_disable_unused.
>
> To solve this, simply move the clock enablement to the first position
> in the powerup callback. This is also a good idea in general to
> enable clocks before everything else.
>
> Tested on rk3288, rk3368 and rk3399 the dwmac still works on all of them.
>
> Signed-off-by: Heiko Stuebner <heiko@sntech.de>
Applied, thank you.
^ permalink raw reply
* Re: [PATCH v3] stmmac: enable rx queues
From: David Miller @ 2016-12-21 18:26 UTC (permalink / raw)
To: Joao.Pinto
Cc: peppe.cavallaro, seraphin.bonnaffe, hock.leong.kweh,
niklas.cassel, pavel, linux-kernel, netdev
In-Reply-To: <406c3af0c74809090a49d14af6a27cca2f5e9d6b.1482253639.git.jpinto@synopsys.com>
From: Joao Pinto <Joao.Pinto@synopsys.com>
Date: Tue, 20 Dec 2016 17:09:28 +0000
> When the hardware is synthesized with multiple queues, all queues are
> disabled for default. This patch adds the rx queues configuration.
> This patch was successfully tested in a Synopsys QoS Reference design.
>
> Signed-off-by: Joao Pinto <jpinto@synopsys.com>
> ---
> changes v2 -> v3 (Seraphin Bonnaffe):
> - GMAC_RX_QUEUE_CLEAR macro simplified
> changes v1 -> v2 (Niklas Cassel and Seraphin Bonnaffe):
> - Instead of using number of DMA channels, lets use number of queues
> - Create 2 flavors of RX queue enable Macros: AV and DCB (AV by default)
> - Make sure that the RX queue related bits are cleared before setting
> - Check if rx_queue_enable is available before executing
This change seems more appropriate for net-next, please resubmit when
that tree opens up again.
Thanks.
^ permalink raw reply
* Re: [PATCH 1/5 net-next] inet: replace ->bind_conflict with ->rcv_saddr_equal
From: David Miller @ 2016-12-21 18:28 UTC (permalink / raw)
To: jbacik; +Cc: hannes, kraigatgoog, eric.dumazet, tom, netdev, kernel-team
In-Reply-To: <1482264424-15439-2-git-send-email-jbacik@fb.com>
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 20 Dec 2016 15:07:00 -0500
> The only difference between inet6_csk_bind_conflict and inet_csk_bind_conflict
> is how they check the rcv_saddr. Since we want to be able to check the saddr in
> other places just drop the protocol specific ->bind_conflict and replace it with
> ->rcv_saddr_equal, then make inet_csk_bind_conflict the one true bind conflict
> function.
>
> Signed-off-by: Josef Bacik <jbacik@fb.com>
This may be a nice cleanup and all, but realize that if we do actually
have to traverse a lot of sockets this code has become significantly
slower.
We now have to execute a hard to predict indirect call for every
socket we process on the list.
This is almost certainly why we had two seperate functions expanded
rather than having an AF-specific helper execute in the inner loop
of a generic function.
^ permalink raw reply
* Re: [PATCH 2/5 net-next] inet: kill smallest_size and smallest_port
From: David Miller @ 2016-12-21 18:30 UTC (permalink / raw)
To: jbacik; +Cc: hannes, kraigatgoog, eric.dumazet, tom, netdev, kernel-team
In-Reply-To: <1482264424-15439-3-git-send-email-jbacik@fb.com>
From: Josef Bacik <jbacik@fb.com>
Date: Tue, 20 Dec 2016 15:07:01 -0500
> In inet_csk_get_port we seem to be using smallest_port to figure out where the
> best place to look for a SO_REUSEPORT sk that matches with an existing set of
> SO_REUSEPORT's. However if we get to the logic
>
> if (smallest_size != -1) {
> port = smallest_port;
> goto have_port;
> }
>
> we will do a useless search, because we would have already done the
> inet_csk_bind_conflict for that port and it would have returned 1, otherwise we
> would have gone to found_tb and succeeded. Since this logic makes us do yet
> another trip through inet_csk_bind_conflict for a port we know won't work just
> delete this code and save us the time.
>
> Signed-off-by: Josef Bacik <jbacik@fb.com>
So the "all else being equal, use 'tb' with smallest socket count" logic
wasn't being used at all?
Instead of removing it why don't we make it work properly again? Something
obviously broke it somewhere along the line, because I am pretty sure this
heuristic worked at some point in the past.
^ permalink raw reply
* Re: [PATCH 1/2] net: mdio: add mdio45_ethtool_ksettings_get
From: David Miller @ 2016-12-21 18:31 UTC (permalink / raw)
To: tremyfr; +Cc: linux-net-drivers, ecree, bkenward, andrew, netdev, linux-kernel
In-Reply-To: <1482272667-1206-1-git-send-email-tremyfr@gmail.com>
Please resubmit this patch series when net-next opens back up, thanks.
^ permalink raw reply
* Re: [PATCH v5] net: dummy: Introduce dummy virtual functions
From: David Miller @ 2016-12-21 18:32 UTC (permalink / raw)
To: phil; +Cc: netdev, sd
In-Reply-To: <20161220222621.32493-1-phil@nwl.cc>
From: Phil Sutter <phil@nwl.cc>
Date: Tue, 20 Dec 2016 23:26:21 +0100
> The idea for this was born when testing VF support in iproute2 which was
> impeded by hardware requirements. In fact, not every VF-capable hardware
> driver implements all netdev ops, so testing the interface is still hard
> to do even with a well-sorted hardware shelf.
>
> To overcome this and allow for testing the user-kernel interface, this
> patch allows to turn dummy into a PF with a configurable amount of VFs.
>
> Due to the assumption that all PFs are PCI devices, this implementation
> is not completely straightforward: In order to allow for
> rtnl_fill_ifinfo() to see the dummy VFs, a fake PCI parent device is
> attached to the dummy netdev. This has to happen at the right spot so
> register_netdevice() does not get confused. This patch abuses
> ndo_fix_features callback for that. In ndo_uninit callback, the fake
> parent is removed again for the same purpose.
>
> Joint work with Sabrina Dubroca.
>
> Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
> Signed-off-by: Phil Sutter <phil@nwl.cc>
Please resubmit when net-next opens back up, thanks.
^ permalink raw reply
* Re: [PATCH] net: qcom/emac: add ethtool support
From: David Miller @ 2016-12-21 18:32 UTC (permalink / raw)
To: timur; +Cc: f.fainelli, netdev, cov, alokc
In-Reply-To: <1482273129-11812-1-git-send-email-timur@codeaurora.org>
From: Timur Tabi <timur@codeaurora.org>
Date: Tue, 20 Dec 2016 16:32:09 -0600
> Add support for some ethtool methods: get/set link settings, get/set
> message level, get statistics, get link status, and restart
> autonegotiation.
>
> Signed-off-by: Timur Tabi <timur@codeaurora.org>
Please resubmit when net-next opens back up, thanks.
^ permalink raw reply
* Re: HalfSipHash Acceptable Usage
From: George Spelvin @ 2016-12-21 18:37 UTC (permalink / raw)
To: eric.dumazet, Jason
Cc: ak, davem, David.Laight, djb, ebiggers3, hannes,
jeanphilippe.aumasson, kernel-hardening, linux-crypto,
linux-kernel, linux, luto, netdev, tom, torvalds, tytso,
vegard.nossum
In-Reply-To: <1482335804.8944.44.camel@edumazet-glaptop3.roam.corp.google.com>
Eric Dumazet wrote:
> Now I am quite confused.
>
> George said :
>> Cycles per byte on 1024 bytes of data:
>> Pentium Core 2 Ivy
>> 4 Duo Bridge
>> SipHash-2-4 38.9 8.3 5.8
>> HalfSipHash-2-4 12.7 4.5 3.2
>> MD5 8.3 5.7 4.7
>
> That really was for 1024 bytes blocks, so pretty much useless for our
> discussion ?
No, they're actually quite relevant, but you have to interpret them
correctly. I thought I explained in the text following that table,
but let me make it clearer:
To find the time to compute the SipHash of N bytes, round (N+17) up to
the next multiple of 8 bytes and multiply by the numbers above.
To find the time to compute the HalfSipHash of N bytes, round (N+9) up to
the next multiple of 4 bytes and multiply by the numbers above.
To find the time to compute the MD5 of N bytes, round (N+9) up to the
next multiple of 64 bytes and multiply by the numbers above.
It's the different rounding rules that make all the difference. For small
input blocks, SipHash can be slower per byte yet still faster because
it hashes fewer bytes.
> Reading your numbers last week, I thought SipHash was faster, but George
> numbers are giving the opposite impression.
SipHash annihilates the competition on 64-bit superscalar hardware.
SipHash dominates the field on 64-bit in-order hardware.
SipHash wins easily on 32-bit hardware *with enough registers*.
On register-starved 32-bit machines, it really struggles.
As I explained, in that last case, SipHash barely wins at all.
(On a P4, it actually *loses* to MD5, not that anyone cares. Running
on a P4 and caring about performance are mutually exclusive.)
^ permalink raw reply
* Re: [PATCHv2] net: mvpp2: fix dma unmapping of TX buffers for fragments
From: David Miller @ 2016-12-21 18:38 UTC (permalink / raw)
To: thomas.petazzoni
Cc: netdev, jason, andrew, sebastian.hesselbarth, gregory.clement,
nadavh, hannah, yehuday, mw, raphael.glon, stable
In-Reply-To: <1482316129-22332-1-git-send-email-thomas.petazzoni@free-electrons.com>
From: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Date: Wed, 21 Dec 2016 11:28:49 +0100
> Since commit 71ce391dfb784 ("net: mvpp2: enable proper per-CPU TX
> buffers unmapping"), we are not correctly DMA unmapping TX buffers for
> fragments.
>
> Indeed, the mvpp2_txq_inc_put() function only stores in the
> txq_cpu->tx_buffs[] array the physical address of the buffer to be
> DMA-unmapped when skb != NULL. In addition, when DMA-unmapping, we use
> skb_headlen(skb) to get the size to be unmapped. Both of this works fine
> for TX descriptors that are associated directly to a SKB, but not the
> ones that are used for fragments, with a NULL pointer as skb:
>
> - We have a NULL physical address when calling DMA unmap
> - skb_headlen(skb) crashes because skb is NULL
>
> This causes random crashes when fragments are used.
>
> To solve this problem, we need to:
>
> - Store the physical address of the buffer to be unmapped
> unconditionally, regardless of whether it is tied to a SKB or not.
>
> - Store the length of the buffer to be unmapped, which requires a new
> field.
>
> Instead of adding a third array to store the length of the buffer to be
> unmapped, and as suggested by David Miller, this commit refactors the
> tx_buffs[] and tx_skb[] arrays of 'struct mvpp2_txq_pcpu' into a
> separate structure 'mvpp2_txq_pcpu_buf', to which a 'size' field is
> added. Therefore, instead of having three arrays to allocate/free, we
> have a single one, which also improve data locality, reducing the
> impact on the CPU cache.
>
> Fixes: 71ce391dfb784 ("net: mvpp2: enable proper per-CPU TX buffers unmapping")
> Reported-by: Raphael G <raphael.glon@corp.ovh.com>
> Cc: Raphael G <raphael.glon@corp.ovh.com>
> Cc: stable@vger.kernel.org
> Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
> ---
> Changes since v1:
> - As requested by David Miller, move the per TX buffer fields into a
> separate structure, 'struct mvpp2_txq_pcpu_buf', so that instead of
> allocating three arrays, we allocate a single array.
Indeed, this looks a lot better.
Applied, thanks.
^ permalink raw reply
* Re: HalfSipHash Acceptable Usage
From: Jason A. Donenfeld @ 2016-12-21 18:40 UTC (permalink / raw)
To: George Spelvin
Cc: Eric Dumazet, Andi Kleen, David Miller, David Laight,
Daniel J . Bernstein, Eric Biggers, Hannes Frederic Sowa,
Jean-Philippe Aumasson, kernel-hardening,
Linux Crypto Mailing List, LKML, Andy Lutomirski, Netdev,
Tom Herbert, Linus Torvalds, Theodore Ts'o, Vegard Nossum
In-Reply-To: <20161221183751.1123.qmail@ns.sciencehorizons.net>
On Wed, Dec 21, 2016 at 7:37 PM, George Spelvin
<linux@sciencehorizons.net> wrote:
> SipHash annihilates the competition on 64-bit superscalar hardware.
> SipHash dominates the field on 64-bit in-order hardware.
> SipHash wins easily on 32-bit hardware *with enough registers*.
> On register-starved 32-bit machines, it really struggles.
>
> As I explained, in that last case, SipHash barely wins at all.
> (On a P4, it actually *loses* to MD5, not that anyone cares. Running
> on a P4 and caring about performance are mutually exclusive.)
>From the discussion off list which examined your benchmark code, it
looks like we're going to move ahead with SipHash.
^ permalink raw reply
* Re: [PATCH net] net, sched: fix soft lockup in tc_classify
From: Cong Wang @ 2016-12-21 18:51 UTC (permalink / raw)
To: Daniel Borkmann
Cc: David Miller, Shahar Klein, Or Gerlitz, Roi Dayan, Jiri Pirko,
John Fastabend, Linux Kernel Network Developers
In-Reply-To: <1954223e8b7aa90ff835357e8a3ef7537be33e43.1482337600.git.daniel@iogearbox.net>
On Wed, Dec 21, 2016 at 9:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> What happens is that in tc_ctl_tfilter(), thread A allocates a new
> tp, initializes it, sets tp_created to 1, and calls into tp->ops->change()
> with it. In that classifier callback we had to unlock/lock the rtnl
> mutex and returned with -EAGAIN. One reason why we need to drop there
> is, for example, that we need to request an action module to be loaded.
Excellent catch!
But why do we have to replay the request here? Shouldn't we just return
EAGAIN to user-space and let user-space decide to retry or not?
Replaying is the root of the evil here.
I mean:
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3fbba79..a21f7d66 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -154,7 +154,6 @@ static int tc_ctl_tfilter(struct sk_buff *skb,
struct nlmsghdr *n)
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
-replay:
err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
if (err < 0)
return err;
@@ -278,8 +277,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb,
struct nlmsghdr *n)
tp_ops = tcf_proto_lookup_ops(kind);
/* We dropped the RTNL semaphore in order to
* perform the module load. So, even if we
- * succeeded in loading the module we have to
- * replay the request. We indicate this using
+ * succeeded in loading the module we return
* -EAGAIN.
*/
if (tp_ops != NULL) {
@@ -378,9 +376,6 @@ static int tc_ctl_tfilter(struct sk_buff *skb,
struct nlmsghdr *n)
errout:
if (cl)
cops->put(q, cl);
- if (err == -EAGAIN)
- /* Replay the request. */
- goto replay;
return err;
}
^ permalink raw reply related
* Re: ipv6: handle -EFAULT from skb_copy_bits
From: David Miller @ 2016-12-21 19:04 UTC (permalink / raw)
To: hannes; +Cc: xiyou.wangcong, davej, netdev
In-Reply-To: <1482324073.2260.4.camel@stressinduktion.org>
From: Hannes Frederic Sowa <hannes@stressinduktion.org>
Date: Wed, 21 Dec 2016 13:41:13 +0100
> On Wed, 2016-12-21 at 13:27 +0100, Hannes Frederic Sowa wrote:
>> @@ -555,8 +566,8 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
>> goto out;
>>
>> offset = rp->offset;
>> - total_len = inet_sk(sk)->cork.base.length;
>> - if (offset >= total_len - 1) {
>> + transport_len = raw6_corked_transport_len(sk);
>> + if (offset >= transport_len - 1) {
>> err = -EINVAL;
>> ip6_flush_pending_frames(sk);
>> goto out;
>> @@ -598,7 +609,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
>> tmp_csum = csum_sub(tmp_csum, csum_unfold(csum));
>>
>> csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr,
>> - total_len, fl6->flowi6_proto, tmp_csum);
>> + transport_len, fl6->flowi6_proto, tmp_csum);
>>
>>
>
> Ops, here we need actually the total_len plus the opt->opt_nflen to
> always calculate the correct checksum.
It's a real shame we can't just use skb_transport_offset(). This value
has essentially been calculated for us already.
Also, if we iterate over multiple SKBs in the write queue, don't you have
to redo this calculation for every SKB we iterate over?
Furthermore, what if the user queued up some SKBs in the raw socket
with MSG_MORE, and then changed some of the options for a subsequent
sendmsg() call?
Given all of this, I think the best thing to do is validate the offset
after the queue walks, which is pretty much what Dave Jones's original
patch was doing.
Sigh... well, at least we now understand what's going on here.
^ permalink raw reply
* Re: [PATCH net] net, sched: fix soft lockup in tc_classify
From: Cong Wang @ 2016-12-21 19:10 UTC (permalink / raw)
To: Daniel Borkmann
Cc: David Miller, Shahar Klein, Or Gerlitz, Roi Dayan, Jiri Pirko,
John Fastabend, Linux Kernel Network Developers
In-Reply-To: <CAM_iQpXadrGWJYqizEkq+xmMKVV9krrcCWKur=ZCRZBS9Mx_8w@mail.gmail.com>
On Wed, Dec 21, 2016 at 10:51 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
> On Wed, Dec 21, 2016 at 9:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> What happens is that in tc_ctl_tfilter(), thread A allocates a new
>> tp, initializes it, sets tp_created to 1, and calls into tp->ops->change()
>> with it. In that classifier callback we had to unlock/lock the rtnl
>> mutex and returned with -EAGAIN. One reason why we need to drop there
>> is, for example, that we need to request an action module to be loaded.
>
> Excellent catch!
>
> But why do we have to replay the request here? Shouldn't we just return
> EAGAIN to user-space and let user-space decide to retry or not?
> Replaying is the root of the evil here.
Answer myself: probably due to historical reasons, but still replaying
inside such a big function is just error-prone, we should promote it
out:
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 3fbba79..7d5b42b 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -129,7 +129,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
/* Add/change/delete/get a filter node */
-static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
+static int __tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
{
struct net *net = sock_net(skb->sk);
struct nlattr *tca[TCA_MAX + 1];
@@ -154,7 +154,6 @@ static int tc_ctl_tfilter(struct sk_buff *skb,
struct nlmsghdr *n)
!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
return -EPERM;
-replay:
err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL);
if (err < 0)
return err;
@@ -378,12 +377,19 @@ static int tc_ctl_tfilter(struct sk_buff *skb,
struct nlmsghdr *n)
errout:
if (cl)
cops->put(q, cl);
- if (err == -EAGAIN)
- /* Replay the request. */
- goto replay;
return err;
}
+static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n)
+{
+ int ret;
+replay:
+ ret = __tc_ctl_tfilter(skb, n);
+ if (ret == -EAGAIN)
+ goto replay;
+ return ret;
+}
+
static int tcf_fill_node(struct net *net, struct sk_buff *skb,
struct tcf_proto *tp, unsigned long fh, u32 portid,
u32 seq, u16 flags, int event)
^ permalink raw reply related
* Re: [PATCH net] net, sched: fix soft lockup in tc_classify
From: Daniel Borkmann @ 2016-12-21 19:16 UTC (permalink / raw)
To: Cong Wang
Cc: David Miller, Shahar Klein, Or Gerlitz, Roi Dayan, Jiri Pirko,
John Fastabend, Linux Kernel Network Developers
In-Reply-To: <CAM_iQpXadrGWJYqizEkq+xmMKVV9krrcCWKur=ZCRZBS9Mx_8w@mail.gmail.com>
On 12/21/2016 07:51 PM, Cong Wang wrote:
> On Wed, Dec 21, 2016 at 9:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>> What happens is that in tc_ctl_tfilter(), thread A allocates a new
>> tp, initializes it, sets tp_created to 1, and calls into tp->ops->change()
>> with it. In that classifier callback we had to unlock/lock the rtnl
>> mutex and returned with -EAGAIN. One reason why we need to drop there
>> is, for example, that we need to request an action module to be loaded.
>
> Excellent catch!
>
> But why do we have to replay the request here? Shouldn't we just return
> EAGAIN to user-space and let user-space decide to retry or not?
> Replaying is the root of the evil here.
Right, this behavior is already pretty old (2005), see history
tree 8d7c694553dc ("[PKT_SCHED]: act_api.c: drop rtnl for loading
modules") and 437293de63d8 ("[PKT_SCHED]: cls_api.c: drop rtnl
for loading modules"), some binaries could rely on that behavior
in one way or another I'd presume.
^ permalink raw reply
* Re: [PATCH 2/5 net-next] inet: kill smallest_size and smallest_port
From: Eric Dumazet @ 2016-12-21 19:29 UTC (permalink / raw)
To: David Miller; +Cc: jbacik, hannes, kraigatgoog, tom, netdev, kernel-team
In-Reply-To: <20161221.133003.1401543777326711002.davem@davemloft.net>
On Wed, 2016-12-21 at 13:30 -0500, David Miller wrote:
> From: Josef Bacik <jbacik@fb.com>
> Date: Tue, 20 Dec 2016 15:07:01 -0500
>
> > In inet_csk_get_port we seem to be using smallest_port to figure out where the
> > best place to look for a SO_REUSEPORT sk that matches with an existing set of
> > SO_REUSEPORT's. However if we get to the logic
> >
> > if (smallest_size != -1) {
> > port = smallest_port;
> > goto have_port;
> > }
> >
> > we will do a useless search, because we would have already done the
> > inet_csk_bind_conflict for that port and it would have returned 1, otherwise we
> > would have gone to found_tb and succeeded. Since this logic makes us do yet
> > another trip through inet_csk_bind_conflict for a port we know won't work just
> > delete this code and save us the time.
> >
> > Signed-off-by: Josef Bacik <jbacik@fb.com>
>
> So the "all else being equal, use 'tb' with smallest socket count" logic
> wasn't being used at all?
>
> Instead of removing it why don't we make it work properly again? Something
> obviously broke it somewhere along the line, because I am pretty sure this
> heuristic worked at some point in the past.
Yeah, but heuristic to choose the smallest list was hurting a lot, since
it tends to consume all ports.
At connect() time we prefer to have another heuristic, actually leaving
some slots absolutely empty, so that a bind() will succeed later.
Remember I had more than 30 ms latency as described in the 2nd commit
changelog :
References :
commit ea8add2b190395408b22a9127bed2c0912aecbc8
Author: Eric Dumazet <edumazet@google.com>
Date: Thu Feb 11 16:28:50 2016 -0800
tcp/dccp: better use of ephemeral ports in bind()
Implement strategy used in __inet_hash_connect() in opposite way :
Try to find a candidate using odd ports, then fallback to even ports.
We no longer disable BH for whole traversal, but one bucket at a time.
We also use cond_resched() to yield cpu to other tasks if needed.
I removed one indentation level and tried to mirror the loop we have
in __inet_hash_connect() and variable names to ease code maintenance.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
commit 1580ab63fc9a03593072cc5656167a75c4f1d173
Author: Eric Dumazet <edumazet@google.com>
Date: Thu Feb 11 16:28:49 2016 -0800
tcp/dccp: better use of ephemeral ports in connect()
In commit 07f4c90062f8 ("tcp/dccp: try to not exhaust ip_local_port_range
in connect()"), I added a very simple heuristic, so that we got better
chances to use even ports, and allow bind() users to have more available
slots.
It gave nice results, but with more than 200,000 TCP sessions on a typical
server, the ~30,000 ephemeral ports are still a rare resource.
I chose to go a step further, by looking at all even ports, and if none
was available, fallback to odd ports.
The companion patch does the same in bind(), but in opposite way.
I've seen exec times of up to 30ms on busy servers, so I no longer
disable BH for the whole traversal, but only for each hash bucket.
I also call cond_resched() to be gentle to other tasks.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
^ permalink raw reply
* Re: [PATCH 2/5 net-next] inet: kill smallest_size and smallest_port
From: Josef Bacik @ 2016-12-21 19:32 UTC (permalink / raw)
To: David Miller; +Cc: hannes, kraigatgoog, eric.dumazet, tom, netdev, kernel-team
In-Reply-To: <20161221.133003.1401543777326711002.davem@davemloft.net>
On Wed, Dec 21, 2016 at 1:30 PM, David Miller <davem@davemloft.net>
wrote:
> From: Josef Bacik <jbacik@fb.com>
> Date: Tue, 20 Dec 2016 15:07:01 -0500
>
>> In inet_csk_get_port we seem to be using smallest_port to figure
>> out where the
>> best place to look for a SO_REUSEPORT sk that matches with an
>> existing set of
>> SO_REUSEPORT's. However if we get to the logic
>>
>> if (smallest_size != -1) {
>> port = smallest_port;
>> goto have_port;
>> }
>>
>> we will do a useless search, because we would have already done the
>> inet_csk_bind_conflict for that port and it would have returned 1,
>> otherwise we
>> would have gone to found_tb and succeeded. Since this logic makes
>> us do yet
>> another trip through inet_csk_bind_conflict for a port we know
>> won't work just
>> delete this code and save us the time.
>>
>> Signed-off-by: Josef Bacik <jbacik@fb.com>
>
> So the "all else being equal, use 'tb' with smallest socket count"
> logic
> wasn't being used at all?
>
> Instead of removing it why don't we make it work properly again?
> Something
> obviously broke it somewhere along the line, because I am pretty sure
> this
> heuristic worked at some point in the past.
Yeah as soon as we would find a tb with no bind conflicts we'd
immediately jump to found_tb: and subsequently exit, so if we did
manage to get to the point of checking smallest_size it would be
redundant as we would have had to hit a bind conflict for that port to
even reach that code.
How do you want me to add it back? The logic only kicked in if we were
SO_REUSEPORT with snum == 0, but Tom tells me that is basically
useless, so we've disallowed that behavior. Should we call a
bind_conflict() for every port and then go back and pick the one with
the smallest tb? That's a lot of scanning. Can you tell me what
behavior you desire and I'll add another patch to reintroduce it?
Thanks,
Josef
^ permalink raw reply
* Re: [PATCH net] net, sched: fix soft lockup in tc_classify
From: Daniel Borkmann @ 2016-12-21 20:02 UTC (permalink / raw)
To: Cong Wang
Cc: David Miller, Shahar Klein, Or Gerlitz, Roi Dayan, Jiri Pirko,
John Fastabend, Linux Kernel Network Developers
In-Reply-To: <CAM_iQpX3Sk_YGDmvY-yj0CxPbG5qfT7P7ioNgY5Lb_DqsTWh6Q@mail.gmail.com>
On 12/21/2016 08:10 PM, Cong Wang wrote:
> On Wed, Dec 21, 2016 at 10:51 AM, Cong Wang <xiyou.wangcong@gmail.com> wrote:
>> On Wed, Dec 21, 2016 at 9:04 AM, Daniel Borkmann <daniel@iogearbox.net> wrote:
>>> What happens is that in tc_ctl_tfilter(), thread A allocates a new
>>> tp, initializes it, sets tp_created to 1, and calls into tp->ops->change()
>>> with it. In that classifier callback we had to unlock/lock the rtnl
>>> mutex and returned with -EAGAIN. One reason why we need to drop there
>>> is, for example, that we need to request an action module to be loaded.
>>
>> Excellent catch!
>>
>> But why do we have to replay the request here? Shouldn't we just return
>> EAGAIN to user-space and let user-space decide to retry or not?
>> Replaying is the root of the evil here.
>
> Answer myself: probably due to historical reasons, but still replaying
> inside such a big function is just error-prone, we should promote it
> out:
Have no strong opinion, I guess it could be done as a simplification
for net-next, why not, along with moving out the netlink_ns_capable()
check or possibly other things after careful analysis that don't need
to be redone in that circumstance.
Thanks,
Daniel
^ permalink raw reply
* Re: [PATCH v5] net: dummy: Introduce dummy virtual functions
From: Phil Sutter @ 2016-12-21 20:06 UTC (permalink / raw)
To: David Miller; +Cc: netdev, sd
In-Reply-To: <20161221.133200.173665554896949124.davem@davemloft.net>
On Wed, Dec 21, 2016 at 01:32:00PM -0500, David Miller wrote:
> From: Phil Sutter <phil@nwl.cc>
> Date: Tue, 20 Dec 2016 23:26:21 +0100
>
> > The idea for this was born when testing VF support in iproute2 which was
> > impeded by hardware requirements. In fact, not every VF-capable hardware
> > driver implements all netdev ops, so testing the interface is still hard
> > to do even with a well-sorted hardware shelf.
> >
> > To overcome this and allow for testing the user-kernel interface, this
> > patch allows to turn dummy into a PF with a configurable amount of VFs.
> >
> > Due to the assumption that all PFs are PCI devices, this implementation
> > is not completely straightforward: In order to allow for
> > rtnl_fill_ifinfo() to see the dummy VFs, a fake PCI parent device is
> > attached to the dummy netdev. This has to happen at the right spot so
> > register_netdevice() does not get confused. This patch abuses
> > ndo_fix_features callback for that. In ndo_uninit callback, the fake
> > parent is removed again for the same purpose.
> >
> > Joint work with Sabrina Dubroca.
> >
> > Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
> > Signed-off-by: Phil Sutter <phil@nwl.cc>
>
> Please resubmit when net-next opens back up, thanks.
Sure, thanks for the heads-up!
Cheers, Phil
^ permalink raw reply
* Re: [PATCH RFC net-next 3/7] sctp: add dst_pending_confirm flag
From: Julian Anastasov @ 2016-12-21 20:19 UTC (permalink / raw)
To: Neil Horman; +Cc: netdev, linux-sctp, YueHaibing
In-Reply-To: <20161221141108.GB23197@hmsreliant.think-freely.org>
Hello,
On Wed, 21 Dec 2016, Neil Horman wrote:
> On Sun, Dec 18, 2016 at 10:56:32PM +0200, Julian Anastasov wrote:
> > - tp->af_specific->sctp_xmit(head, tp);
> > + confirm = tp->dst_pending_confirm;
> > + if (confirm)
> > + head->dst_pending_confirm = 1;
> Why not use your confirm helper function here?
It is for sk, I can add a skb_set_dst_pending_confirm(skb,int)
helper, if needed.
> > + if (!t->dst_pending_confirm)
> > + t->dst_pending_confirm = 1;
> I'm not sure why you check it for being zero before setting it here, just set it
> unilaterally instead, or are you trying to avoid dirtying a cacheline?
Yes, no other reason. I'll change it in the next version.
Regards
^ permalink raw reply
* BUG/panic in ctnetlink_conntrack_event in 4.8.11
From: Chris Boot @ 2016-12-21 20:20 UTC (permalink / raw)
To: Netfilter Development Mailing list
Cc: Linux Kernel Network Developers, coreteam, linux-kernel
Hi all,
I've encountered this BUG three times in the last few days, though I
must admit I've only captured the trace once so far so I can't be
completely certain it was exactly this the last few times. I did not
experience this with a 4.7 kernel; it only seemed to start with 4.8.
For some background: I use conntrackd (this is an "HA" firewall pair),
plenty of IPv6, IPsec with vti6 interfaces, conntrack, some NAT on IPv4
but definitely not with IPv6.
Without further ado, here is my crash:
[147965.209318] BUG: unable to handle kernel NULL pointer dereference at 0000000000000018
[147965.217347] IP: [<ffffffffb4bb8b19>] icmp6_send+0x229/0x9f0
[147965.223051] PGD 0
[147965.225184] Oops: 0000 [#1] SMP
[147965.228424] Modules linked in: sch_fq_codel sch_htb pppoe pppox ppp_generic slhc ip6_vti ip6_tunnel tunnel6 drbg ansi_cprng seqiv esp6 xfrm4_mode_tunnel xfrm6_mode_tunnel ghash_generic gcm twofish_generic twofish_x86_64_3way twofish_x86_64 twofish_common serpent_sse2_x86_64 serpent_generic blowfish_generic blowfish_x86_64 blowfish_common cast5_generic cast_common ctr des_generic cbc algif_skcipher camellia_generic camellia_x86_64 xts xcbc sha512_ssse3 sha512_generic md4 algif_hash af_alg xfrm_user xfrm4_tunnel tunnel4 ipcomp xfrm_ipcomp esp4 ah4 af_key xfrm_algo tun hmac xt_nat xt_policy xt_statistic xt_helper xt_CLASSIFY xt_recent ip6table_nat xt_dscp xt_length binfmt_misc ip6t_REJECT xt_hashlimit nf_reject_ipv6 ip6table_mangle xt_comment iptable_nat ipt_REJECT nf_reject_ipv4 xt_addrtype xt_set ip_set_hash_ip ip_set xt_connmark xt_mark iptable_mangle xt_tcpudp iptable_raw xt_CT ip6table_raw xt_multiport xt_conntrack nf_log_ipv4 nf_nat_tftp nf_nat_snmp_basic nf_conntrack_snmp xt_NFLOG nf_nat_sip xt_LOG nf_log_ipv6 nf_nat_pptp nf_log_common nf_nat_proto_gre nf_nat_irc nf_nat_h323 nf_nat_ftp nf_nat_amanda ts_kmp nf_conntrack_amanda nf_conntrack_sane nf_conntrack_tftp nf_conntrack_sip nf_conntrack_proto_udplite nf_conntrack_proto_sctp nf_conntrack_pptp nf_conntrack_proto_gre nf_conntrack_netlink nf_conntrack_netbios_ns nf_conntrack_broadcast nf_conntrack_irc nf_conntrack_h323 nf_conntrack_ftp ip6table_filter ip6_tables iptable_filter openvswitch nf_conntrack_ipv6 nf_nat_ipv6 nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_defrag_ipv6 nf_nat nf_conntrack 8021q garp mrp stp llc dummy nfnetlink_log nfnetlink evdev kvm_amd kvm irqbypass pcspkr k10temp sp5100_tco i2c_piix4 sg shpchp acpi_cpufreq tpm_tis tpm_tis_core tpm button drbd lru_cache libcrc32c ip_tables x_tables autofs4 ext4 crc16 jbd2 crc32c_generic fscrypto ecb glue_helper lrw gf128mul ablk_helper cryptd aes_x86_64 mbcache dm_mod sd_mod uas usb_storage ohci_pci ehci_pci ohci_hcd ehci_hcd usbcore ahci libahci libata scsi_mod usb_common r8169 mii
[147965.409769] CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.8.0-2-amd64 #1 Debian 4.8.11-1
[147965.417773] Hardware name: PC Engines APU, BIOS SageBios_PCEngines_APU-45 04/05/2014
[147965.425607] task: ffff96d2d940aec0 task.stack: ffff96d2d9410000
[147965.431622] RIP: 0010:[<ffffffffb4bb8b19>] [<ffffffffb4bb8b19>] icmp6_send+0x229/0x9f0
[147965.439742] RSP: 0018:ffff96d2ded03d30 EFLAGS: 00010246
[147965.445150] RAX: 0000000000000000 RBX: ffff96d2861e1700 RCX: 0000000000000020
[147965.452377] RDX: 0000000000000001 RSI: 0000000000000200 RDI: ffff96d1cb42799e
[147965.459597] RBP: ffff96d2ded03e60 R08: 0000000000000000 R09: ffff96d299832000
[147965.466823] R10: 0000000000000001 R11: 0000000000000000 R12: ffff96d1cb427996
[147965.474042] R13: ffffffffb50da680 R14: 0000000000000000 R15: 0000000000000003
[147965.481263] FS: 0000000000000000(0000) GS:ffff96d2ded00000(0000) knlGS:0000000000000000
[147965.489444] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[147965.495283] CR2: 0000000000000018 CR3: 0000000116d59000 CR4: 00000000000006e0
[147965.502501] Stack:
[147965.504607] ffff96d291d14880 0000000000000000 ffff96d289288400 0000000000000000
[147965.512189] 0000000000000000 0000000000000000 ffff96d1cb42799e 0000000000000000
[147965.519772] 0000000000000001 ffff96d200000001 ffff96d1cb4279ae ffff96d280fb2900
[147965.527356] Call Trace:
[147965.529895] <IRQ>
[147965.531932] [<ffffffffc07330df>] ? ctnetlink_conntrack_event+0x3ff/0x620 [nf_conntrack_netlink]
[147965.541005] [<ffffffffc068e94a>] ? nf_nat_cleanup_conntrack+0xea/0x1a0 [nf_nat]
[147965.548492] [<ffffffffb4b74653>] ? get_frag_bucket_locked+0x43/0x70
[147965.554939] [<ffffffffc0696330>] ? nf_ct_net_init+0x130/0x130 [nf_defrag_ipv6]
[147965.562338] [<ffffffffb4bbff78>] ? ip6_expire_frag_queue+0xf8/0x100
[147965.568787] [<ffffffffb46e6e80>] ? call_timer_fn+0x30/0x120
[147965.574539] [<ffffffffb46e7406>] ? run_timer_softirq+0x216/0x4b0
[147965.580728] [<ffffffffb46f7ac0>] ? tick_sched_handle.isra.12+0x20/0x50
[147965.587434] [<ffffffffb46f7b28>] ? tick_sched_timer+0x38/0x70
[147965.593363] [<ffffffffb4bf2598>] ? __do_softirq+0xf8/0x290
[147965.599030] [<ffffffffb4681abb>] ? irq_exit+0x9b/0xa0
[147965.604266] [<ffffffffb4bf23ae>] ? smp_apic_timer_interrupt+0x3e/0x50
[147965.610884] [<ffffffffb4bf16c2>] ? apic_timer_interrupt+0x82/0x90
[147965.617155] <EOI>
[147965.619184] [<ffffffffb4ab0f06>] ? cpuidle_enter_state+0x126/0x2d0
[147965.625740] [<ffffffffb4ab0ef3>] ? cpuidle_enter_state+0x113/0x2d0
[147965.632100] [<ffffffffb46bd742>] ? cpu_startup_entry+0x2a2/0x350
[147965.638291] [<ffffffffb464eddd>] ? start_secondary+0x14d/0x190
[147965.644299] Code: 8b 44 24 38 75 46 f6 c2 02 74 05 f6 c2 30 75 3c 48 8b 43 58 4c 89 44 24 20 44 89 5c 24 38 44 89 54 24 40 89 54 24 48 48 83 e0 fe <48> 8b 78 18 e8 8e 88 02 00 8b 54 24 48 41 89 c1 44 8b 54 24 40
[147965.664907] RIP [<ffffffffb4bb8b19>] icmp6_send+0x229/0x9f0
[147965.670686] RSP <ffff96d2ded03d30>
[147965.674267] CR2: 0000000000000018
[147965.677683] ---[ end trace d5725bb00a2f3d6b ]---
[147965.682396] Kernel panic - not syncing: Fatal exception in interrupt
[147965.688898] Kernel Offset: 0x33600000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)
[147965.699764] ---[ end Kernel panic - not syncing: Fatal exception in interrupt
[147965.707009] ------------[ cut here ]------------
[147965.711730] WARNING: CPU: 1 PID: 0 at /build/linux-lIgGMF/linux-4.8.11/arch/x86/kernel/smp.c:125 check_preempt_curr+0x50/0x90
[147965.723108] Modules linked in: sch_fq_codel sch_htb pppoe pppox ppp_generic slhc ip6_vti ip6_tunnel tunnel6 drbg ansi_cprng seqiv esp6 xfrm4_mode_tunnel xfrm6_mode_tunnel ghash_generic gcm twofish_generic twofish_x86_64_3way twofish_x86_64 twofish_common serpent_sse2_x86_64 serpent_generic blowfish_generic blowfish_x86_64 blowfish_common cast5_generic cast_common ctr des_generic cbc algif_skcipher camellia_generic camellia_x86_64 xts xcbc sha512_ssse3 sha512_generic md4 algif_hash af_alg xfrm_user xfrm4_tunnel tunnel4 ipcomp xfrm_ipcomp esp4 ah4 af_key xfrm_algo tun hmac xt_nat xt_policy xt_statistic xt_helper xt_CLASSIFY xt_recent ip6table_nat xt_dscp xt_length binfmt_misc ip6t_REJECT xt_hashlimit nf_reject_ipv6 ip6table_mangle xt_comment iptable_nat ipt_REJECT nf_reject_ipv4 xt_addrtype xt_set ip_set_hash_ip ip_set xt_connmark xt_mark iptable_mangle xt_tcpudp iptable_raw xt_CT ip6table_raw xt_multiport xt_conntrack nf_log_ipv4 nf_nat_tftp nf_nat_snmp_basic nf_conntrack_snmp xt_NFLOG nf_nat_sip xt_LOG nf_log_ipv6 nf_nat_pptp nf_log_common nf_nat_proto_gre nf_nat_irc nf_nat_h323 nf_nat_ftp nf_nat_amanda ts_kmp nf_conntrack_amanda nf_conntrack_sane nf_conntrack_tftp nf_conntrack_sip nf_conntrack_proto_udplite nf_conntrack_proto_sctp nf_conntrack_pptp nf_conntrack_proto_gre nf_conntrack_netlink nf_conntrack_netbios_ns nf_conntrack_broadcast nf_conntrack_irc nf_conntrack_h323 nf_conntrack_ftp ip6table_filter ip6_tables iptable_filter openvswitch nf_conntrack_ipv6 nf_nat_ipv6 nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_defrag_ipv6 nf_nat nf_conntrack 8021q garp mrp stp llc dummy nfnetlink_log nfnetlink evdev kvm_amd kvm irqbypass pcspkr k10temp sp5100_tco i2c_piix4 sg shpchp acpi_cpufreq tpm_tis tpm_tis_core tpm button drbd lru_cache libcrc32c ip_tables x_tables autofs4 ext4 crc16 jbd2 crc32c_generic fscrypto ecb glue_helper lrw gf128mul ablk_helper cryptd aes_x86_64 mbcache dm_mod sd_mod uas usb_storage ohci_pci ehci_pci ohci_hcd ehci_hcd usbcore ahci libahci libata scsi_mod usb_common r8169 mii
[147965.904454] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G D 4.8.0-2-amd64 #1 Debian 4.8.11-1
[147965.913671] Hardware name: PC Engines APU, BIOS SageBios_PCEngines_APU-45 04/05/2014
[147965.921496] 0000000000000086 2c0493b5e5995666 ffffffffb49269f5 0000000000000000
[147965.929080] 0000000000000000 ffffffffb467c16e ffff96d2dec18180 ffff96d2c17fee40
[147965.936662] ffff96d2dec18180 0000000000000004 0000000000000046 ffff96d2dec18180
[147965.944244] Call Trace:
[147965.946786] <IRQ> [<ffffffffb49269f5>] ? dump_stack+0x5c/0x77
[147965.952830] [<ffffffffb467c16e>] ? __warn+0xbe/0xe0
[147965.957893] [<ffffffffb46a4720>] ? check_preempt_curr+0x50/0x90
[147965.963993] [<ffffffffb46a4774>] ? ttwu_do_wakeup+0x14/0xe0
[147965.969745] [<ffffffffb46a5441>] ? try_to_wake_up+0x191/0x3a0
[147965.975675] [<ffffffffb46bce93>] ? autoremove_wake_function+0x13/0x40
[147965.982293] [<ffffffffb46bc76e>] ? __wake_up_common+0x4e/0x90
[147965.988221] [<ffffffffb46bc7e4>] ? __wake_up+0x34/0x50
[147965.993545] [<ffffffffb475b943>] ? irq_work_run_list+0x43/0x70
[147965.999557] [<ffffffffb46318da>] ? smp_irq_work_interrupt+0x2a/0x30
[147966.006005] [<ffffffffb4bf2202>] ? irq_work_interrupt+0x82/0x90
[147966.012108] [<ffffffffb477a7b4>] ? panic+0x1e6/0x226
[147966.017254] [<ffffffffb477a7ad>] ? panic+0x1df/0x226
[147966.022404] [<ffffffffb462fa42>] ? oops_end+0xc2/0xd0
[147966.027635] [<ffffffffb46651e8>] ? no_context+0x128/0x370
[147966.033217] [<ffffffffb46a56d0>] ? wake_up_q+0x60/0x60
[147966.038538] [<ffffffffb4bf0e58>] ? page_fault+0x28/0x30
[147966.043948] [<ffffffffb4bb8b19>] ? icmp6_send+0x229/0x9f0
[147966.049532] [<ffffffffc07330df>] ? ctnetlink_conntrack_event+0x3ff/0x620 [nf_conntrack_netlink]
[147966.058404] [<ffffffffc068e94a>] ? nf_nat_cleanup_conntrack+0xea/0x1a0 [nf_nat]
[147966.065888] [<ffffffffb4b74653>] ? get_frag_bucket_locked+0x43/0x70
[147966.072340] [<ffffffffc0696330>] ? nf_ct_net_init+0x130/0x130 [nf_defrag_ipv6]
[147966.079738] [<ffffffffb4bbff78>] ? ip6_expire_frag_queue+0xf8/0x100
[147966.086186] [<ffffffffb46e6e80>] ? call_timer_fn+0x30/0x120
[147966.091941] [<ffffffffb46e7406>] ? run_timer_softirq+0x216/0x4b0
[147966.098127] [<ffffffffb46f7ac0>] ? tick_sched_handle.isra.12+0x20/0x50
[147966.104835] [<ffffffffb46f7b28>] ? tick_sched_timer+0x38/0x70
[147966.110764] [<ffffffffb4bf2598>] ? __do_softirq+0xf8/0x290
[147966.116431] [<ffffffffb4681abb>] ? irq_exit+0x9b/0xa0
[147966.121664] [<ffffffffb4bf23ae>] ? smp_apic_timer_interrupt+0x3e/0x50
[147966.128285] [<ffffffffb4bf16c2>] ? apic_timer_interrupt+0x82/0x90
[147966.134557] <EOI> [<ffffffffb4ab0f06>] ? cpuidle_enter_state+0x126/0x2d0
[147966.141555] [<ffffffffb4ab0ef3>] ? cpuidle_enter_state+0x113/0x2d0
[147966.147916] [<ffffffffb46bd742>] ? cpu_startup_entry+0x2a2/0x350
[147966.154103] [<ffffffffb464eddd>] ? start_secondary+0x14d/0x190
[147966.160117] ---[ end trace d5725bb00a2f3d6c ]---
Regards,
Chris
--
Chris Boot
bootc@bootc.net
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox