Netdev List
 help / color / mirror / Atom feed
* [PATCH 2/7] netfilter: nf_flow_table: fix offload for flows that are subject to xfrm
From: Pablo Neira Ayuso @ 2019-08-14  9:24 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190814092440.20087-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

This makes the previously added 'encap test' pass.
Because its possible that the xfrm dst entry becomes stale while such
a flow is offloaded, we need to call dst_check() -- the notifier that
handles this for non-tunneled traffic isn't sufficient, because SA or
or policies might have changed.

If dst becomes stale the flow offload entry will be tagged for teardown
and packets will be passed to 'classic' forwarding path.

Removing the entry right away is problematic, as this would
introduce a race condition with the gc worker.

In case flow is long-lived, it could eventually be offloaded again
once the gc worker removes the entry from the flow table.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_flow_table_ip.c | 43 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index cdfc33517e85..d68c801dd614 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -214,6 +214,25 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
 	return true;
 }
 
+static int nf_flow_offload_dst_check(struct dst_entry *dst)
+{
+	if (unlikely(dst_xfrm(dst)))
+		return dst_check(dst, 0) ? 0 : -1;
+
+	return 0;
+}
+
+static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
+				      const struct nf_hook_state *state,
+				      struct dst_entry *dst)
+{
+	skb_orphan(skb);
+	skb_dst_set_noref(skb, dst);
+	skb->tstamp = 0;
+	dst_output(state->net, state->sk, skb);
+	return NF_STOLEN;
+}
+
 unsigned int
 nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 			const struct nf_hook_state *state)
@@ -254,6 +273,11 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
 		return NF_ACCEPT;
 
+	if (nf_flow_offload_dst_check(&rt->dst)) {
+		flow_offload_teardown(flow);
+		return NF_ACCEPT;
+	}
+
 	if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0)
 		return NF_DROP;
 
@@ -261,6 +285,13 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	iph = ip_hdr(skb);
 	ip_decrease_ttl(iph);
 
+	if (unlikely(dst_xfrm(&rt->dst))) {
+		memset(skb->cb, 0, sizeof(struct inet_skb_parm));
+		IPCB(skb)->iif = skb->dev->ifindex;
+		IPCB(skb)->flags = IPSKB_FORWARDED;
+		return nf_flow_xmit_xfrm(skb, state, &rt->dst);
+	}
+
 	skb->dev = outdev;
 	nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr);
 	skb_dst_set_noref(skb, &rt->dst);
@@ -467,6 +498,11 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 				sizeof(*ip6h)))
 		return NF_ACCEPT;
 
+	if (nf_flow_offload_dst_check(&rt->dst)) {
+		flow_offload_teardown(flow);
+		return NF_ACCEPT;
+	}
+
 	if (skb_try_make_writable(skb, sizeof(*ip6h)))
 		return NF_DROP;
 
@@ -477,6 +513,13 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 	ip6h = ipv6_hdr(skb);
 	ip6h->hop_limit--;
 
+	if (unlikely(dst_xfrm(&rt->dst))) {
+		memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
+		IP6CB(skb)->iif = skb->dev->ifindex;
+		IP6CB(skb)->flags = IP6SKB_FORWARDED;
+		return nf_flow_xmit_xfrm(skb, state, &rt->dst);
+	}
+
 	skb->dev = outdev;
 	nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6);
 	skb_dst_set_noref(skb, &rt->dst);
-- 
2.11.0



^ permalink raw reply related

* [PATCH 1/7] selftests: netfilter: extend flowtable test script for ipsec
From: Pablo Neira Ayuso @ 2019-08-14  9:24 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev
In-Reply-To: <20190814092440.20087-1-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

'flow offload' expression should not offload flows that will be subject
to ipsec, but it does.

This results in a connectivity blackhole for the affected flows -- first
packets will go through (offload happens after established state is
reached), but all remaining ones bypass ipsec encryption and are thus
discarded by the peer.

This can be worked around by adding "rt ipsec exists accept"
before the 'flow offload' rule matches.

This test case will fail, support for such flows is added in
next patch.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 tools/testing/selftests/netfilter/nft_flowtable.sh | 48 ++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/tools/testing/selftests/netfilter/nft_flowtable.sh b/tools/testing/selftests/netfilter/nft_flowtable.sh
index fe52488a6f72..16571ac1dab4 100755
--- a/tools/testing/selftests/netfilter/nft_flowtable.sh
+++ b/tools/testing/selftests/netfilter/nft_flowtable.sh
@@ -321,4 +321,52 @@ else
 	ip netns exec nsr1 nft list ruleset
 fi
 
+KEY_SHA="0x"$(ps -xaf | sha1sum | cut -d " " -f 1)
+KEY_AES="0x"$(ps -xaf | md5sum | cut -d " " -f 1)
+SPI1=$RANDOM
+SPI2=$RANDOM
+
+if [ $SPI1 -eq $SPI2 ]; then
+	SPI2=$((SPI2+1))
+fi
+
+do_esp() {
+    local ns=$1
+    local me=$2
+    local remote=$3
+    local lnet=$4
+    local rnet=$5
+    local spi_out=$6
+    local spi_in=$7
+
+    ip -net $ns xfrm state add src $remote dst $me proto esp spi $spi_in  enc aes $KEY_AES  auth sha1 $KEY_SHA mode tunnel sel src $rnet dst $lnet
+    ip -net $ns xfrm state add src $me  dst $remote proto esp spi $spi_out enc aes $KEY_AES auth sha1 $KEY_SHA mode tunnel sel src $lnet dst $rnet
+
+    # to encrypt packets as they go out (includes forwarded packets that need encapsulation)
+    ip -net $ns xfrm policy add src $lnet dst $rnet dir out tmpl src $me dst $remote proto esp mode tunnel priority 1 action allow
+    # to fwd decrypted packets after esp processing:
+    ip -net $ns xfrm policy add src $rnet dst $lnet dir fwd tmpl src $remote dst $me proto esp mode tunnel priority 1 action allow
+
+}
+
+do_esp nsr1 192.168.10.1 192.168.10.2 10.0.1.0/24 10.0.2.0/24 $SPI1 $SPI2
+
+do_esp nsr2 192.168.10.2 192.168.10.1 10.0.2.0/24 10.0.1.0/24 $SPI2 $SPI1
+
+ip netns exec nsr1 nft delete table ip nat
+
+# restore default routes
+ip -net ns2 route del 192.168.10.1 via 10.0.2.1
+ip -net ns2 route add default via 10.0.2.1
+ip -net ns2 route add default via dead:2::1
+
+test_tcp_forwarding ns1 ns2
+if [ $? -eq 0 ] ;then
+	echo "PASS: ipsec tunnel mode for ns1/ns2"
+else
+	echo "FAIL: ipsec tunnel mode for ns1/ns2"
+	ip netns exec nsr1 nft list ruleset 1>&2
+	ip netns exec nsr1 cat /proc/net/xfrm_stat 1>&2
+fi
+
 exit $ret
-- 
2.11.0



^ permalink raw reply related

* [PATCH 0/7] Netfilter fixes for net
From: Pablo Neira Ayuso @ 2019-08-14  9:24 UTC (permalink / raw)
  To: netfilter-devel; +Cc: davem, netdev

Hi,

This patchset contains Netfilter fixes for net:

1) Extend selftest to cover flowtable with ipsec, from Florian Westphal.

2) Fix interaction of ipsec with flowtable, also from Florian.

3) User-after-free with bound set to rule that fails to load.

4) Adjust state and timeout for flows that expire.

5) Timeout update race with flows in teardown state.

6) Ensure conntrack id hash calculation use invariants as input,
   from Dirk Morris.

7) Do not push flows into flowtable for TCP fin/rst packets.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks.

----------------------------------------------------------------

The following changes since commit 5e5412c365a32e452daa762eac36121cb8a370bb:

  net/socket: fix GCC8+ Wpacked-not-aligned warnings (2019-08-03 11:02:46 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to dfe42be15fde16232340b8b2a57c359f51cc10d9:

  netfilter: nft_flow_offload: skip tcp rst and fin packets (2019-08-14 11:09:07 +0200)

----------------------------------------------------------------
Dirk Morris (1):
      netfilter: conntrack: Use consistent ct id hash calculation

Florian Westphal (2):
      selftests: netfilter: extend flowtable test script for ipsec
      netfilter: nf_flow_table: fix offload for flows that are subject to xfrm

Pablo Neira Ayuso (4):
      netfilter: nf_tables: use-after-free in failing rule with bound set
      netfilter: nf_flow_table: conntrack picks up expired flows
      netfilter: nf_flow_table: teardown flow timeout race
      netfilter: nft_flow_offload: skip tcp rst and fin packets

 include/net/netfilter/nf_tables.h                  |  9 +++-
 net/netfilter/nf_conntrack_core.c                  | 16 ++++----
 net/netfilter/nf_flow_table_core.c                 | 43 +++++++++++++------
 net/netfilter/nf_flow_table_ip.c                   | 43 +++++++++++++++++++
 net/netfilter/nf_tables_api.c                      | 15 ++++---
 net/netfilter/nft_flow_offload.c                   |  9 ++--
 tools/testing/selftests/netfilter/nft_flowtable.sh | 48 ++++++++++++++++++++++
 7 files changed, 153 insertions(+), 30 deletions(-)


^ permalink raw reply

* Re: [PATCH bpf-next 1/3] libbpf: add asm/unistd.h to xsk to get __NR_mmap2
From: Ivan Khoronzhuk @ 2019-08-14  9:24 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: Magnus Karlsson, Björn Töpel, David S. Miller,
	Jesper Dangaard Brouer, john fastabend, Jakub Kicinski,
	Daniel Borkmann, Networking, bpf, xdp-newbies, open list
In-Reply-To: <CAEf4BzZ2y_DmTXkVqFh6Hdcquo6UvntvCygw5h5WwrWYXRRg_g@mail.gmail.com>

On Tue, Aug 13, 2019 at 04:38:13PM -0700, Andrii Nakryiko wrote:

Hi, Andrii

>On Tue, Aug 13, 2019 at 3:24 AM Ivan Khoronzhuk
><ivan.khoronzhuk@linaro.org> wrote:
>>
>> That's needed to get __NR_mmap2 when mmap2 syscall is used.
>>
>> Signed-off-by: Ivan Khoronzhuk <ivan.khoronzhuk@linaro.org>
>> ---
>>  tools/lib/bpf/xsk.c | 1 +
>>  1 file changed, 1 insertion(+)
>>
>> diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c
>> index 5007b5d4fd2c..f2fc40f9804c 100644
>> --- a/tools/lib/bpf/xsk.c
>> +++ b/tools/lib/bpf/xsk.c
>> @@ -12,6 +12,7 @@
>>  #include <stdlib.h>
>>  #include <string.h>
>>  #include <unistd.h>
>> +#include <asm/unistd.h>
>
>asm/unistd.h is not present in Github libbpf projection. Is there any

Look on includes from
tools/lib/bpf/libpf.c
tools/lib/bpf/bpf.c

That's how it's done... Copping headers to arch/arm will not
solve this, it includes both of them anyway, and anyway it needs
asm/unistd.h inclusion here, only because xsk.c needs __NR_*


>way to avoid including this header? Generally, libbpf can't easily use
>all of kernel headers, we need to re-implemented all the extra used
>stuff for Github version of libbpf, so we try to minimize usage of new
>headers that are not just plain uapi headers from include/uapi.

Yes I know, it's far away from real number of changes needed.
I faced enough about this already and kernel headers, especially
for arm32 it's a bit decency problem. But this patch it's part of
normal one. I have couple issues despite this normally fixed mmap2
that is the same even if uapi includes are coppied to tools/arch/arm.

In continuation of kernel headers inclusion and arm build:

For instance, what about this rough "kernel headers" hack:
https://github.com/ikhorn/af_xdp_stuff/commit/aa645ccca4d844f404ec3c2b27402d4d7848d1b5

or this one related for arm32 only:
https://github.com/ikhorn/af_xdp_stuff/commit/2c6c6d538605aac39600dcb3c9b66de11c70b963

I have more...

>
>>  #include <arpa/inet.h>
>>  #include <asm/barrier.h>
>>  #include <linux/compiler.h>
>> --
>> 2.17.1
>>

-- 
Regards,
Ivan Khoronzhuk

^ permalink raw reply

* [PATCH] can: rcar_can: Remove unused platform data support
From: Geert Uytterhoeven @ 2019-08-14  9:22 UTC (permalink / raw)
  To: Wolfgang Grandegger, Marc Kleine-Budde, Sergei Shtylyov
  Cc: David S . Miller, Wolfram Sang, linux-can, linux-renesas-soc,
	netdev, Geert Uytterhoeven

All R-Car platforms use DT for describing CAN controllers.
R-Car CAN platform data support was never used in any upstream kernel.

Move the Clock Select Register settings enum into the driver, and remove
platform data support and the corresponding header file.

Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 drivers/net/can/rcar/rcar_can.c       | 22 +++++++++-------------
 include/linux/can/platform/rcar_can.h | 18 ------------------
 2 files changed, 9 insertions(+), 31 deletions(-)
 delete mode 100644 include/linux/can/platform/rcar_can.h

diff --git a/drivers/net/can/rcar/rcar_can.c b/drivers/net/can/rcar/rcar_can.c
index cf218949a8fb52d5..3c5e9c2c5342147f 100644
--- a/drivers/net/can/rcar/rcar_can.c
+++ b/drivers/net/can/rcar/rcar_can.c
@@ -15,11 +15,17 @@
 #include <linux/can/led.h>
 #include <linux/can/dev.h>
 #include <linux/clk.h>
-#include <linux/can/platform/rcar_can.h>
 #include <linux/of.h>
 
 #define RCAR_CAN_DRV_NAME	"rcar_can"
 
+/* Clock Select Register settings */
+enum CLKR {
+	CLKR_CLKP1 = 0,	/* Peripheral clock (clkp1) */
+	CLKR_CLKP2 = 1,	/* Peripheral clock (clkp2) */
+	CLKR_CLKEXT = 3	/* Externally input clock */
+};
+
 #define RCAR_SUPPORTED_CLOCKS	(BIT(CLKR_CLKP1) | BIT(CLKR_CLKP2) | \
 				 BIT(CLKR_CLKEXT))
 
@@ -736,7 +742,6 @@ static const char * const clock_names[] = {
 
 static int rcar_can_probe(struct platform_device *pdev)
 {
-	struct rcar_can_platform_data *pdata;
 	struct rcar_can_priv *priv;
 	struct net_device *ndev;
 	struct resource *mem;
@@ -745,17 +750,8 @@ static int rcar_can_probe(struct platform_device *pdev)
 	int err = -ENODEV;
 	int irq;
 
-	if (pdev->dev.of_node) {
-		of_property_read_u32(pdev->dev.of_node,
-				     "renesas,can-clock-select", &clock_select);
-	} else {
-		pdata = dev_get_platdata(&pdev->dev);
-		if (!pdata) {
-			dev_err(&pdev->dev, "No platform data provided!\n");
-			goto fail;
-		}
-		clock_select = pdata->clock_select;
-	}
+	of_property_read_u32(pdev->dev.of_node, "renesas,can-clock-select",
+			     &clock_select);
 
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
diff --git a/include/linux/can/platform/rcar_can.h b/include/linux/can/platform/rcar_can.h
deleted file mode 100644
index a43dcd0cf79ee3ec..0000000000000000
--- a/include/linux/can/platform/rcar_can.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _CAN_PLATFORM_RCAR_CAN_H_
-#define _CAN_PLATFORM_RCAR_CAN_H_
-
-#include <linux/types.h>
-
-/* Clock Select Register settings */
-enum CLKR {
-	CLKR_CLKP1 = 0,	/* Peripheral clock (clkp1) */
-	CLKR_CLKP2 = 1,	/* Peripheral clock (clkp2) */
-	CLKR_CLKEXT = 3	/* Externally input clock */
-};
-
-struct rcar_can_platform_data {
-	enum CLKR clock_select;	/* Clock source select */
-};
-
-#endif	/* !_CAN_PLATFORM_RCAR_CAN_H_ */
-- 
2.17.1


^ permalink raw reply related

* Re: [v2, 4/4] ocelot: add VCAP IS2 rule to trap PTP Ethernet frames
From: Allan W . Nielsen @ 2019-08-14  9:16 UTC (permalink / raw)
  To: Y.b. Lu
  Cc: Andrew Lunn, netdev@vger.kernel.org, David S . Miller,
	Alexandre Belloni, Microchip Linux Driver Support
In-Reply-To: <VI1PR0401MB2237E0F32D6CC719682E8C1AF8AD0@VI1PR0401MB2237.eurprd04.prod.outlook.com>

The 08/14/2019 04:56, Y.b. Lu wrote:
> > -----Original Message-----
> > From: Allan W . Nielsen <allan.nielsen@microchip.com>
> > Sent: Tuesday, August 13, 2019 2:25 PM
> > To: Y.b. Lu <yangbo.lu@nxp.com>
> > Cc: netdev@vger.kernel.org; David S . Miller <davem@davemloft.net>;
> > Alexandre Belloni <alexandre.belloni@bootlin.com>; Microchip Linux Driver
> > Support <UNGLinuxDriver@microchip.com>
> > Subject: Re: [v2, 4/4] ocelot: add VCAP IS2 rule to trap PTP Ethernet frames
> > 
> > The 08/13/2019 10:52, Yangbo Lu wrote:
> > > All the PTP messages over Ethernet have etype 0x88f7 on them.
> > > Use etype as the key to trap PTP messages.
> > >
> > > Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
> > > ---
> > > Changes for v2:
> > > 	- Added this patch.
> > > ---
> > >  drivers/net/ethernet/mscc/ocelot.c | 28 ++++++++++++++++++++++++++++
> > >  1 file changed, 28 insertions(+)
> > >
> > > diff --git a/drivers/net/ethernet/mscc/ocelot.c
> > > b/drivers/net/ethernet/mscc/ocelot.c
> > > index 6932e61..40f4e0d 100644
> > > --- a/drivers/net/ethernet/mscc/ocelot.c
> > > +++ b/drivers/net/ethernet/mscc/ocelot.c
> > > @@ -1681,6 +1681,33 @@ int ocelot_probe_port(struct ocelot *ocelot, u8
> > > port,  }  EXPORT_SYMBOL(ocelot_probe_port);
> > >
> > > +static int ocelot_ace_add_ptp_rule(struct ocelot *ocelot) {
> > > +	struct ocelot_ace_rule *rule;
> > > +
> > > +	rule = kzalloc(sizeof(*rule), GFP_KERNEL);
> > > +	if (!rule)
> > > +		return -ENOMEM;
> > > +
> > > +	/* Entry for PTP over Ethernet (etype 0x88f7)
> > > +	 * Action: trap to CPU port
> > > +	 */
> > > +	rule->ocelot = ocelot;
> > > +	rule->prio = 1;
> > > +	rule->type = OCELOT_ACE_TYPE_ETYPE;
> > > +	/* Available on all ingress port except CPU port */
> > > +	rule->ingress_port = ~BIT(ocelot->num_phys_ports);
> > > +	rule->dmac_mc = OCELOT_VCAP_BIT_1;
> > > +	rule->frame.etype.etype.value[0] = 0x88;
> > > +	rule->frame.etype.etype.value[1] = 0xf7;
> > > +	rule->frame.etype.etype.mask[0] = 0xff;
> > > +	rule->frame.etype.etype.mask[1] = 0xff;
> > > +	rule->action = OCELOT_ACL_ACTION_TRAP;
> > > +
> > > +	ocelot_ace_rule_offload_add(rule);
> > > +	return 0;
> > > +}
> > > +
> > >  int ocelot_init(struct ocelot *ocelot)  {
> > >  	u32 port;
> > > @@ -1708,6 +1735,7 @@ int ocelot_init(struct ocelot *ocelot)
> > >  	ocelot_mact_init(ocelot);
> > >  	ocelot_vlan_init(ocelot);
> > >  	ocelot_ace_init(ocelot);
> > > +	ocelot_ace_add_ptp_rule(ocelot);
> > >
> > >  	for (port = 0; port < ocelot->num_phys_ports; port++) {
> > >  		/* Clear all counters (5 groups) */
> > This seems really wrong to me, and much too hard-coded...
> > 
> > What if I want to forward the PTP frames to be forwarded like a normal
> > non-aware PTP switch?
> 
> [Y.b. Lu] As Andrew said, other switches could identify PTP messages and forward to CPU for processing.
> https://patchwork.ozlabs.org/patch/1145627/
Yes, it would be good to see some exampels to understand this better.

> I'm also wondering whether there is common method in linux to address your questions.
Me too.

> If no, I think trapping all PTP messages on all ports to CPU could be used for now.
> If users require PTP synchronization, they actually don’t want a non-aware PTP switch.
Can we continue this discussion in the other thread where I listed the 3
scenarios?

> I once see other ocelot code configure ptp trap rules in ioctl timestamping
> setting. But I don’t think it's proper either.  Enable timestamping doesn’t
> mean we want to trap PTP messages.
Where did you see this?

The effort in [1] is just about the time-stamping and does not really consider
the bridge part of it, and it should not be installing any TCAM rules (I believe
it did in earlier versions, but this has been changed).

[1] https://patchwork.ozlabs.org/patch/1145777/

> > What if do not want this on all ports?
> [Y.b. Lu] Actually I don’t think there should be difference of handling PTP messages on each port.
> You don’t need to run PTP protocol application on the specific port if you don’t want.
What if you want some vlans or some ports to be PTP unaware, and other to be PTP
aware.

> > If you do not have an application behind this implementing a boundary or
> > transparent clock, then you are breaking PTP on the network.
> [Y.b. Lu] You're right. But actually for PTP network, all PTP devices should run PTP protocol on it.
> Of course, it's better to have a way to configure it as non-aware PTP switch.
I think we agree.

In my point of view, it is the PTP daemon who should configure frames to be
trapped. Then the switch will be PTP unaware until the PTP daemon starts up and
is ready to make it aware.

If we put it in the init function, then it will be of PTP broken until the PTP
daemon starts.

/Allan


^ permalink raw reply

* [PATCH net] net/packet: fix race in tpacket_snd()
From: Eric Dumazet @ 2019-08-14  9:11 UTC (permalink / raw)
  To: David S . Miller; +Cc: netdev, Eric Dumazet, Eric Dumazet, syzbot

packet_sendmsg() checks tx_ring.pg_vec to decide
if it must call tpacket_snd().

Problem is that the check is lockless, meaning another thread
can issue a concurrent setsockopt(PACKET_TX_RING ) to flip
tx_ring.pg_vec back to NULL.

Given that tpacket_snd() grabs pg_vec_lock mutex, we can
perform the check again to solve the race.

syzbot reported :

kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault: 0000 [#1] PREEMPT SMP KASAN
CPU: 1 PID: 11429 Comm: syz-executor394 Not tainted 5.3.0-rc4+ #101
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:packet_lookup_frame+0x8d/0x270 net/packet/af_packet.c:474
Code: c1 ee 03 f7 73 0c 80 3c 0e 00 0f 85 cb 01 00 00 48 8b 0b 89 c0 4c 8d 24 c1 48 b8 00 00 00 00 00 fc ff df 4c 89 e1 48 c1 e9 03 <80> 3c 01 00 0f 85 94 01 00 00 48 8d 7b 10 4d 8b 3c 24 48 b8 00 00
RSP: 0018:ffff88809f82f7b8 EFLAGS: 00010246
RAX: dffffc0000000000 RBX: ffff8880a45c7030 RCX: 0000000000000000
RDX: 0000000000000000 RSI: 1ffff110148b8e06 RDI: ffff8880a45c703c
RBP: ffff88809f82f7e8 R08: ffff888087aea200 R09: fffffbfff134ae50
R10: fffffbfff134ae4f R11: ffffffff89a5727f R12: 0000000000000000
R13: 0000000000000001 R14: ffff8880a45c6ac0 R15: 0000000000000000
FS:  00007fa04716f700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fa04716edb8 CR3: 0000000091eb4000 CR4: 00000000001406e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 packet_current_frame net/packet/af_packet.c:487 [inline]
 tpacket_snd net/packet/af_packet.c:2667 [inline]
 packet_sendmsg+0x590/0x6250 net/packet/af_packet.c:2975
 sock_sendmsg_nosec net/socket.c:637 [inline]
 sock_sendmsg+0xd7/0x130 net/socket.c:657
 ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311
 __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413
 __do_sys_sendmmsg net/socket.c:2442 [inline]
 __se_sys_sendmmsg net/socket.c:2439 [inline]
 __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2439
 do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Fixes: 69e3c75f4d54 ("net: TX_RING and packet mmap")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: syzbot <syzkaller@googlegroups.com>
---
 net/packet/af_packet.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8d54f3047768d2272cbd28a7bcda33df800aa589..e2742b006d255f598fc98953dbb823f615d2bf9a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2618,6 +2618,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
 
 	mutex_lock(&po->pg_vec_lock);
 
+	/* packet_sendmsg() check on tx_ring.pg_vec was lockless,
+	 * we need to confirm it under protection of pg_vec_lock.
+	 */
+	if (unlikely(!po->tx_ring.pg_vec)) {
+		err = -EBUSY;
+		goto out;
+	}
 	if (likely(saddr == NULL)) {
 		dev	= packet_cached_dev_get(po);
 		proto	= po->num;
-- 
2.23.0.rc1.153.gdeed80330f-goog


^ permalink raw reply related

* Re: [PATCH v4 13/14] net: phy: adin: add ethtool get_stats support
From: Ardelean, Alexandru @ 2019-08-14  9:08 UTC (permalink / raw)
  To: andrew@lunn.ch
  Cc: davem@davemloft.net, hkallweit1@gmail.com,
	devicetree@vger.kernel.org, mark.rutland@arm.com,
	linux-kernel@vger.kernel.org, f.fainelli@gmail.com,
	netdev@vger.kernel.org, robh+dt@kernel.org
In-Reply-To: <c3fdb21c40900dae0e52b02b98fe27924a76c256.camel@analog.com>

On Tue, 2019-08-13 at 08:48 +0300, Alexandru Ardelean wrote:
> On Mon, 2019-08-12 at 16:33 +0200, Andrew Lunn wrote:
> > [External]
> > 
> > > +static int adin_read_mmd_stat_regs(struct phy_device *phydev,
> > > +				   struct adin_hw_stat *stat,
> > > +				   u32 *val)
> > > +{
> > > +	int ret;
> > > +
> > > +	ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg1);
> > > +	if (ret < 0)
> > > +		return ret;
> > > +
> > > +	*val = (ret & 0xffff);
> > > +
> > > +	if (stat->reg2 == 0)
> > > +		return 0;
> > > +
> > > +	ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg2);
> > > +	if (ret < 0)
> > > +		return ret;
> > > +
> > > +	*val <<= 16;
> > > +	*val |= (ret & 0xffff);
> > > +
> > > +	return 0;
> > > +}
> > 
> > It still looks like you have not dealt with overflow from the LSB into
> > the MSB between the two reads.
> 
> Apologies for forgetting to address this.
> I did not intentionally leave it out; this item got lost after V1 [which had the most remarks].
> Changelog V1 -> V2 was quite bulky, and I did not look at V1 remarks after I finished V2.
> 
> Thanks for snippet.

So, I have to apologize again here.
I guess I was an idiot/n00b about this.

The PHY stats do support snapshot, and I sync-ed with someone from the chip-team to confirm.

Also, from the datasheet[1] (page 29 - FRAME GENERATOR AND CHECKER - 5th paragraph):
--------------------------------------------------------------
The frame checker counts the number of CRC errors and these 
are reported in the receive error counter register (RxErrCnt 
register,  address 0x0014). To ensure synchronization between 
the frame checker error counter and frame checker frame counters,
all of the counters are latched once the receive error counter
register is read. Hence when using the frame checker, the
receive error counter should be read first and then all the other
frame counters and error counters should be read. A latched copy
of the receive frame counter register is available in the
FcFrmCntH and FcFrmCntL registers (register addresses 0x1E.0x940A
and 0x1E.0x940B).
-------------------------------------------------------------

Then in the description of these regs, it mentions (repeteadly):
-------------------------------------------------------------
This register is a latched copy of bits 31:16 of the 32-bit
receive frame counter register. When the receive error counter
(RxErrCnt register address 0x0014) is read, the receive
frame
counter register is latched. A copy of the receive frame counter
register is latched when RxErrCnt is read so that
the error count
and receive frame count are synchronized
-------------------------------------------------------------

I'll re-spin this with the rename of the strings, and maybe do a minor polish of the code.

Thanks & sorry for the noise/trouble
Alex

[1] https://www.analog.com/media/en/technical-documentation/data-sheets/ADIN1300.pdf


> 
> > 	do {
> > 		hi1 = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg2);
> > 		if (hi1 < 0)
> > 			return hi1;
> > 		
> > 		low = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg1);
> > 		if (low < 0)
> > 			return low;
> > 
> > 		hi2 = phy_read_mmd(phydev, MDIO_MMD_VEND1, stat->reg2);
> > 		if (hi2 < 0)
> > 			return hi1;
> > 	} while (hi1 != hi2)
> > 
> > 	return low | (hi << 16);
> > 
> > 	Andrew

^ permalink raw reply

* Re: [PATCH 3/3] ocelot_ace: fix action of trap
From: Allan W. Nielsen @ 2019-08-14  8:57 UTC (permalink / raw)
  To: Y.b. Lu
  Cc: Andrew Lunn, netdev@vger.kernel.org, David S . Miller,
	Alexandre Belloni, Microchip Linux Driver Support
In-Reply-To: <VI1PR0401MB2237D9358AA17400E72A776EF8AD0@VI1PR0401MB2237.eurprd04.prod.outlook.com>

Hi Y.b. and Andrew,

The 08/14/2019 04:28, Y.b. Lu wrote:
> > > I'd like to trap all IEEE 1588 PTP Ethernet frames to CPU through etype
> > 0x88f7.
> > 
> > Is this the correct way to handle PTP for this switch? For other switches we
> > don't need such traps. The switch itself identifies PTP frames and forwards
> > them to the CPU so it can process them.
> > 
> > I'm just wondering if your general approach is wrong?
> 
> [Y.b. Lu] PTP messages over Ethernet will use two multicast addresses.
> 01-80-C2-00-00-0E for peer delay messages.
Yes, and as you write, this is a BPDU which must not be forwarded (and they are
not).

> 01-1B-19-00-00-00 for other messages.
Yes, this is a normal L2 multicast address, which by default are broadcastet.

> But only 01-80-C2-00-00-0E could be handled by hardware filter for BPDU frames
> (01-80-C2-00-00-0x).  For PTP messages handling, trapping them to CPU through
> VCAP IS2 is the suggested way by Ocelot/Felix.
As I see it there are at least 3 scenarios which needs to be considered and
ideally supported:

1) Operate as a PTP-unaware switch. This means that Peer-delays messages are
   trapped to the CPU and not handled. End-to-End PTP sessions can still run on
   the network as 01-1B-19-00-00-00 frames are forwarded normally.

   This is what we have by default today.

2) A "passive" PTP switch (in MSCC/MCHP we call this an end-to-end transparent
   clock) where 01-80-C2-00-00-0E frames are still trapped to the CPU (and not
   handled), 01-1B-19-00-00-00 frames are forwarded, but we use the TCAM to add
   the residence time to the correction field in Sync and Delay request
   messages.

   This is a simple mechanism which allow end-to-end PTP sessions to synchronize
   their time, and compensate for the variable delay caused by the switch.

   Compared to implement a complete boundary clock, this is much simpler, and
   cause a much lower work load on the CPU (even small switches may be serving
   many many PTP sessions).

3) Full PTP aware switch. In this mode we need all PTP frames trapped (on the
   ports where PTP are running) to the CPU, and we need a PTP daemon in
   user-space to process them in-order for things to work.

   I guess this is what you are trying to achieve.

Eventually, I hope we can get to a point where all (and maybe more) scenarios
are supported.

Lets consider them case by case:

1) This is what we have today.

2) To support this, we need a SW implementation of this, and then we can add
   hooks to offload this in HW.

   We would certainly be interested in supporting this in both SW and HW.

3) It can be done via 'tc' using the trap action, but I do not know if this is
   the desired way of doing it. Ocelot will be using a TCAM rule to do this,
   which align nicely with the 'tc' approach, but other chips may be have
   dedicated HW for doing this.

   Also, in the current implementation we will be using a rule per port, and
   ideally we could have done it with a single rule (this is what Y.B. prepared
   in this patch series).

I'm very much against configuring option 3 in the driver initialization, as it
will prevent us from having a conforming switch if a PTP daemon is not running
in user-space, and it give us very little room for supporting other ways in the
future without breaking backwards compatibility.

> I have a question since you are experts.
I'm not really an expert on this, but I have access to some good guidance from
collages knowing PTP very well :-D

> For other switches, whether they are always trapping PTP messages to CPU?
Good question, I could not find anything in the SW bridge forcing option 3.

My understanding is that the SW bridge is implementing option 1, but I could be
wrong.

> Is there any common method in linux to configure switch to select trapping or
> just forwarding PTP messages?
You should be able to use TC for this. But due to the port vs port-mask
limitation you will need to install a rule per port.

I do not know if this is what others are doing, but would like to learn about
that.

/Allan


^ permalink raw reply

* Re: [PATCH net-next v2 6/9] net: macsec: hardware offloading infrastructure
From: Antoine Tenart @ 2019-08-14  8:32 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Igor Russkikh, Antoine Tenart, davem@davemloft.net,
	sd@queasysnail.net, f.fainelli@gmail.com, hkallweit1@gmail.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	thomas.petazzoni@bootlin.com, alexandre.belloni@bootlin.com,
	allan.nielsen@microchip.com, camelia.groza@nxp.com,
	Simon Edelhaus, Pavel Belous
In-Reply-To: <20190813162823.GH15047@lunn.ch>

Hi Andrew,

On Tue, Aug 13, 2019 at 06:28:23PM +0200, Andrew Lunn wrote:
> 
> It would also be nice to add extra information to the netlink API to
> indicate if HW or SW is being used. In other places where we offload
> to accelerators we have such additional information.

Yes, that would be very nice to have.

Thanks!
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* Re: [PATCH net-next v2 6/9] net: macsec: hardware offloading infrastructure
From: Antoine Tenart @ 2019-08-14  8:31 UTC (permalink / raw)
  To: Igor Russkikh
  Cc: Andrew Lunn, Antoine Tenart, davem@davemloft.net,
	sd@queasysnail.net, f.fainelli@gmail.com, hkallweit1@gmail.com,
	netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	thomas.petazzoni@bootlin.com, alexandre.belloni@bootlin.com,
	allan.nielsen@microchip.com, camelia.groza@nxp.com,
	Simon Edelhaus, Pavel Belous
In-Reply-To: <2e3c2307-d414-a531-26cb-064e05fa01fc@aquantia.com>

Hi Igor,

On Tue, Aug 13, 2019 at 04:18:40PM +0000, Igor Russkikh wrote:
> On 13.08.2019 16:17, Andrew Lunn wrote:
> > On Tue, Aug 13, 2019 at 10:58:17AM +0200, Antoine Tenart wrote:
> >> I think this question is linked to the use of a MACsec virtual interface
> >> when using h/w offloading. The starting point for me was that I wanted
> >> to reuse the data structures and the API exposed to the userspace by the
> >> s/w implementation of MACsec. I then had two choices: keeping the exact
> >> same interface for the user (having a virtual MACsec interface), or
> >> registering the MACsec genl ops onto the real net devices (and making
> >> the s/w implementation a virtual net dev and a provider of the MACsec
> >> "offloading" ops).
> >>
> >> The advantages of the first option were that nearly all the logic of the
> >> s/w implementation could be kept and especially that it would be
> >> transparent for the user to use both implementations of MACsec.
> > 
> > We have always talked about offloading operations to the hardware,
> > accelerating what the linux stack can do by making use of hardware
> > accelerators. The basic user API should not change because of
> > acceleration. Those are the general guidelines.
> > 
> > It would however be interesting to get comments from those who did the
> > software implementation and what they think of this architecture. I've
> > no personal experience with MACSec, so it is hard for me to say if the
> > current architecture makes sense when using accelerators.
> 
> In terms of overall concepts, I'd add the following:
> 
> 1) With current implementation it's impossible to install SW macsec engine onto
> the device which supports HW offload. That could be a strong limitation in
> cases when user sees HW macsec offload is broken or work differently, and he/she
> wants to replace it with SW one.
> MACSec is a complex feature, and it may happen something is missing in HW.
> Trivial example is 256bit encryption, which is not always a musthave in HW
> implementations.

Agreed. I'm not sure it would be possible to have both used at the same
time but there should be a way to switch between the two
implementations. That is not supported for now, but I think that would
be a good thing, and can probably come later on.

> 2) I think, Antoine, its not totally true that otherwise the user macsec API
> will be broken/changed. netlink api is the same, the only thing we may want to
> add is an optional parameter to force selection of SW macsec engine.

I meant that we can either have a virtual net device representing the
MACsec feature and being the iface used to configure it, or we could
have it only when s/w MACsec is used. That to me is part of the "API",
or at least part of what's exposed to the user.

> I'm also eager to hear from sw macsec users/devs on whats better here.

I'd like more comments as well :)

Thanks!
Antoine

-- 
Antoine Ténart, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* [PATCH net-next] r8152: divide the tx and rx bottom functions
From: Hayes Wang @ 2019-08-14  8:30 UTC (permalink / raw)
  To: netdev; +Cc: nic_swsd, linux-kernel, Hayes Wang

Move the tx bottom function from NAPI to a new tasklet. Then, for
multi-cores, the bottom functions of tx and rx may be run at same
time with different cores. This is used to improve performance.

Signed-off-by: Hayes Wang <hayeswang@realtek.com>
---
 drivers/net/usb/r8152.c | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c
index 40d18e866269..3ed9f8e082c9 100644
--- a/drivers/net/usb/r8152.c
+++ b/drivers/net/usb/r8152.c
@@ -619,7 +619,7 @@ enum rtl8152_flags {
 	RTL8152_LINK_CHG,
 	SELECTIVE_SUSPEND,
 	PHY_RESET,
-	SCHEDULE_NAPI,
+	SCHEDULE_TASKLET,
 	GREEN_ETHERNET,
 	DELL_TB_RX_AGG_BUG,
 };
@@ -733,6 +733,7 @@ struct r8152 {
 #ifdef CONFIG_PM_SLEEP
 	struct notifier_block pm_notifier;
 #endif
+	struct tasklet_struct tx_tl;
 
 	struct rtl_ops {
 		void (*init)(struct r8152 *);
@@ -1401,7 +1402,7 @@ static void write_bulk_callback(struct urb *urb)
 		return;
 
 	if (!skb_queue_empty(&tp->tx_queue))
-		napi_schedule(&tp->napi);
+		tasklet_schedule(&tp->tx_tl);
 }
 
 static void intr_callback(struct urb *urb)
@@ -2179,8 +2180,12 @@ static void tx_bottom(struct r8152 *tp)
 	} while (res == 0);
 }
 
-static void bottom_half(struct r8152 *tp)
+static void bottom_half(unsigned long data)
 {
+	struct r8152 *tp;
+
+	tp = (struct r8152 *)data;
+
 	if (test_bit(RTL8152_UNPLUG, &tp->flags))
 		return;
 
@@ -2192,7 +2197,7 @@ static void bottom_half(struct r8152 *tp)
 	if (!netif_carrier_ok(tp->netdev))
 		return;
 
-	clear_bit(SCHEDULE_NAPI, &tp->flags);
+	clear_bit(SCHEDULE_TASKLET, &tp->flags);
 
 	tx_bottom(tp);
 }
@@ -2203,16 +2208,12 @@ static int r8152_poll(struct napi_struct *napi, int budget)
 	int work_done;
 
 	work_done = rx_bottom(tp, budget);
-	bottom_half(tp);
 
 	if (work_done < budget) {
 		if (!napi_complete_done(napi, work_done))
 			goto out;
 		if (!list_empty(&tp->rx_done))
 			napi_schedule(napi);
-		else if (!skb_queue_empty(&tp->tx_queue) &&
-			 !list_empty(&tp->tx_free))
-			napi_schedule(napi);
 	}
 
 out:
@@ -2366,11 +2367,11 @@ static netdev_tx_t rtl8152_start_xmit(struct sk_buff *skb,
 
 	if (!list_empty(&tp->tx_free)) {
 		if (test_bit(SELECTIVE_SUSPEND, &tp->flags)) {
-			set_bit(SCHEDULE_NAPI, &tp->flags);
+			set_bit(SCHEDULE_TASKLET, &tp->flags);
 			schedule_delayed_work(&tp->schedule, 0);
 		} else {
 			usb_mark_last_busy(tp->udev);
-			napi_schedule(&tp->napi);
+			tasklet_schedule(&tp->tx_tl);
 		}
 	} else if (skb_queue_len(&tp->tx_queue) > tp->tx_qlen) {
 		netif_stop_queue(netdev);
@@ -4020,9 +4021,11 @@ static void set_carrier(struct r8152 *tp)
 	} else {
 		if (netif_carrier_ok(netdev)) {
 			netif_carrier_off(netdev);
+			tasklet_disable(&tp->tx_tl);
 			napi_disable(napi);
 			tp->rtl_ops.disable(tp);
 			napi_enable(napi);
+			tasklet_enable(&tp->tx_tl);
 			netif_info(tp, link, netdev, "carrier off\n");
 		}
 	}
@@ -4055,10 +4058,10 @@ static void rtl_work_func_t(struct work_struct *work)
 	if (test_and_clear_bit(RTL8152_SET_RX_MODE, &tp->flags))
 		_rtl8152_set_rx_mode(tp->netdev);
 
-	/* don't schedule napi before linking */
-	if (test_and_clear_bit(SCHEDULE_NAPI, &tp->flags) &&
+	/* don't schedule tasket before linking */
+	if (test_and_clear_bit(SCHEDULE_TASKLET, &tp->flags) &&
 	    netif_carrier_ok(tp->netdev))
-		napi_schedule(&tp->napi);
+		tasklet_schedule(&tp->tx_tl);
 
 	mutex_unlock(&tp->control);
 
@@ -4144,6 +4147,7 @@ static int rtl8152_open(struct net_device *netdev)
 		goto out_unlock;
 	}
 	napi_enable(&tp->napi);
+	tasklet_enable(&tp->tx_tl);
 
 	mutex_unlock(&tp->control);
 
@@ -4171,6 +4175,7 @@ static int rtl8152_close(struct net_device *netdev)
 #ifdef CONFIG_PM_SLEEP
 	unregister_pm_notifier(&tp->pm_notifier);
 #endif
+	tasklet_disable(&tp->tx_tl);
 	if (!test_bit(RTL8152_UNPLUG, &tp->flags))
 		napi_disable(&tp->napi);
 	clear_bit(WORK_ENABLE, &tp->flags);
@@ -4440,6 +4445,7 @@ static int rtl8152_pre_reset(struct usb_interface *intf)
 		return 0;
 
 	netif_stop_queue(netdev);
+	tasklet_disable(&tp->tx_tl);
 	napi_disable(&tp->napi);
 	clear_bit(WORK_ENABLE, &tp->flags);
 	usb_kill_urb(tp->intr_urb);
@@ -4483,6 +4489,7 @@ static int rtl8152_post_reset(struct usb_interface *intf)
 	}
 
 	napi_enable(&tp->napi);
+	tasklet_enable(&tp->tx_tl);
 	netif_wake_queue(netdev);
 	usb_submit_urb(tp->intr_urb, GFP_KERNEL);
 
@@ -4636,10 +4643,12 @@ static int rtl8152_system_suspend(struct r8152 *tp)
 
 		clear_bit(WORK_ENABLE, &tp->flags);
 		usb_kill_urb(tp->intr_urb);
+		tasklet_disable(&tp->tx_tl);
 		napi_disable(napi);
 		cancel_delayed_work_sync(&tp->schedule);
 		tp->rtl_ops.down(tp);
 		napi_enable(napi);
+		tasklet_enable(&tp->tx_tl);
 	}
 
 	return 0;
@@ -5499,6 +5508,8 @@ static int rtl8152_probe(struct usb_interface *intf,
 	mutex_init(&tp->control);
 	INIT_DELAYED_WORK(&tp->schedule, rtl_work_func_t);
 	INIT_DELAYED_WORK(&tp->hw_phy_work, rtl_hw_phy_work_func_t);
+	tasklet_init(&tp->tx_tl, bottom_half, (unsigned long)tp);
+	tasklet_disable(&tp->tx_tl);
 
 	netdev->netdev_ops = &rtl8152_netdev_ops;
 	netdev->watchdog_timeo = RTL8152_TX_TIMEOUT;
@@ -5585,6 +5596,7 @@ static int rtl8152_probe(struct usb_interface *intf,
 
 out1:
 	netif_napi_del(&tp->napi);
+	tasklet_kill(&tp->tx_tl);
 	usb_set_intfdata(intf, NULL);
 out:
 	free_netdev(netdev);
@@ -5601,6 +5613,7 @@ static void rtl8152_disconnect(struct usb_interface *intf)
 
 		netif_napi_del(&tp->napi);
 		unregister_netdev(tp->netdev);
+		tasklet_kill(&tp->tx_tl);
 		cancel_delayed_work_sync(&tp->hw_phy_work);
 		tp->rtl_ops.unload(tp);
 		free_netdev(tp->netdev);
-- 
2.21.0


^ permalink raw reply related

* Re: [PATCH] net: ethernet: mediatek: Add MT7628/88 SoC support
From: Stefan Roese @ 2019-08-14  8:26 UTC (permalink / raw)
  To: René van Dorst
  Cc: netdev, linux-mediatek, Sean Wang, Felix Fietkau, John Crispin,
	Daniel Golle
In-Reply-To: <20190717125345.Horde.JcDE_nBChPFDDjEgIRfPSl3@www.vdorst.com>

Hi Rene,

On 17.07.19 14:53, René van Dorst wrote:

<snip>

>> +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h
>> @@ -39,7 +39,8 @@
>>   				 NETIF_F_SG | NETIF_F_TSO | \
>>   				 NETIF_F_TSO6 | \
>>   				 NETIF_F_IPV6_CSUM)
>> -#define NEXT_RX_DESP_IDX(X, Y)	(((X) + 1) & ((Y) - 1))
>> +#define MTK_HW_FEATURES_MT7628	(NETIF_F_SG | NETIF_F_RXCSUM)
>> +#define NEXT_DESP_IDX(X, Y)	(((X) + 1) & ((Y) - 1))
>>
>>   #define MTK_MAX_RX_RING_NUM	4
>>   #define MTK_HW_LRO_DMA_SIZE	8
>> @@ -118,6 +119,7 @@
>>   /* PDMA Global Configuration Register */
>>   #define MTK_PDMA_GLO_CFG	0xa04
>>   #define MTK_MULTI_EN		BIT(10)
>> +#define MTK_PDMA_SIZE_8DWORDS	(1 << 4)
>>
>>   /* PDMA Reset Index Register */
>>   #define MTK_PDMA_RST_IDX	0xa08
>> @@ -276,11 +278,18 @@
>>   #define TX_DMA_OWNER_CPU	BIT(31)
>>   #define TX_DMA_LS0		BIT(30)
>>   #define TX_DMA_PLEN0(_x)	(((_x) & MTK_TX_DMA_BUF_LEN) << 16)
>> +#define TX_DMA_PLEN1(_x)	((_x) & MTK_TX_DMA_BUF_LEN)
>>   #define TX_DMA_SWC		BIT(14)
>>   #define TX_DMA_SDL(_x)		(((_x) & 0x3fff) << 16)
>>
>> +/* PDMA on MT7628 */
>> +#define TX_DMA_DONE		BIT(31)
>> +#define TX_DMA_LS1		BIT(14)
>> +#define TX_DMA_DESP2_DEF	(TX_DMA_LS0 | TX_DMA_DONE)
>> +
>>   /* QDMA descriptor rxd2 */
>>   #define RX_DMA_DONE		BIT(31)
>> +#define RX_DMA_LSO		BIT(30)
>>   #define RX_DMA_PLEN0(_x)	(((_x) & 0x3fff) << 16)
>>   #define RX_DMA_GET_PLEN0(_x)	(((_x) >> 16) & 0x3fff)
>>
>> @@ -289,6 +298,7 @@
>>
>>   /* QDMA descriptor rxd4 */
>>   #define RX_DMA_L4_VALID		BIT(24)
>> +#define RX_DMA_L4_VALID_PDMA	BIT(30)		/* when PDMA is used */
>>   #define RX_DMA_FPORT_SHIFT	19
>>   #define RX_DMA_FPORT_MASK	0x7
>>
>> @@ -412,6 +422,19 @@
>>   #define CO_QPHY_SEL            BIT(0)
>>   #define GEPHY_MAC_SEL          BIT(1)
>>
>> +/* MT7628/88 specific stuff */
>> +#define MT7628_PDMA_OFFSET	0x0800
>> +#define MT7628_SDM_OFFSET	0x0c00
>> +
>> +#define MT7628_TX_BASE_PTR0	(MT7628_PDMA_OFFSET + 0x00)
>> +#define MT7628_TX_MAX_CNT0	(MT7628_PDMA_OFFSET + 0x04)
>> +#define MT7628_TX_CTX_IDX0	(MT7628_PDMA_OFFSET + 0x08)
>> +#define MT7628_TX_DTX_IDX0	(MT7628_PDMA_OFFSET + 0x0c)
>> +#define MT7628_PST_DTX_IDX0	BIT(0)
>> +
>> +#define MT7628_SDM_MAC_ADRL	(MT7628_SDM_OFFSET + 0x0c)
>> +#define MT7628_SDM_MAC_ADRH	(MT7628_SDM_OFFSET + 0x10)
>> +
>>   struct mtk_rx_dma {
>>   	unsigned int rxd1;
>>   	unsigned int rxd2;
>> @@ -509,6 +532,7 @@ enum mtk_clks_map {
>>   				 BIT(MTK_CLK_SGMII_CK) | \
>>   				 BIT(MTK_CLK_ETH2PLL))
>>   #define MT7621_CLKS_BITMAP	(0)
>> +#define MT7628_CLKS_BITMAP	(0)
>>   #define MT7629_CLKS_BITMAP	(BIT(MTK_CLK_ETHIF) | BIT(MTK_CLK_ESW) |  \
>>   				 BIT(MTK_CLK_GP0) | BIT(MTK_CLK_GP1) | \
>>   				 BIT(MTK_CLK_GP2) | BIT(MTK_CLK_FE) | \
>> @@ -563,6 +587,10 @@ struct mtk_tx_ring {
>>   	struct mtk_tx_dma *last_free;
>>   	u16 thresh;
>>   	atomic_t free_count;
>> +	int dma_size;
>> +	struct mtk_tx_dma *dma_pdma;	/* For MT7628/88 PDMA handling */
>> +	dma_addr_t phys_pdma;
>> +	int cpu_idx;
>>   };
>>
>>   /* PDMA rx ring mode */
>> @@ -604,6 +632,7 @@ enum mkt_eth_capabilities {
>>   	MTK_HWLRO_BIT,
>>   	MTK_SHARED_INT_BIT,
>>   	MTK_TRGMII_MT7621_CLK_BIT,
>> +	MTK_SOC_MT7628,
> 
> This should be MTK_SOC_MT7628_BIT, this only defines the bit number!
> 
> and futher on #define MTK_SOC_MT7628 BIT(MTK_SOC_MT7628_BIT)

Okay, thanks.
  
> Based on this commit [0], MT7621 also needs the PDMA for the RX path.
> I know that is not your issue but I think it is better to add a extra
> capability bit for the PDMA bits so it can also be used on other socs.

Yes, MT7621 also uses PDMA for RX. The code for RX is pretty much
shared (re-used), with slight changes for the MT7628/88 to work
correctly on this SoC.

I'll work on a capability bit for PDMA vs QDMA on TX though. This
might make things a little more transparent.
  
> Greats,
> 
> René
> 
> [0] https://lkml.org/lkml/2018/3/14/1038

Thanks,
Stefan

^ permalink raw reply

* Re: [v5,0/4] tools: bpftool: add net attach/detach command to attach XDP prog
From: Daniel T. Lee @ 2019-08-14  8:25 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: Daniel Borkmann, Alexei Starovoitov, netdev
In-Reply-To: <20190813144303.10da8ff0@cakuba.netronome.com>

On Wed, Aug 14, 2019 at 6:43 AM Jakub Kicinski
<jakub.kicinski@netronome.com> wrote:
>
> On Tue, 13 Aug 2019 11:46:17 +0900, Daniel T. Lee wrote:
> > Currently, bpftool net only supports dumping progs attached on the
> > interface. To attach XDP prog on interface, user must use other tool
> > (eg. iproute2). By this patch, with `bpftool net attach/detach`, user
> > can attach/detach XDP prog on interface.
> >
> >     # bpftool prog
> >         16: xdp  name xdp_prog1  tag 539ec6ce11b52f98  gpl
> >         loaded_at 2019-08-07T08:30:17+0900  uid 0
> >         ...
> >         20: xdp  name xdp_fwd_prog  tag b9cb69f121e4a274  gpl
> >         loaded_at 2019-08-07T08:30:17+0900  uid 0
> >
> >     # bpftool net attach xdpdrv id 16 dev enp6s0np0
> >     # bpftool net
> >     xdp:
> >         enp6s0np0(4) driver id 16
> >
> >     # bpftool net attach xdpdrv id 20 dev enp6s0np0 overwrite
> >     # bpftool net
> >     xdp:
> >         enp6s0np0(4) driver id 20
> >
> >     # bpftool net detach xdpdrv dev enp6s0np0
> >     # bpftool net
> >     xdp:
> >
> >
> > While this patch only contains support for XDP, through `net
> > attach/detach`, bpftool can further support other prog attach types.
> >
> > XDP attach/detach tested on Mellanox ConnectX-4 and Netronome Agilio.
> >
> > ---
> > Changes in v5:
> >   - fix wrong error message, from errno to err with do_attach/detach
>
> The inconsistency in libbpf's error reporting is generally troubling,
> but a problem of this set, so:
>
> Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
>
> In the future please keep review tags if you have only made minor
> changes to the code.

Thank you for the review.
Sorry to bother you. I'll keep that in mind.

Thanks!

^ permalink raw reply

* Re: pull-request: bpf-next 2019-08-14
From: Daniel Borkmann @ 2019-08-14  8:21 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: davem, ast, andrii.nakryiko, netdev, bpf
In-Reply-To: <20190813165902.41da0730@cakuba.netronome.com>

On 8/14/19 1:59 AM, Jakub Kicinski wrote:
> On Wed, 14 Aug 2019 01:16:39 +0200, Daniel Borkmann wrote:
>> Hi David, hi Jakub,
>>
>> The following pull-request contains BPF updates for your *net-next* tree.
> 
> Pulled, let me know if I did it wrong 🤞

LGTM, thanks a lot! :-)

^ permalink raw reply

* [PATCH net-next v4 2/2] qed: Add driver API for flashing the config attributes.
From: Sudarsana Reddy Kalluru @ 2019-08-14  8:11 UTC (permalink / raw)
  To: davem; +Cc: netdev, mkalderon, aelior
In-Reply-To: <20190814081153.18889-1-skalluru@marvell.com>

The patch adds driver interface for reading the config attributes from user
provided buffer, and updates these values on nvm config flash partition.

This is basically an expansion of our existing ethtool -f implementation.
The management FW has exposed an additional method of configuring some of
the nvram options, and this makes use of that. This implementation will
come into use when newer FW files which contain configuration directives
employing this API will be provided to ethtool -f.

Signed-off-by: Sudarsana Reddy Kalluru <skalluru@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
---
 drivers/net/ethernet/qlogic/qed/qed_main.c | 68 ++++++++++++++++++++++++++++++
 include/linux/qed/qed_if.h                 |  1 +
 2 files changed, 69 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c
index e5ac8bd..0a76459 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_main.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_main.c
@@ -67,6 +67,8 @@
 #define QED_ROCE_QPS			(8192)
 #define QED_ROCE_DPIS			(8)
 #define QED_RDMA_SRQS                   QED_ROCE_QPS
+#define QED_NVM_CFG_SET_FLAGS		0xE
+#define QED_NVM_CFG_SET_PF_FLAGS	0x1E
 
 static char version[] =
 	"QLogic FastLinQ 4xxxx Core Module qed " DRV_MODULE_VERSION "\n";
@@ -2231,6 +2233,69 @@ static int qed_nvm_flash_image_validate(struct qed_dev *cdev,
 	return 0;
 }
 
+/* Binary file format -
+ *     /----------------------------------------------------------------------\
+ * 0B  |                       0x5 [command index]                            |
+ * 4B  | Entity ID     | Reserved        |  Number of config attributes       |
+ * 8B  | Config ID                       | Length        | Value              |
+ *     |                                                                      |
+ *     \----------------------------------------------------------------------/
+ * There can be several cfg_id-Length-Value sets as specified by 'Number of...'.
+ * Entity ID - A non zero entity value for which the config need to be updated.
+ *
+ * The API parses config attributes from the user provided buffer and flashes
+ * them to the respective NVM path using Management FW inerface.
+ */
+static int qed_nvm_flash_cfg_write(struct qed_dev *cdev, const u8 **data)
+{
+	struct qed_hwfn *hwfn = QED_LEADING_HWFN(cdev);
+	u8 entity_id, len, buf[32];
+	struct qed_ptt *ptt;
+	u16 cfg_id, count;
+	int rc = 0, i;
+	u32 flags;
+
+	ptt = qed_ptt_acquire(hwfn);
+	if (!ptt)
+		return -EAGAIN;
+
+	/* NVM CFG ID attribute header */
+	*data += 4;
+	entity_id = **data;
+	*data += 2;
+	count = *((u16 *)*data);
+	*data += 2;
+
+	DP_VERBOSE(cdev, NETIF_MSG_DRV,
+		   "Read config ids: entity id %02x num _attrs = %0d\n",
+		   entity_id, count);
+	/* NVM CFG ID attributes */
+	for (i = 0; i < count; i++) {
+		cfg_id = *((u16 *)*data);
+		*data += 2;
+		len = **data;
+		(*data)++;
+		memcpy(buf, *data, len);
+		*data += len;
+
+		flags = entity_id ? QED_NVM_CFG_SET_PF_FLAGS :
+			QED_NVM_CFG_SET_FLAGS;
+
+		DP_VERBOSE(cdev, NETIF_MSG_DRV,
+			   "cfg_id = %d len = %d\n", cfg_id, len);
+		rc = qed_mcp_nvm_set_cfg(hwfn, ptt, cfg_id, entity_id, flags,
+					 buf, len);
+		if (rc) {
+			DP_ERR(cdev, "Error %d configuring %d\n", rc, cfg_id);
+			break;
+		}
+	}
+
+	qed_ptt_release(hwfn, ptt);
+
+	return rc;
+}
+
 static int qed_nvm_flash(struct qed_dev *cdev, const char *name)
 {
 	const struct firmware *image;
@@ -2272,6 +2337,9 @@ static int qed_nvm_flash(struct qed_dev *cdev, const char *name)
 			rc = qed_nvm_flash_image_access(cdev, &data,
 							&check_resp);
 			break;
+		case QED_NVM_FLASH_CMD_NVM_CFG_ID:
+			rc = qed_nvm_flash_cfg_write(cdev, &data);
+			break;
 		default:
 			DP_ERR(cdev, "Unknown command %08x\n", cmd_type);
 			rc = -EINVAL;
diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h
index 2302136..e366399 100644
--- a/include/linux/qed/qed_if.h
+++ b/include/linux/qed/qed_if.h
@@ -804,6 +804,7 @@ enum qed_nvm_flash_cmd {
 	QED_NVM_FLASH_CMD_FILE_DATA = 0x2,
 	QED_NVM_FLASH_CMD_FILE_START = 0x3,
 	QED_NVM_FLASH_CMD_NVM_CHANGE = 0x4,
+	QED_NVM_FLASH_CMD_NVM_CFG_ID = 0x5,
 	QED_NVM_FLASH_CMD_NVM_MAX,
 };
 
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next v4 1/2] qed: Add API for configuring NVM attributes.
From: Sudarsana Reddy Kalluru @ 2019-08-14  8:11 UTC (permalink / raw)
  To: davem; +Cc: netdev, mkalderon, aelior
In-Reply-To: <20190814081153.18889-1-skalluru@marvell.com>

The patch adds API for configuring the NVM config attributes using
Management FW (MFW) interfaces.

Signed-off-by: Sudarsana Reddy Kalluru <skalluru@marvell.com>
Signed-off-by: Ariel Elior <aelior@marvell.com>
---
 drivers/net/ethernet/qlogic/qed/qed_hsi.h | 17 ++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.c | 32 +++++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h | 20 +++++++++++++++++++
 3 files changed, 69 insertions(+)

diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
index e054f6c..557a12e 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h
@@ -12580,6 +12580,8 @@ struct public_drv_mb {
 #define DRV_MSG_CODE_BW_UPDATE_ACK		0x32000000
 #define DRV_MSG_CODE_NIG_DRAIN			0x30000000
 #define DRV_MSG_CODE_S_TAG_UPDATE_ACK		0x3b000000
+#define DRV_MSG_CODE_GET_NVM_CFG_OPTION		0x003e0000
+#define DRV_MSG_CODE_SET_NVM_CFG_OPTION		0x003f0000
 #define DRV_MSG_CODE_INITIATE_PF_FLR            0x02010000
 #define DRV_MSG_CODE_VF_DISABLED_DONE		0xc0000000
 #define DRV_MSG_CODE_CFG_VF_MSIX		0xc0010000
@@ -12748,6 +12750,21 @@ struct public_drv_mb {
 #define DRV_MB_PARAM_FEATURE_SUPPORT_PORT_EEE		0x00000002
 #define DRV_MB_PARAM_FEATURE_SUPPORT_FUNC_VLINK		0x00010000
 
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ID_SHIFT		0
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ID_MASK		0x0000FFFF
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ALL_SHIFT		16
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ALL_MASK		0x00010000
+#define DRV_MB_PARAM_NVM_CFG_OPTION_INIT_SHIFT		17
+#define DRV_MB_PARAM_NVM_CFG_OPTION_INIT_MASK		0x00020000
+#define DRV_MB_PARAM_NVM_CFG_OPTION_COMMIT_SHIFT	18
+#define DRV_MB_PARAM_NVM_CFG_OPTION_COMMIT_MASK		0x00040000
+#define DRV_MB_PARAM_NVM_CFG_OPTION_FREE_SHIFT		19
+#define DRV_MB_PARAM_NVM_CFG_OPTION_FREE_MASK		0x00080000
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_SEL_SHIFT	20
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_SEL_MASK	0x00100000
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_ID_SHIFT	24
+#define DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_ID_MASK	0x0f000000
+
 	u32 fw_mb_header;
 #define FW_MSG_CODE_MASK			0xffff0000
 #define FW_MSG_CODE_UNSUPPORTED                 0x00000000
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
index 758702c..89462c4 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
@@ -3750,3 +3750,35 @@ int qed_mcp_get_ppfid_bitmap(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt)
 
 	return 0;
 }
+
+int qed_mcp_nvm_set_cfg(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
+			u16 option_id, u8 entity_id, u16 flags, u8 *p_buf,
+			u32 len)
+{
+	u32 mb_param = 0, resp, param;
+
+	QED_MFW_SET_FIELD(mb_param, DRV_MB_PARAM_NVM_CFG_OPTION_ID, option_id);
+	if (flags & QED_NVM_CFG_OPTION_ALL)
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_ALL, 1);
+	if (flags & QED_NVM_CFG_OPTION_INIT)
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_INIT, 1);
+	if (flags & QED_NVM_CFG_OPTION_COMMIT)
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_COMMIT, 1);
+	if (flags & QED_NVM_CFG_OPTION_FREE)
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_FREE, 1);
+	if (flags & QED_NVM_CFG_OPTION_ENTITY_SEL) {
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_SEL, 1);
+		QED_MFW_SET_FIELD(mb_param,
+				  DRV_MB_PARAM_NVM_CFG_OPTION_ENTITY_ID,
+				  entity_id);
+	}
+
+	return qed_mcp_nvm_wr_cmd(p_hwfn, p_ptt,
+				  DRV_MSG_CODE_SET_NVM_CFG_OPTION,
+				  mb_param, &resp, &param, len, (u32 *)p_buf);
+}
diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
index e4f8fe4..83649a8 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h
+++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h
@@ -251,6 +251,12 @@ struct qed_mfw_tlv_generic {
 	struct qed_mfw_tlv_iscsi iscsi;
 };
 
+#define QED_NVM_CFG_OPTION_ALL		BIT(0)
+#define QED_NVM_CFG_OPTION_INIT		BIT(1)
+#define QED_NVM_CFG_OPTION_COMMIT       BIT(2)
+#define QED_NVM_CFG_OPTION_FREE		BIT(3)
+#define QED_NVM_CFG_OPTION_ENTITY_SEL	BIT(4)
+
 /**
  * @brief - returns the link params of the hw function
  *
@@ -1202,4 +1208,18 @@ void qed_mcp_resc_lock_default_init(struct qed_resc_lock_params *p_lock,
  */
 int qed_mcp_get_ppfid_bitmap(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt);
 
+/**
+ * @brief Set NVM config attribute value.
+ *
+ * @param p_hwfn
+ * @param p_ptt
+ * @param option_id
+ * @param entity_id
+ * @param flags
+ * @param p_buf
+ * @param len
+ */
+int qed_mcp_nvm_set_cfg(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt,
+			u16 option_id, u8 entity_id, u16 flags, u8 *p_buf,
+			u32 len);
 #endif
-- 
1.8.3.1


^ permalink raw reply related

* [PATCH net-next v4 0/2] qed*: Support for NVM config attributes.
From: Sudarsana Reddy Kalluru @ 2019-08-14  8:11 UTC (permalink / raw)
  To: davem; +Cc: netdev, mkalderon, aelior

The patch series adds support for managing the NVM config attributes.
Patch (1) adds functionality to update config attributes via MFW.
Patch (2) adds driver interface for updating the config attributes.

Changes from previous versions:
-------------------------------
v4: Added more details on the functionality and its usage.
v3: Removed unused variable.
v2: Removed unused API.

Please consider applying this series to "net-next".


Sudarsana Reddy Kalluru (2):
  qed: Add API for configuring NVM attributes.
  qed: Add driver API for flashing the config attributes.

 drivers/net/ethernet/qlogic/qed/qed_hsi.h  | 17 ++++++++
 drivers/net/ethernet/qlogic/qed/qed_main.c | 68 ++++++++++++++++++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.c  | 32 ++++++++++++++
 drivers/net/ethernet/qlogic/qed/qed_mcp.h  | 20 +++++++++
 include/linux/qed/qed_if.h                 |  1 +
 5 files changed, 138 insertions(+)

-- 
1.8.3.1


^ permalink raw reply

* [PATCH 1/2] PTP: introduce new versions of IOCTLs
From: Felipe Balbi @ 2019-08-14  7:47 UTC (permalink / raw)
  To: Richard Cochran; +Cc: Christopher S Hall, netdev, linux-kernel, Felipe Balbi

The current version of the IOCTL have a small problem which prevents us
from extending the API by making use of reserved fields. In these new
IOCTLs, we are now making sure that flags and rsv fields are zero which
will allow us to extend the API in the future.

Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
---
 drivers/ptp/ptp_chardev.c      | 58 ++++++++++++++++++++++++++++++++--
 include/uapi/linux/ptp_clock.h | 12 +++++++
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index 18ffe449efdf..204212fc3f8c 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -123,9 +123,11 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 	struct timespec64 ts;
 	int enable, err = 0;
 
+	memset(&req, 0, sizeof(req));
 	switch (cmd) {
 
 	case PTP_CLOCK_GETCAPS:
+	case PTP_CLOCK_GETCAPS2:
 		memset(&caps, 0, sizeof(caps));
 		caps.max_adj = ptp->info->max_adj;
 		caps.n_alarm = ptp->info->n_alarm;
@@ -139,11 +141,22 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_EXTTS_REQUEST:
+	case PTP_EXTTS_REQUEST2:
 		if (copy_from_user(&req.extts, (void __user *)arg,
 				   sizeof(req.extts))) {
 			err = -EFAULT;
 			break;
 		}
+		if ((req.extts.flags || req.extts.rsv[0] || req.extts.rsv[1])
+			&& cmd == PTP_EXTTS_REQUEST2) {
+			err = -EINVAL;
+			break;
+		} else if (cmd == PTP_EXTTS_REQUEST) {
+			req.extts.flags = 0;
+			req.extts.rsv[0] = 0;
+			req.extts.rsv[1] = 0;
+		}
+			
 		if (req.extts.index >= ops->n_ext_ts) {
 			err = -EINVAL;
 			break;
@@ -154,11 +167,24 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_PEROUT_REQUEST:
+	case PTP_PEROUT_REQUEST2:
 		if (copy_from_user(&req.perout, (void __user *)arg,
 				   sizeof(req.perout))) {
 			err = -EFAULT;
 			break;
 		}
+		if ((req.perout.flags || req.perout.rsv[0] || req.perout.rsv[1]
+				|| req.perout.rsv[2] || req.perout.rsv[3])
+			&& cmd == PTP_PEROUT_REQUEST2) {
+			err = -EINVAL;
+			break;
+		} else if (cmd == PTP_PEROUT_REQUEST) {
+			req.perout.flags = 0;
+			req.perout.rsv[0] = 0;
+			req.perout.rsv[1] = 0;
+			req.perout.rsv[2] = 0;
+			req.perout.rsv[3] = 0;
+		}
 		if (req.perout.index >= ops->n_per_out) {
 			err = -EINVAL;
 			break;
@@ -169,6 +195,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_ENABLE_PPS:
+	case PTP_ENABLE_PPS2:
 		if (!capable(CAP_SYS_TIME))
 			return -EPERM;
 		req.type = PTP_CLK_REQ_PPS;
@@ -177,6 +204,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_SYS_OFFSET_PRECISE:
+	case PTP_SYS_OFFSET_PRECISE2:
 		if (!ptp->info->getcrosststamp) {
 			err = -EOPNOTSUPP;
 			break;
@@ -201,6 +229,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_SYS_OFFSET_EXTENDED:
+	case PTP_SYS_OFFSET_EXTENDED2:
 		if (!ptp->info->gettimex64) {
 			err = -EOPNOTSUPP;
 			break;
@@ -232,6 +261,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		break;
 
 	case PTP_SYS_OFFSET:
+	case PTP_SYS_OFFSET2:
 		sysoff = memdup_user((void __user *)arg, sizeof(*sysoff));
 		if (IS_ERR(sysoff)) {
 			err = PTR_ERR(sysoff);
@@ -265,11 +295,23 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 			err = -EFAULT;
 		break;
 
-	case PTP_PIN_GETFUNC:
+	case PTP_PIN_GETFUNC2:
 		if (copy_from_user(&pd, (void __user *)arg, sizeof(pd))) {
 			err = -EFAULT;
 			break;
 		}
+		if ((pd.rsv[0] || pd.rsv[1] || pd.rsv[2]
+				|| pd.rsv[3] || pd.rsv[4])
+			&& cmd == PTP_PIN_GETFUNC2) {
+			err = -EINVAL;
+			break;
+		} else if (cmd == PTP_PIN_GETFUNC) {
+			pd.rsv[0] = 0;
+			pd.rsv[1] = 0;
+			pd.rsv[2] = 0;
+			pd.rsv[3] = 0;
+			pd.rsv[4] = 0;
+		}
 		pin_index = pd.index;
 		if (pin_index >= ops->n_pins) {
 			err = -EINVAL;
@@ -284,11 +326,23 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 			err = -EFAULT;
 		break;
 
-	case PTP_PIN_SETFUNC:
+	case PTP_PIN_SETFUNC2:
 		if (copy_from_user(&pd, (void __user *)arg, sizeof(pd))) {
 			err = -EFAULT;
 			break;
 		}
+		if ((pd.rsv[0] || pd.rsv[1] || pd.rsv[2]
+				|| pd.rsv[3] || pd.rsv[4])
+			&& cmd == PTP_PIN_SETFUNC2) {
+			err = -EINVAL;
+			break;
+		} else if (cmd == PTP_PIN_SETFUNC) {
+			pd.rsv[0] = 0;
+			pd.rsv[1] = 0;
+			pd.rsv[2] = 0;
+			pd.rsv[3] = 0;
+			pd.rsv[4] = 0;
+		}
 		pin_index = pd.index;
 		if (pin_index >= ops->n_pins) {
 			err = -EINVAL;
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index 1bc794ad957a..039cd62ec706 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -149,6 +149,18 @@ struct ptp_pin_desc {
 #define PTP_SYS_OFFSET_EXTENDED \
 	_IOWR(PTP_CLK_MAGIC, 9, struct ptp_sys_offset_extended)
 
+#define PTP_CLOCK_GETCAPS2  _IOR(PTP_CLK_MAGIC, 10, struct ptp_clock_caps)
+#define PTP_EXTTS_REQUEST2  _IOW(PTP_CLK_MAGIC, 11, struct ptp_extts_request)
+#define PTP_PEROUT_REQUEST2 _IOW(PTP_CLK_MAGIC, 12, struct ptp_perout_request)
+#define PTP_ENABLE_PPS2     _IOW(PTP_CLK_MAGIC, 13, int)
+#define PTP_SYS_OFFSET2     _IOW(PTP_CLK_MAGIC, 14, struct ptp_sys_offset)
+#define PTP_PIN_GETFUNC2    _IOWR(PTP_CLK_MAGIC, 15, struct ptp_pin_desc)
+#define PTP_PIN_SETFUNC2    _IOW(PTP_CLK_MAGIC, 16, struct ptp_pin_desc)
+#define PTP_SYS_OFFSET_PRECISE2 \
+	_IOWR(PTP_CLK_MAGIC, 17, struct ptp_sys_offset_precise)
+#define PTP_SYS_OFFSET_EXTENDED2 \
+	_IOWR(PTP_CLK_MAGIC, 18, struct ptp_sys_offset_extended)
+
 struct ptp_extts_event {
 	struct ptp_clock_time t; /* Time event occured. */
 	unsigned int index;      /* Which channel produced the event. */
-- 
2.22.0


^ permalink raw reply related

* [PATCH 2/2] PTP: add support for one-shot output
From: Felipe Balbi @ 2019-08-14  7:47 UTC (permalink / raw)
  To: Richard Cochran; +Cc: Christopher S Hall, netdev, linux-kernel, Felipe Balbi
In-Reply-To: <20190814074712.10684-1-felipe.balbi@linux.intel.com>

Some controllers allow for a one-shot output pulse, in contrast to
periodic output. Now that we have extensible versions of our IOCTLs, we
can finally make use of the 'flags' field to pass a bit telling driver
that if we want one-shot pulse output.

Signed-off-by: Felipe Balbi <felipe.balbi@linux.intel.com>
---
 drivers/ptp/ptp_chardev.c      | 5 ++---
 include/uapi/linux/ptp_clock.h | 4 +++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index 204212fc3f8c..b75a65880056 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -173,9 +173,8 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 			err = -EFAULT;
 			break;
 		}
-		if ((req.perout.flags || req.perout.rsv[0] || req.perout.rsv[1]
-				|| req.perout.rsv[2] || req.perout.rsv[3])
-			&& cmd == PTP_PEROUT_REQUEST2) {
+		if ((req.perout.rsv[0] || req.perout.rsv[1] || req.perout.rsv[2]
+			|| req.perout.rsv[3]) && cmd == PTP_PEROUT_REQUEST2) {
 			err = -EINVAL;
 			break;
 		} else if (cmd == PTP_PEROUT_REQUEST) {
diff --git a/include/uapi/linux/ptp_clock.h b/include/uapi/linux/ptp_clock.h
index 039cd62ec706..9412b16cc8ed 100644
--- a/include/uapi/linux/ptp_clock.h
+++ b/include/uapi/linux/ptp_clock.h
@@ -67,7 +67,9 @@ struct ptp_perout_request {
 	struct ptp_clock_time start;  /* Absolute start time. */
 	struct ptp_clock_time period; /* Desired period, zero means disable. */
 	unsigned int index;           /* Which channel to configure. */
-	unsigned int flags;           /* Reserved for future use. */
+
+#define PTP_PEROUT_ONE_SHOT BIT(0)
+	unsigned int flags;           /* Bit 0 -> oneshot output. */
 	unsigned int rsv[4];          /* Reserved for future use. */
 };
 
-- 
2.22.0


^ permalink raw reply related

* Re: [patch net-next] selftests: netdevsim: add devlink params tests
From: Jiri Pirko @ 2019-08-14  7:47 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: netdev, davem, mlxsw
In-Reply-To: <20190813154108.30509472@cakuba.netronome.com>

Wed, Aug 14, 2019 at 12:41:08AM CEST, jakub.kicinski@netronome.com wrote:
>On Tue, 13 Aug 2019 15:04:46 +0200, Jiri Pirko wrote:
>> From: Jiri Pirko <jiri@mellanox.com>
>> 
>> Test recently added netdevsim devlink param implementation.
>> 
>> Signed-off-by: Jiri Pirko <jiri@mellanox.com>
>
>Thanks for the test, but it doesn't pass here:
>
>TEST: fw flash test                                                 [ OK ]
>TEST: params test                                                   [FAIL]
>	Failed to get test1 param value

Interesting. Fors for me correctly. When I run it manually, I get this:
bash-5.0# devlink dev param show netdevsim/netdevsim11 name test1 -j | jq -e -r '.[][][].values[] | select(.cmode == "driverinit").value'
true
bash-5.0# echo $?
0
bash-5.0# devlink dev param set netdevsim/netdevsim11 name test1 cmode driverinit value false
bash-5.0# devlink dev param show netdevsim/netdevsim11 name test1 -j | jq -e -r '.[][][].values[] | select(.cmode == "driverinit").value'
false
bash-5.0# echo $?
0




>
>> diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
>> index 9d8baf5d14b3..858ebdc8d8a3 100755
>> --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
>> +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh
>> @@ -3,7 +3,7 @@
>>  
>>  lib_dir=$(dirname $0)/../../../net/forwarding
>>  
>> -ALL_TESTS="fw_flash_test"
>> +ALL_TESTS="fw_flash_test params_test"
>>  NUM_NETIFS=0
>>  source $lib_dir/lib.sh
>>  
>> @@ -30,6 +30,66 @@ fw_flash_test()
>>  	log_test "fw flash test"
>>  }
>>  
>> +param_get()
>> +{
>> +	local name=$1
>> +
>> +	devlink dev param show $DL_HANDLE name $name -j | \
>> +		jq -e -r '.[][][].values[] | select(.cmode == "driverinit").value'
>
>                   ^^
>
>The -e makes jq set exit code to 1 when test1 param is false.
>
>Quoting the man page:
>
>       ·   -e / --exit-status:
>
>           Sets the exit status of jq to 0 if the last output values
>           was neither false nor null, 1 if the last output value was
>           either false or  null,  or  4  if  no valid  result  was
>           ever produced. Normally jq exits with 2 if there was any
>           usage problem or system error, 3 if there was a jq program
>           compile error, or 0 if the jq program ran.
>
>Without the -e all is well:

Not really, for non-existent param the return value would be wrong:
bash-5.0# devlink dev param show netdevsim/netdevsim11 name test2 -j | jq -e -r '.[][][].values[] | select(.cmode == "driverinit").value'
devlink answers: Invalid argument
bash-5.0# echo $?
4
bash-5.0# devlink dev param show netdevsim/netdevsim11 name test2 -j | jq -r '.[][][].values[] | select(.cmode == "driverinit").value'
devlink answers: Invalid argument
bash-5.0# echo $?
0

The return value is 0 like everyone is fine. You probably have a
different jq version (1.6). Looks like I need to use the same
workaround I have in tools/testing/selftests/net/forwarding/tc_common.sh.
I thought that -e would avoid that.



>
># ./devlink.sh 
>TEST: fw flash test                                                 [ OK ]
>TEST: params test                                                   [ OK ]
>
>> +}
>> +

^ permalink raw reply

* Re: [RFC PATCH bpf-next 00/14] xdp_flow: Flow offload to XDP
From: Toshiaki Makita @ 2019-08-14  7:33 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
	Yonghong Song, David S. Miller, Jakub Kicinski,
	Jesper Dangaard Brouer, John Fastabend, Jamal Hadi Salim,
	Cong Wang, Jiri Pirko, netdev, bpf, William Tu
In-Reply-To: <20190814014445.3dnduyrass5jycr5@ast-mbp>

Hi Alexei, thank you for taking a look!

On 2019/08/14 10:44, Alexei Starovoitov wrote:
> On Tue, Aug 13, 2019 at 09:05:44PM +0900, Toshiaki Makita wrote:
>> This is a rough PoC for an idea to offload TC flower to XDP.
> ...
>>   xdp_flow  TC        ovs kmod
>>   --------  --------  --------
>>   4.0 Mpps  1.1 Mpps  1.1 Mpps
> 
> Is xdp_flow limited to 4 Mpps due to veth or something else?

Looking at perf, accumulation of each layer's overhead resulted in the number.
With XDP prog which only redirects packets and does not touch the data,
the drop rate is 10 Mpps. In this case the main overhead is XDP's redirect processing
and handling of 2 XDP progs (in veth and i40e).
In the xdp_flow test the overhead additionally includes flow key parse in XDP prog
and hash table lookup (including jhash calculation) which resulted in 4 Mpps.

>> So xdp_flow drop rate is roughly 4x faster than software TC or ovs kmod.
>>
>> OTOH the time to add a flow increases with xdp_flow.
>>
>> ping latency of first packet when veth1 does XDP_PASS instead of DROP:
>>
>>   xdp_flow  TC        ovs kmod
>>   --------  --------  --------
>>   25ms      12ms      0.6ms
>>
>> xdp_flow does a lot of work to emulate TC behavior including UMH
>> transaction and multiple bpf map update from UMH which I think increases
>> the latency.
> 
> make sense, but why vanilla TC is so slow ?

No ideas. At least TC requires additional syscall to insert a flow compared to ovs kmod,
but 12 ms looks too long for that.

>> * Implementation
>>
>> xdp_flow makes use of UMH to load an eBPF program for XDP, similar to
>> bpfilter. The difference is that xdp_flow does not generate the eBPF
>> program dynamically but a prebuilt program is embedded in UMH. This is
>> mainly because flow insertion is considerably frequent. If we generate
>> and load an eBPF program on each insertion of a flow, the latency of the
>> first packet of ping in above test will incease, which I want to avoid.
> 
> I think UMH approach is a good fit for this.
> Clearly the same algorithm can be done as kernel code or kernel module, but
> bpfilter-like UMH is a safer approach.
> 
>> - patch 9
>>   Add tc-offload-xdp netdev feature and hooks to call xdp_flow kmod in
>>   TC flower offload code.
> 
> The hook into UMH from TC looks simple. Do you expect the same interface to be
> reused from OVS ?

Do you mean openvswitch kernel module by OVS?
If so, no, at this point. TC hook is simple because I reused flow offload mechanism.
OVS kmod does not have offload interface and ovs-vswitchd is using TC for offload.
I wanted to reuse this mechanism for offloading to XDP, so using TC.

>> * About alternative userland (ovs-vswitchd etc.) implementation
>>
>> Maybe a similar logic can be implemented in ovs-vswitchd offload
>> mechanism, instead of adding code to kernel. I just thought offloading
>> TC is more generic and allows wider usage with direct TC command.
>>
>> For example, considering that OVS inserts a flow to kernel only when
>> flow miss happens in kernel, we can in advance add offloaded flows via
>> tc filter to avoid flow insertion latency for certain sensitive flows.
>> TC flower usage without using OVS is also possible.
>>
>> Also as written above nftables can be offloaded to XDP with this
>> mechanism as well.
> 
> Makes sense to me.
> 
>>    bpf, hashtab: Compare keys in long
> 
> 3Mpps vs 4Mpps just from this patch ?
> or combined with i40 prefech patch ?

Combined.

>>   drivers/net/ethernet/intel/i40e/i40e_txrx.c  |    1 +
> 
> Could you share "perf report" for just hash tab optimization
> and for i40 ?

Sure, I'll get some more data and post them.

> I haven't seen memcmp to be bottle neck in hash tab.
> What is the the of the key?

typo of "size of the key"? IIRC 64 bytes.

Toshiaki Makita

^ permalink raw reply

* [PATCH bpf-next v4 8/8] net/mlx5e: Add AF_XDP need_wakeup support
From: Magnus Karlsson @ 2019-08-14  7:27 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, brouer,
	maximmi
  Cc: bpf, bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	jonathan.lemon, kiran.patil, axboe, maciej.fijalkowski,
	maciejromanfijalkowski, intel-wired-lan
In-Reply-To: <1565767643-4908-1-git-send-email-magnus.karlsson@intel.com>

From: Maxim Mikityanskiy <maximmi@mellanox.com>

This commit adds support for the new need_wakeup feature of AF_XDP. The
applications can opt-in by using the XDP_USE_NEED_WAKEUP bind() flag.
When this feature is enabled, some behavior changes:

RX side: If the Fill Ring is empty, instead of busy-polling, set the
flag to tell the application to kick the driver when it refills the Fill
Ring.

TX side: If there are pending completions or packets queued for
transmission, set the flag to tell the application that it can skip the
sendto() syscall and save time.

The performance testing was performed on a machine with the following
configuration:

- 24 cores of Intel Xeon E5-2620 v3 @ 2.40 GHz
- Mellanox ConnectX-5 Ex with 100 Gbit/s link

The results with retpoline disabled:

       | without need_wakeup  | with need_wakeup     |
       |----------------------|----------------------|
       | one core | two cores | one core | two cores |
-------|----------|-----------|----------|-----------|
txonly | 20.1     | 33.5      | 29.0     | 34.2      |
rxdrop | 0.065    | 14.1      | 12.0     | 14.1      |
l2fwd  | 0.032    | 7.3       | 6.6      | 7.2       |

"One core" means the application and NAPI run on the same core. "Two
cores" means they are pinned to different cores.

Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h | 14 ++++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h | 12 ++++++++++++
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c     |  7 +++++--
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c   | 20 +++++++++++++++++---
 4 files changed, 48 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
index 307b923..cab0e93 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.h
@@ -5,6 +5,7 @@
 #define __MLX5_EN_XSK_RX_H__
 
 #include "en.h"
+#include <net/xdp_sock.h>
 
 /* RX data path */
 
@@ -24,4 +25,17 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 					      struct mlx5e_wqe_frag_info *wi,
 					      u32 cqe_bcnt);
 
+static inline bool mlx5e_xsk_update_rx_wakeup(struct mlx5e_rq *rq, bool alloc_err)
+{
+	if (!xsk_umem_uses_need_wakeup(rq->umem))
+		return alloc_err;
+
+	if (unlikely(alloc_err))
+		xsk_set_rx_need_wakeup(rq->umem);
+	else
+		xsk_clear_rx_need_wakeup(rq->umem);
+
+	return false;
+}
+
 #endif /* __MLX5_EN_XSK_RX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
index 9c50515..79b487d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.h
@@ -5,6 +5,7 @@
 #define __MLX5_EN_XSK_TX_H__
 
 #include "en.h"
+#include <net/xdp_sock.h>
 
 /* TX data path */
 
@@ -12,4 +13,15 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags);
 
 bool mlx5e_xsk_tx(struct mlx5e_xdpsq *sq, unsigned int budget);
 
+static inline void mlx5e_xsk_update_tx_wakeup(struct mlx5e_xdpsq *sq)
+{
+	if (!xsk_umem_uses_need_wakeup(sq->umem))
+		return;
+
+	if (sq->pc != sq->cc)
+		xsk_clear_tx_need_wakeup(sq->umem);
+	else
+		xsk_set_tx_need_wakeup(sq->umem);
+}
+
 #endif /* __MLX5_EN_XSK_TX_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 60570b4..fae0694 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -692,8 +692,11 @@ bool mlx5e_post_rx_mpwqes(struct mlx5e_rq *rq)
 	rq->mpwqe.umr_in_progress += rq->mpwqe.umr_last_bulk;
 	rq->mpwqe.actual_wq_head   = head;
 
-	/* If XSK Fill Ring doesn't have enough frames, busy poll by
-	 * rescheduling the NAPI poll.
+	/* If XSK Fill Ring doesn't have enough frames, report the error, so
+	 * that one of the actions can be performed:
+	 * 1. If need_wakeup is used, signal that the application has to kick
+	 * the driver when it refills the Fill Ring.
+	 * 2. Otherwise, busy poll by rescheduling the NAPI poll.
 	 */
 	if (unlikely(alloc_err == -ENOMEM && rq->umem))
 		return true;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 6d16dee..257a7c9 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -33,6 +33,7 @@
 #include <linux/irq.h>
 #include "en.h"
 #include "en/xdp.h"
+#include "en/xsk/rx.h"
 #include "en/xsk/tx.h"
 
 static inline bool mlx5e_channel_no_affinity_change(struct mlx5e_channel *c)
@@ -83,10 +84,23 @@ void mlx5e_trigger_irq(struct mlx5e_icosq *sq)
 
 static bool mlx5e_napi_xsk_post(struct mlx5e_xdpsq *xsksq, struct mlx5e_rq *xskrq)
 {
-	bool busy_xsk = false;
-
+	bool busy_xsk = false, xsk_rx_alloc_err;
+
+	/* Handle the race between the application querying need_wakeup and the
+	 * driver setting it:
+	 * 1. Update need_wakeup both before and after the TX. If it goes to
+	 * "yes", it can only happen with the first update.
+	 * 2. If the application queried need_wakeup before we set it, the
+	 * packets will be transmitted anyway, even w/o a wakeup.
+	 * 3. Give a chance to clear need_wakeup after new packets were queued
+	 * for TX.
+	 */
+	mlx5e_xsk_update_tx_wakeup(xsksq);
 	busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
-	busy_xsk |= xskrq->post_wqes(xskrq);
+	mlx5e_xsk_update_tx_wakeup(xsksq);
+
+	xsk_rx_alloc_err = xskrq->post_wqes(xskrq);
+	busy_xsk |= mlx5e_xsk_update_rx_wakeup(xskrq, xsk_rx_alloc_err);
 
 	return busy_xsk;
 }
-- 
2.7.4


^ permalink raw reply related

* [PATCH bpf-next v4 7/8] net/mlx5e: Move the SW XSK code from NAPI poll to a separate function
From: Magnus Karlsson @ 2019-08-14  7:27 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, brouer,
	maximmi
  Cc: bpf, bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	jonathan.lemon, kiran.patil, axboe, maciej.fijalkowski,
	maciejromanfijalkowski, intel-wired-lan
In-Reply-To: <1565767643-4908-1-git-send-email-magnus.karlsson@intel.com>

From: Maxim Mikityanskiy <maximmi@mellanox.com>

Two XSK tasks are performed during NAPI polling, that are not bound to
hardware interrupts: TXing packets and polling for frames in the Fill
Ring. They are special in a way that the hardware doesn't know about
these tasks, so it doesn't trigger interrupts if there is still some
work to be done, it's our driver's responsibility to ensure NAPI will be
rescheduled if needed.

Create a new function to handle these tasks and move the corresponding
code from mlx5e_napi_poll to the new function to improve modularity and
prepare for the changes in the following patch.

Signed-off-by: Maxim Mikityanskiy <maximmi@mellanox.com>
Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
Reviewed-by: Saeed Mahameed <saeedm@mellanox.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
index 49b06b2..6d16dee 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_txrx.c
@@ -81,6 +81,16 @@ void mlx5e_trigger_irq(struct mlx5e_icosq *sq)
 	mlx5e_notify_hw(wq, sq->pc, sq->uar_map, &nopwqe->ctrl);
 }
 
+static bool mlx5e_napi_xsk_post(struct mlx5e_xdpsq *xsksq, struct mlx5e_rq *xskrq)
+{
+	bool busy_xsk = false;
+
+	busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
+	busy_xsk |= xskrq->post_wqes(xskrq);
+
+	return busy_xsk;
+}
+
 int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 {
 	struct mlx5e_channel *c = container_of(napi, struct mlx5e_channel,
@@ -122,8 +132,7 @@ int mlx5e_napi_poll(struct napi_struct *napi, int budget)
 	if (xsk_open) {
 		mlx5e_poll_ico_cq(&c->xskicosq.cq);
 		busy |= mlx5e_poll_xdpsq_cq(&xsksq->cq);
-		busy_xsk |= mlx5e_xsk_tx(xsksq, MLX5E_TX_XSK_POLL_BUDGET);
-		busy_xsk |= xskrq->post_wqes(xskrq);
+		busy_xsk |= mlx5e_napi_xsk_post(xsksq, xskrq);
 	}
 
 	busy |= busy_xsk;
-- 
2.7.4


^ permalink raw reply related

* [PATCH bpf-next v4 6/8] samples/bpf: add use of need_wakeup flag in xdpsock
From: Magnus Karlsson @ 2019-08-14  7:27 UTC (permalink / raw)
  To: magnus.karlsson, bjorn.topel, ast, daniel, netdev, brouer,
	maximmi
  Cc: bpf, bruce.richardson, ciara.loftus, jakub.kicinski, xiaolong.ye,
	qi.z.zhang, sridhar.samudrala, kevin.laatz, ilias.apalodimas,
	jonathan.lemon, kiran.patil, axboe, maciej.fijalkowski,
	maciejromanfijalkowski, intel-wired-lan
In-Reply-To: <1565767643-4908-1-git-send-email-magnus.karlsson@intel.com>

This commit adds using the need_wakeup flag to the xdpsock sample
application. It is turned on by default as we think it is a feature
that seems to always produce a performance benefit, if the application
has been written taking advantage of it. It can be turned off in the
sample app by using the '-m' command line option.

The txpush and l2fwd sub applications have also been updated to
support poll() with multiple sockets.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
---
 samples/bpf/xdpsock_user.c | 192 ++++++++++++++++++++++++++++-----------------
 1 file changed, 120 insertions(+), 72 deletions(-)

diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c
index 93eaaf7..da84c76 100644
--- a/samples/bpf/xdpsock_user.c
+++ b/samples/bpf/xdpsock_user.c
@@ -67,8 +67,10 @@ static int opt_ifindex;
 static int opt_queue;
 static int opt_poll;
 static int opt_interval = 1;
-static u32 opt_xdp_bind_flags;
+static u32 opt_xdp_bind_flags = XDP_USE_NEED_WAKEUP;
 static int opt_xsk_frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
+static int opt_timeout = 1000;
+static bool opt_need_wakeup = true;
 static __u32 prog_id;
 
 struct xsk_umem_info {
@@ -352,6 +354,7 @@ static struct option long_options[] = {
 	{"zero-copy", no_argument, 0, 'z'},
 	{"copy", no_argument, 0, 'c'},
 	{"frame-size", required_argument, 0, 'f'},
+	{"no-need-wakeup", no_argument, 0, 'm'},
 	{0, 0, 0, 0}
 };
 
@@ -372,6 +375,7 @@ static void usage(const char *prog)
 		"  -z, --zero-copy      Force zero-copy mode.\n"
 		"  -c, --copy           Force copy mode.\n"
 		"  -f, --frame-size=n   Set the frame size (must be a power of two, default is %d).\n"
+		"  -m, --no-need-wakeup Turn off use of driver need wakeup flag.\n"
 		"\n";
 	fprintf(stderr, str, prog, XSK_UMEM__DEFAULT_FRAME_SIZE);
 	exit(EXIT_FAILURE);
@@ -384,8 +388,9 @@ static void parse_command_line(int argc, char **argv)
 	opterr = 0;
 
 	for (;;) {
-		c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:", long_options,
-				&option_index);
+
+		c = getopt_long(argc, argv, "Frtli:q:psSNn:czf:m",
+				long_options, &option_index);
 		if (c == -1)
 			break;
 
@@ -429,6 +434,9 @@ static void parse_command_line(int argc, char **argv)
 			break;
 		case 'f':
 			opt_xsk_frame_size = atoi(optarg);
+		case 'm':
+			opt_need_wakeup = false;
+			opt_xdp_bind_flags &= ~XDP_USE_NEED_WAKEUP;
 			break;
 		default:
 			usage(basename(argv[0]));
@@ -459,7 +467,8 @@ static void kick_tx(struct xsk_socket_info *xsk)
 	exit_with_error(errno);
 }
 
-static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
+static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk,
+				     struct pollfd *fds)
 {
 	u32 idx_cq = 0, idx_fq = 0;
 	unsigned int rcvd;
@@ -468,7 +477,9 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
 	if (!xsk->outstanding_tx)
 		return;
 
-	kick_tx(xsk);
+	if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
+		kick_tx(xsk);
+
 	ndescs = (xsk->outstanding_tx > BATCH_SIZE) ? BATCH_SIZE :
 		xsk->outstanding_tx;
 
@@ -482,6 +493,8 @@ static inline void complete_tx_l2fwd(struct xsk_socket_info *xsk)
 		while (ret != rcvd) {
 			if (ret < 0)
 				exit_with_error(-ret);
+			if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+				ret = poll(fds, num_socks, opt_timeout);
 			ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd,
 						     &idx_fq);
 		}
@@ -505,7 +518,8 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
 	if (!xsk->outstanding_tx)
 		return;
 
-	kick_tx(xsk);
+	if (!opt_need_wakeup || xsk_ring_prod__needs_wakeup(&xsk->tx))
+		kick_tx(xsk);
 
 	rcvd = xsk_ring_cons__peek(&xsk->umem->cq, BATCH_SIZE, &idx);
 	if (rcvd > 0) {
@@ -515,20 +529,25 @@ static inline void complete_tx_only(struct xsk_socket_info *xsk)
 	}
 }
 
-static void rx_drop(struct xsk_socket_info *xsk)
+static void rx_drop(struct xsk_socket_info *xsk, struct pollfd *fds)
 {
 	unsigned int rcvd, i;
 	u32 idx_rx = 0, idx_fq = 0;
 	int ret;
 
 	rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
-	if (!rcvd)
+	if (!rcvd) {
+		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+			ret = poll(fds, num_socks, opt_timeout);
 		return;
+	}
 
 	ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
 	while (ret != rcvd) {
 		if (ret < 0)
 			exit_with_error(-ret);
+		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+			ret = poll(fds, num_socks, opt_timeout);
 		ret = xsk_ring_prod__reserve(&xsk->umem->fq, rcvd, &idx_fq);
 	}
 
@@ -549,42 +568,65 @@ static void rx_drop(struct xsk_socket_info *xsk)
 static void rx_drop_all(void)
 {
 	struct pollfd fds[MAX_SOCKS + 1];
-	int i, ret, timeout, nfds = 1;
+	int i, ret;
 
 	memset(fds, 0, sizeof(fds));
 
 	for (i = 0; i < num_socks; i++) {
 		fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
 		fds[i].events = POLLIN;
-		timeout = 1000; /* 1sn */
 	}
 
 	for (;;) {
 		if (opt_poll) {
-			ret = poll(fds, nfds, timeout);
+			ret = poll(fds, num_socks, opt_timeout);
 			if (ret <= 0)
 				continue;
 		}
 
 		for (i = 0; i < num_socks; i++)
-			rx_drop(xsks[i]);
+			rx_drop(xsks[i], fds);
+	}
+}
+
+static void tx_only(struct xsk_socket_info *xsk, u32 frame_nb)
+{
+	u32 idx;
+
+	if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) == BATCH_SIZE) {
+		unsigned int i;
+
+		for (i = 0; i < BATCH_SIZE; i++) {
+			xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr	=
+				(frame_nb + i) << XSK_UMEM__DEFAULT_FRAME_SHIFT;
+			xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
+				sizeof(pkt_data) - 1;
+		}
+
+		xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
+		xsk->outstanding_tx += BATCH_SIZE;
+		frame_nb += BATCH_SIZE;
+		frame_nb %= NUM_FRAMES;
 	}
+
+	complete_tx_only(xsk);
 }
 
-static void tx_only(struct xsk_socket_info *xsk)
+static void tx_only_all(void)
 {
-	int timeout, ret, nfds = 1;
-	struct pollfd fds[nfds + 1];
-	u32 idx, frame_nb = 0;
+	struct pollfd fds[MAX_SOCKS];
+	u32 frame_nb[MAX_SOCKS] = {};
+	int i, ret;
 
 	memset(fds, 0, sizeof(fds));
-	fds[0].fd = xsk_socket__fd(xsk->xsk);
-	fds[0].events = POLLOUT;
-	timeout = 1000; /* 1sn */
+	for (i = 0; i < num_socks; i++) {
+		fds[0].fd = xsk_socket__fd(xsks[i]->xsk);
+		fds[0].events = POLLOUT;
+	}
 
 	for (;;) {
 		if (opt_poll) {
-			ret = poll(fds, nfds, timeout);
+			ret = poll(fds, num_socks, opt_timeout);
 			if (ret <= 0)
 				continue;
 
@@ -592,69 +634,75 @@ static void tx_only(struct xsk_socket_info *xsk)
 				continue;
 		}
 
-		if (xsk_ring_prod__reserve(&xsk->tx, BATCH_SIZE, &idx) ==
-		    BATCH_SIZE) {
-			unsigned int i;
-
-			for (i = 0; i < BATCH_SIZE; i++) {
-				xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->addr
-					= (frame_nb + i) * opt_xsk_frame_size;
-				xsk_ring_prod__tx_desc(&xsk->tx, idx + i)->len =
-					sizeof(pkt_data) - 1;
-			}
-
-			xsk_ring_prod__submit(&xsk->tx, BATCH_SIZE);
-			xsk->outstanding_tx += BATCH_SIZE;
-			frame_nb += BATCH_SIZE;
-			frame_nb %= NUM_FRAMES;
-		}
-
-		complete_tx_only(xsk);
+		for (i = 0; i < num_socks; i++)
+			tx_only(xsks[i], frame_nb[i]);
 	}
 }
 
-static void l2fwd(struct xsk_socket_info *xsk)
+static void l2fwd(struct xsk_socket_info *xsk, struct pollfd *fds)
 {
-	for (;;) {
-		unsigned int rcvd, i;
-		u32 idx_rx = 0, idx_tx = 0;
-		int ret;
+	unsigned int rcvd, i;
+	u32 idx_rx = 0, idx_tx = 0;
+	int ret;
 
-		for (;;) {
-			complete_tx_l2fwd(xsk);
+	complete_tx_l2fwd(xsk, fds);
 
-			rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE,
-						   &idx_rx);
-			if (rcvd > 0)
-				break;
-		}
+	rcvd = xsk_ring_cons__peek(&xsk->rx, BATCH_SIZE, &idx_rx);
+	if (!rcvd) {
+		if (xsk_ring_prod__needs_wakeup(&xsk->umem->fq))
+			ret = poll(fds, num_socks, opt_timeout);
+		return;
+	}
 
+	ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
+	while (ret != rcvd) {
+		if (ret < 0)
+			exit_with_error(-ret);
+		if (xsk_ring_prod__needs_wakeup(&xsk->tx))
+			kick_tx(xsk);
 		ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
-		while (ret != rcvd) {
-			if (ret < 0)
-				exit_with_error(-ret);
-			ret = xsk_ring_prod__reserve(&xsk->tx, rcvd, &idx_tx);
-		}
+	}
+
+	for (i = 0; i < rcvd; i++) {
+		u64 addr = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx)->addr;
+		u32 len = xsk_ring_cons__rx_desc(&xsk->rx, idx_rx++)->len;
+		char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
+
+		swap_mac_addresses(pkt);
 
-		for (i = 0; i < rcvd; i++) {
-			u64 addr = xsk_ring_cons__rx_desc(&xsk->rx,
-							  idx_rx)->addr;
-			u32 len = xsk_ring_cons__rx_desc(&xsk->rx,
-							 idx_rx++)->len;
-			char *pkt = xsk_umem__get_data(xsk->umem->buffer, addr);
+		hex_dump(pkt, len, addr);
+		xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
+		xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
+	}
 
-			swap_mac_addresses(pkt);
+	xsk_ring_prod__submit(&xsk->tx, rcvd);
+	xsk_ring_cons__release(&xsk->rx, rcvd);
 
-			hex_dump(pkt, len, addr);
-			xsk_ring_prod__tx_desc(&xsk->tx, idx_tx)->addr = addr;
-			xsk_ring_prod__tx_desc(&xsk->tx, idx_tx++)->len = len;
-		}
+	xsk->rx_npkts += rcvd;
+	xsk->outstanding_tx += rcvd;
+}
+
+static void l2fwd_all(void)
+{
+	struct pollfd fds[MAX_SOCKS];
+	int i, ret;
+
+	memset(fds, 0, sizeof(fds));
+
+	for (i = 0; i < num_socks; i++) {
+		fds[i].fd = xsk_socket__fd(xsks[i]->xsk);
+		fds[i].events = POLLOUT | POLLIN;
+	}
 
-		xsk_ring_prod__submit(&xsk->tx, rcvd);
-		xsk_ring_cons__release(&xsk->rx, rcvd);
+	for (;;) {
+		if (opt_poll) {
+			ret = poll(fds, num_socks, opt_timeout);
+			if (ret <= 0)
+				continue;
+		}
 
-		xsk->rx_npkts += rcvd;
-		xsk->outstanding_tx += rcvd;
+		for (i = 0; i < num_socks; i++)
+			l2fwd(xsks[i], fds);
 	}
 }
 
@@ -705,9 +753,9 @@ int main(int argc, char **argv)
 	if (opt_bench == BENCH_RXDROP)
 		rx_drop_all();
 	else if (opt_bench == BENCH_TXONLY)
-		tx_only(xsks[0]);
+		tx_only_all();
 	else
-		l2fwd(xsks[0]);
+		l2fwd_all();
 
 	return 0;
 }
-- 
2.7.4


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox