Netdev List

Netdev List
 help / color / mirror / Atom feed

* [bpf-next v2 PATCH 2/3] samples/bpf: make xdp_fwd more practically usable via devmap lookup
From: Jesper Dangaard Brouer @ 2019-08-08 15:46 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov
  Cc: a.s.protopopov, dsahern, Toke Høiland-Jørgensen,
	ys114321, Jesper Dangaard Brouer
In-Reply-To: <156527914510.20297.12225832190052744019.stgit@firesoul>

This address the TODO in samples/bpf/xdp_fwd_kern.c, which points out
that the chosen egress index should be checked for existence in the
devmap. This can now be done via taking advantage of Toke's work in
commit 0cdbb4b09a06 ("devmap: Allow map lookups from eBPF").

This change makes xdp_fwd more practically usable, as this allows for
a mixed environment, where IP-forwarding fallback to network stack, if
the egress device isn't configured to use XDP.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
---
 samples/bpf/xdp_fwd_kern.c |   17 +++++++++++------
 samples/bpf/xdp_fwd_user.c |   36 +++++++++++++++++++++++++-----------
 2 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
index e6ffc4ea06f4..a43d6953c054 100644
--- a/samples/bpf/xdp_fwd_kern.c
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -104,13 +104,18 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 
 	rc = bpf_fib_lookup(ctx, &fib_params, sizeof(fib_params), flags);
 
-	/* verify egress index has xdp support
-	 * TO-DO bpf_map_lookup_elem(&tx_port, &key) fails with
-	 *       cannot pass map_type 14 into func bpf_map_lookup_elem#1:
-	 * NOTE: without verification that egress index supports XDP
-	 *       forwarding packets are dropped.
-	 */
 	if (rc == 0) {
+		/* Verify egress index has been configured as TX-port.
+		 * (Note: User can still have inserted an egress ifindex that
+		 * doesn't support XDP xmit, which will result in packet drops).
+		 *
+		 * Note: lookup in devmap supported since 0cdbb4b09a0.
+		 * If not supported will fail with:
+		 *  cannot pass map_type 14 into func bpf_map_lookup_elem#1:
+		 */
+		if (!bpf_map_lookup_elem(&xdp_tx_ports, &fib_params.ifindex))
+			return XDP_PASS;
+
 		if (h_proto == htons(ETH_P_IP))
 			ip_decrease_ttl(iph);
 		else if (h_proto == htons(ETH_P_IPV6))
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
index ba012d9f93dd..20951bc27477 100644
--- a/samples/bpf/xdp_fwd_user.c
+++ b/samples/bpf/xdp_fwd_user.c
@@ -27,14 +27,20 @@
 #include "libbpf.h"
 #include <bpf/bpf.h>
 
-
-static int do_attach(int idx, int fd, const char *name)
+static int do_attach(int idx, int prog_fd, int map_fd, const char *name)
 {
 	int err;
 
-	err = bpf_set_link_xdp_fd(idx, fd, 0);
-	if (err < 0)
+	err = bpf_set_link_xdp_fd(idx, prog_fd, 0);
+	if (err < 0) {
 		printf("ERROR: failed to attach program to %s\n", name);
+		return err;
+	}
+
+	/* Adding ifindex as a possible egress TX port */
+	err = bpf_map_update_elem(map_fd, &idx, &idx, 0);
+	if (err)
+		printf("ERROR: failed using device %s as TX-port\n", name);
 
 	return err;
 }
@@ -47,6 +53,9 @@ static int do_detach(int idx, const char *name)
 	if (err < 0)
 		printf("ERROR: failed to detach program from %s\n", name);
 
+	/* TODO: Remember to cleanup map, when adding use of shared map
+	 *  bpf_map_delete_elem((map_fd, &idx);
+	 */
 	return err;
 }
 
@@ -67,10 +76,10 @@ int main(int argc, char **argv)
 	};
 	const char *prog_name = "xdp_fwd";
 	struct bpf_program *prog;
+	int prog_fd, map_fd = -1;
 	char filename[PATH_MAX];
 	struct bpf_object *obj;
 	int opt, i, idx, err;
-	int prog_fd, map_fd;
 	int attach = 1;
 	int ret = 0;
 
@@ -103,8 +112,17 @@ int main(int argc, char **argv)
 			return 1;
 		}
 
-		if (bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd))
+		err = bpf_prog_load_xattr(&prog_load_attr, &obj, &prog_fd);
+		if (err) {
+			if (err == -22) {
+				printf("Does kernel support devmap lookup?\n");
+				/* If not, the error message will be:
+				 * "cannot pass map_type 14 into func
+				 * bpf_map_lookup_elem#1"
+				 */
+			}
 			return 1;
+		}
 
 		prog = bpf_object__find_program_by_title(obj, prog_name);
 		prog_fd = bpf_program__fd(prog);
@@ -119,10 +137,6 @@ int main(int argc, char **argv)
 			return 1;
 		}
 	}
-	if (attach) {
-		for (i = 1; i < 64; ++i)
-			bpf_map_update_elem(map_fd, &i, &i, 0);
-	}
 
 	for (i = optind; i < argc; ++i) {
 		idx = if_nametoindex(argv[i]);
@@ -138,7 +152,7 @@ int main(int argc, char **argv)
 			if (err)
 				ret = err;
 		} else {
-			err = do_attach(idx, prog_fd, argv[i]);
+			err = do_attach(idx, prog_fd, map_fd, argv[i]);
 			if (err)
 				ret = err;
 		}


^ permalink raw reply related

* [bpf-next v2 PATCH 1/3] samples/bpf: xdp_fwd rename devmap name to be xdp_tx_ports
From: Jesper Dangaard Brouer @ 2019-08-08 15:46 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov
  Cc: a.s.protopopov, dsahern, Toke Høiland-Jørgensen,
	ys114321, Jesper Dangaard Brouer
In-Reply-To: <156527914510.20297.12225832190052744019.stgit@firesoul>

The devmap name 'tx_port' came from a copy-paste from xdp_redirect_map
which only have a single TX port. Change name to xdp_tx_ports
to make it more descriptive.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reviewed-by: David Ahern <dsahern@gmail.com>
Acked-by: Yonghong Song <yhs@fb.com>
---
 samples/bpf/xdp_fwd_kern.c |    5 +++--
 samples/bpf/xdp_fwd_user.c |    2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
index a7e94e7ff87d..e6ffc4ea06f4 100644
--- a/samples/bpf/xdp_fwd_kern.c
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -23,7 +23,8 @@
 
 #define IPV6_FLOWINFO_MASK              cpu_to_be32(0x0FFFFFFF)
 
-struct bpf_map_def SEC("maps") tx_port = {
+/* For TX-traffic redirect requires net_device ifindex to be in this devmap */
+struct bpf_map_def SEC("maps") xdp_tx_ports = {
 	.type = BPF_MAP_TYPE_DEVMAP,
 	.key_size = sizeof(int),
 	.value_size = sizeof(int),
@@ -117,7 +118,7 @@ static __always_inline int xdp_fwd_flags(struct xdp_md *ctx, u32 flags)
 
 		memcpy(eth->h_dest, fib_params.dmac, ETH_ALEN);
 		memcpy(eth->h_source, fib_params.smac, ETH_ALEN);
-		return bpf_redirect_map(&tx_port, fib_params.ifindex, 0);
+		return bpf_redirect_map(&xdp_tx_ports, fib_params.ifindex, 0);
 	}
 
 	return XDP_PASS;
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
index 5b46ee12c696..ba012d9f93dd 100644
--- a/samples/bpf/xdp_fwd_user.c
+++ b/samples/bpf/xdp_fwd_user.c
@@ -113,7 +113,7 @@ int main(int argc, char **argv)
 			return 1;
 		}
 		map_fd = bpf_map__fd(bpf_object__find_map_by_name(obj,
-								  "tx_port"));
+							"xdp_tx_ports"));
 		if (map_fd < 0) {
 			printf("map not found: %s\n", strerror(map_fd));
 			return 1;


^ permalink raw reply related

* [bpf-next v2 PATCH 0/3] bpf: improvements to xdp_fwd sample
From: Jesper Dangaard Brouer @ 2019-08-08 15:46 UTC (permalink / raw)
  To: netdev, Daniel Borkmann, Alexei Starovoitov
  Cc: a.s.protopopov, dsahern, Toke Høiland-Jørgensen,
	ys114321, Jesper Dangaard Brouer

V2: Addressed issues point out by Yonghong Song
 - Please ACK patch 2/3 again
 - Added ACKs and reviewed-by to other patches

This patchset is focused on improvements for XDP forwarding sample
named xdp_fwd, which leverage the existing FIB routing tables as
described in LPC2018[1] talk by David Ahern.

The primary motivation is to illustrate how Toke's recent work
improves usability of XDP_REDIRECT via lookups in devmap. The other
patches are to help users understand the sample.

I have more improvements to xdp_fwd, but those might requires changes
to libbpf.  Thus, sending these patches first as they are isolated.

[1] http://vger.kernel.org/lpc-networking2018.html#session-1

---

Jesper Dangaard Brouer (3):
      samples/bpf: xdp_fwd rename devmap name to be xdp_tx_ports
      samples/bpf: make xdp_fwd more practically usable via devmap lookup
      samples/bpf: xdp_fwd explain bpf_fib_lookup return codes

 samples/bpf/xdp_fwd_kern.c |   39 ++++++++++++++++++++++++++++++---------
 samples/bpf/xdp_fwd_user.c |   38 ++++++++++++++++++++++++++------------
 2 files changed, 56 insertions(+), 21 deletions(-)

--

^ permalink raw reply

* Re: [PATCH v2 08/15] net: phy: adin: make RMII fifo depth configurable
From: Andrew Lunn @ 2019-08-08 15:42 UTC (permalink / raw)
  To: Alexandru Ardelean
  Cc: netdev, devicetree, linux-kernel, davem, robh+dt, mark.rutland,
	f.fainelli, hkallweit1
In-Reply-To: <20190808123026.17382-9-alexandru.ardelean@analog.com>

On Thu, Aug 08, 2019 at 03:30:19PM +0300, Alexandru Ardelean wrote:
> The FIFO depth can be configured for the RMII mode. This change adds
> support for doing this via device-tree (or ACPI).
> 
> Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH v2 07/15] net: phy: adin: make RGMII internal delays configurable
From: Andrew Lunn @ 2019-08-08 15:40 UTC (permalink / raw)
  To: Alexandru Ardelean
  Cc: netdev, devicetree, linux-kernel, davem, robh+dt, mark.rutland,
	f.fainelli, hkallweit1
In-Reply-To: <20190808123026.17382-8-alexandru.ardelean@analog.com>

On Thu, Aug 08, 2019 at 03:30:18PM +0300, Alexandru Ardelean wrote:
> The internal delays for the RGMII are configurable for both RX & TX. This
> change adds support for configuring them via device-tree (or ACPI).
> 
> Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH v2 06/15] net: phy: adin: configure RGMII/RMII/MII modes on config
From: Andrew Lunn @ 2019-08-08 15:38 UTC (permalink / raw)
  To: Alexandru Ardelean
  Cc: netdev, devicetree, linux-kernel, davem, robh+dt, mark.rutland,
	f.fainelli, hkallweit1
In-Reply-To: <20190808123026.17382-7-alexandru.ardelean@analog.com>

On Thu, Aug 08, 2019 at 03:30:17PM +0300, Alexandru Ardelean wrote:
> The ADIN1300 chip supports RGMII, RMII & MII modes. Default (if
> unconfigured) is RGMII.
> This change adds support for configuring these modes via the device
> registers.
> 
> For RGMII with internal delays (modes RGMII_ID,RGMII_TXID, RGMII_RXID),
> the default delay is 2 ns. This can be configurable and will be done in
> a subsequent change.
> 
> Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH v2 05/15] net: phy: adin: add {write,read}_mmd hooks
From: Andrew Lunn @ 2019-08-08 15:35 UTC (permalink / raw)
  To: Alexandru Ardelean
  Cc: netdev, devicetree, linux-kernel, davem, robh+dt, mark.rutland,
	f.fainelli, hkallweit1
In-Reply-To: <20190808123026.17382-6-alexandru.ardelean@analog.com>

On Thu, Aug 08, 2019 at 03:30:16PM +0300, Alexandru Ardelean wrote:
> Both ADIN1200 & ADIN1300 support Clause 45 access.
> The Extended Management Interface (EMI) registers are accessible via both
> Clause 45 (at register MDIO_MMD_VEND1) and using Clause 22.
> 
> However, the Clause 22 MMD access operations differ from the implementation
> in the kernel, in the sense that it uses registers ExtRegPtr (0x10) &
> ExtRegData (0x11) to access Clause 45 & EMI registers.

It is not that they differ from what the kernel supports. Its that
they differ from what the Standard says they should use. These
registers are defined in 802.3, part of C22, and this hardware
implements the standard incorrectly.

	   Andrew

^ permalink raw reply

* Re: [PATCH bpf-next 1/3] bpf: support cloning sk storage on accept()
From: Stanislav Fomichev @ 2019-08-08 15:28 UTC (permalink / raw)
  To: Martin Lau
  Cc: Stanislav Fomichev, netdev@vger.kernel.org, bpf@vger.kernel.org,
	davem@davemloft.net, ast@kernel.org, daniel@iogearbox.net
In-Reply-To: <20190808063936.3p4ahtdkw35rrzqu@kafai-mbp>

On 08/08, Martin Lau wrote:
> On Wed, Aug 07, 2019 at 08:47:18AM -0700, Stanislav Fomichev wrote:
> > Add new helper bpf_sk_storage_clone which optionally clones sk storage
> > and call it from bpf_sk_storage_clone. Reuse the gap in
> > bpf_sk_storage_elem to store clone/non-clone flag.
> > 
> > Cc: Martin KaFai Lau <kafai@fb.com>
> > Signed-off-by: Stanislav Fomichev <sdf@google.com>
> > ---
> >  include/net/bpf_sk_storage.h |  10 ++++
> >  include/uapi/linux/bpf.h     |   1 +
> >  net/core/bpf_sk_storage.c    | 102 +++++++++++++++++++++++++++++++++--
> >  net/core/sock.c              |   9 ++--
> >  4 files changed, 115 insertions(+), 7 deletions(-)
> > 
> > diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h
> > index b9dcb02e756b..8e4f831d2e52 100644
> > --- a/include/net/bpf_sk_storage.h
> > +++ b/include/net/bpf_sk_storage.h
> > @@ -10,4 +10,14 @@ void bpf_sk_storage_free(struct sock *sk);
> >  extern const struct bpf_func_proto bpf_sk_storage_get_proto;
> >  extern const struct bpf_func_proto bpf_sk_storage_delete_proto;
> >  
> > +#ifdef CONFIG_BPF_SYSCALL
> > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk);
> > +#else
> > +static inline int bpf_sk_storage_clone(const struct sock *sk,
> > +				       struct sock *newsk)
> > +{
> > +	return 0;
> > +}
> > +#endif
> > +
> >  #endif /* _BPF_SK_STORAGE_H */
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 4393bd4b2419..00459ca4c8cf 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -2931,6 +2931,7 @@ enum bpf_func_id {
> >  
> >  /* BPF_FUNC_sk_storage_get flags */
> >  #define BPF_SK_STORAGE_GET_F_CREATE	(1ULL << 0)
> > +#define BPF_SK_STORAGE_GET_F_CLONE	(1ULL << 1)
> It is only used in bpf_sk_storage_get().
> What if the elem is created from bpf_fd_sk_storage_update_elem()
> i.e. from the syscall API ?
> 
> What may be the use case for a map to have both CLONE and non-CLONE
> elements?  If it is not the case, would it be better to add
> BPF_F_CLONE to bpf_attr->map_flags?
I didn't think about putting it on the map itself since the API
is on a per-element, but it does make sense. I can't come up
with a use-case for a per-element selective clone/non-clone.
Thanks, will move to the map itself.

> >  
> >  /* Mode for BPF_FUNC_skb_adjust_room helper. */
> >  enum bpf_adj_room_mode {
> > diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
> > index 94c7f77ecb6b..b6dea67965bc 100644
> > --- a/net/core/bpf_sk_storage.c
> > +++ b/net/core/bpf_sk_storage.c
> > @@ -12,6 +12,9 @@
> >  
> >  static atomic_t cache_idx;
> >  
> > +#define BPF_SK_STORAGE_GET_F_MASK	(BPF_SK_STORAGE_GET_F_CREATE | \
> > +					 BPF_SK_STORAGE_GET_F_CLONE)
> > +
> >  struct bucket {
> >  	struct hlist_head list;
> >  	raw_spinlock_t lock;
> > @@ -66,7 +69,8 @@ struct bpf_sk_storage_elem {
> >  	struct hlist_node snode;	/* Linked to bpf_sk_storage */
> >  	struct bpf_sk_storage __rcu *sk_storage;
> >  	struct rcu_head rcu;
> > -	/* 8 bytes hole */
> > +	u8 clone:1;
> > +	/* 7 bytes hole */
> >  	/* The data is stored in aother cacheline to minimize
> >  	 * the number of cachelines access during a cache hit.
> >  	 */
> > @@ -509,7 +513,7 @@ static int sk_storage_delete(struct sock *sk, struct bpf_map *map)
> >  	return 0;
> >  }
> >  
> > -/* Called by __sk_destruct() */
> > +/* Called by __sk_destruct() & bpf_sk_storage_clone() */
> >  void bpf_sk_storage_free(struct sock *sk)
> >  {
> >  	struct bpf_sk_storage_elem *selem;
> > @@ -739,19 +743,106 @@ static int bpf_fd_sk_storage_delete_elem(struct bpf_map *map, void *key)
> >  	return err;
> >  }
> >  
> > +static struct bpf_sk_storage_elem *
> > +bpf_sk_storage_clone_elem(struct sock *newsk,
> > +			  struct bpf_sk_storage_map *smap,
> > +			  struct bpf_sk_storage_elem *selem)
> > +{
> > +	struct bpf_sk_storage_elem *copy_selem;
> > +
> > +	copy_selem = selem_alloc(smap, newsk, NULL, true);
> > +	if (!copy_selem)
> > +		return ERR_PTR(-ENOMEM);
> nit.
> may be just return NULL as selem_alloc() does.
Sounds good.

> > +
> > +	if (map_value_has_spin_lock(&smap->map))
> > +		copy_map_value_locked(&smap->map, SDATA(copy_selem)->data,
> > +				      SDATA(selem)->data, true);
> > +	else
> > +		copy_map_value(&smap->map, SDATA(copy_selem)->data,
> > +			       SDATA(selem)->data);
> > +
> > +	return copy_selem;
> > +}
> > +
> > +int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
> > +{
> > +	struct bpf_sk_storage *new_sk_storage = NULL;
> > +	struct bpf_sk_storage *sk_storage;
> > +	struct bpf_sk_storage_elem *selem;
> > +	int ret;
> > +
> > +	RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > +
> > +	rcu_read_lock();
> > +	sk_storage = rcu_dereference(sk->sk_bpf_storage);
> > +
> > +	if (!sk_storage || hlist_empty(&sk_storage->list))
> > +		goto out;
> > +
> > +	hlist_for_each_entry_rcu(selem, &sk_storage->list, snode) {
> > +		struct bpf_sk_storage_map *smap;
> > +		struct bpf_sk_storage_elem *copy_selem;
> > +
> > +		if (!selem->clone)
> > +			continue;
> > +
> > +		smap = rcu_dereference(SDATA(selem)->smap);
> > +		if (!smap)
> smap should not be NULL.
I see; you never set it back to NULL and we are guaranteed that the
map is still around due to rcu. Removed.

> > +			continue;
> > +
> > +		copy_selem = bpf_sk_storage_clone_elem(newsk, smap, selem);
> > +		if (IS_ERR(copy_selem)) {
> > +			ret = PTR_ERR(copy_selem);
> > +			goto err;
> > +		}
> > +
> > +		if (!new_sk_storage) {
> > +			ret = sk_storage_alloc(newsk, smap, copy_selem);
> > +			if (ret) {
> > +				kfree(copy_selem);
> > +				atomic_sub(smap->elem_size,
> > +					   &newsk->sk_omem_alloc);
> > +				goto err;
> > +			}
> > +
> > +			new_sk_storage = rcu_dereference(copy_selem->sk_storage);
> > +			continue;
> > +		}
> > +
> > +		raw_spin_lock_bh(&new_sk_storage->lock);
> > +		selem_link_map(smap, copy_selem);
> Unlike the existing selem-update use-cases in bpf_sk_storage.c,
> the smap->map.refcnt has not been held here.  Reading the smap
> is fine.  However, adding a new selem to a deleting smap is an issue.
> Hence, I think bpf_map_inc_not_zero() should be done first.
In this case, I should probably do it after smap = rcu_deref()?

> > +		__selem_link_sk(new_sk_storage, copy_selem);
> > +		raw_spin_unlock_bh(&new_sk_storage->lock);
> > +	}
> > +
> > +out:
> > +	rcu_read_unlock();
> > +	return 0;
> > +
> > +err:
> > +	rcu_read_unlock();
> > +
> > +	bpf_sk_storage_free(newsk);
> > +	return ret;
> > +}
> > +
> >  BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> >  	   void *, value, u64, flags)
> >  {
> >  	struct bpf_sk_storage_data *sdata;
> >  
> > -	if (flags > BPF_SK_STORAGE_GET_F_CREATE)
> > +	if (flags & ~BPF_SK_STORAGE_GET_F_MASK)
> > +		return (unsigned long)NULL;
> > +
> > +	if ((flags & BPF_SK_STORAGE_GET_F_CLONE) &&
> > +	    !(flags & BPF_SK_STORAGE_GET_F_CREATE))
> >  		return (unsigned long)NULL;
> >  
> >  	sdata = sk_storage_lookup(sk, map, true);
> >  	if (sdata)
> >  		return (unsigned long)sdata->data;
> >  
> > -	if (flags == BPF_SK_STORAGE_GET_F_CREATE &&
> > +	if ((flags & BPF_SK_STORAGE_GET_F_CREATE) &&
> >  	    /* Cannot add new elem to a going away sk.
> >  	     * Otherwise, the new elem may become a leak
> >  	     * (and also other memory issues during map
> > @@ -762,6 +853,9 @@ BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
> >  		/* sk must be a fullsock (guaranteed by verifier),
> >  		 * so sock_gen_put() is unnecessary.
> >  		 */
> > +		if (!IS_ERR(sdata))
> > +			SELEM(sdata)->clone =
> > +				!!(flags & BPF_SK_STORAGE_GET_F_CLONE);
> >  		sock_put(sk);
> >  		return IS_ERR(sdata) ?
> >  			(unsigned long)NULL : (unsigned long)sdata->data;
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index d57b0cc995a0..f5e801a9cea4 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -1851,9 +1851,12 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
> >  			goto out;
> >  		}
> >  		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
> > -#ifdef CONFIG_BPF_SYSCALL
> > -		RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
> > -#endif
> > +
> > +		if (bpf_sk_storage_clone(sk, newsk)) {
> > +			sk_free_unlock_clone(newsk);
> > +			newsk = NULL;
> > +			goto out;
> > +		}
> >  
> >  		newsk->sk_err	   = 0;
> >  		newsk->sk_err_soft = 0;
> > -- 
> > 2.22.0.770.g0f2c4a37fd-goog
> > 

^ permalink raw reply

* Re: [PATCH v2 04/15] net: phy: adin: add support for interrupts
From: Andrew Lunn @ 2019-08-08 15:25 UTC (permalink / raw)
  To: Alexandru Ardelean
  Cc: netdev, devicetree, linux-kernel, davem, robh+dt, mark.rutland,
	f.fainelli, hkallweit1
In-Reply-To: <20190808123026.17382-5-alexandru.ardelean@analog.com>

On Thu, Aug 08, 2019 at 03:30:15PM +0300, Alexandru Ardelean wrote:
> This change adds support for enabling PHY interrupts that can be used by
> the PHY framework to get signal for link/speed/auto-negotiation changes.
> 
> Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH v2 03/15] net: phy: adin: hook genphy_{suspend,resume} into the driver
From: Andrew Lunn @ 2019-08-08 15:24 UTC (permalink / raw)
  To: Alexandru Ardelean
  Cc: netdev, devicetree, linux-kernel, davem, robh+dt, mark.rutland,
	f.fainelli, hkallweit1
In-Reply-To: <20190808123026.17382-4-alexandru.ardelean@analog.com>

On Thu, Aug 08, 2019 at 03:30:14PM +0300, Alexandru Ardelean wrote:
> The chip supports standard suspend/resume via BMCR reg.
> Hook these functions into the `adin` driver.
> 
> Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH v2 02/15] net: phy: adin: hook genphy_read_abilities() to get_features
From: Andrew Lunn @ 2019-08-08 15:24 UTC (permalink / raw)
  To: Alexandru Ardelean
  Cc: netdev, devicetree, linux-kernel, davem, robh+dt, mark.rutland,
	f.fainelli, hkallweit1
In-Reply-To: <20190808123026.17382-3-alexandru.ardelean@analog.com>

On Thu, Aug 08, 2019 at 03:30:13PM +0300, Alexandru Ardelean wrote:
> The ADIN PHYs can operate with Clause 45, however they are not typical for
> how phylib considers Clause 45 PHYs.
> 
> If the `features` field & the `get_features` hook are unspecified, and the
> device wants to operate via Clause 45, it would also try to read features
> via the `genphy_c45_pma_read_abilities()`, which will try to read PMA regs
> that are unsupported.
> 
> Hooking the `genphy_read_abilities()` function to the `get_features` hook
> will ensure that this does not happen and the PHY features are read
> correctly regardless of Clause 22 or Clause 45 operation.

I think we need to stop and think about a PHY which supports both C22
and C45.

How does bus enumeration work? Is it discovered twice?  I've always
considered phydev->is_c45 means everything is c45, not that some
registers can be accessed via c45. But the driver is mixing c22 and
c45. Does the driver actually require c45? Are some features which are
only accessibly via C45? What does C45 actually bring us for this
device?

     Andrew

^ permalink raw reply

* Re: [PATCH v2 1/2] tcp: add new tcp_mtu_probe_floor sysctl
From: Josh Hunt @ 2019-08-08 15:14 UTC (permalink / raw)
  To: Eric Dumazet, netdev; +Cc: davem, edumazet, ncardwell
In-Reply-To: <a3a69a9d-5e30-77c4-02e2-c644bfdab820@gmail.com>

On 8/7/19 11:12 PM, Eric Dumazet wrote:
> 
> 
> On 8/8/19 1:52 AM, Josh Hunt wrote:
>> The current implementation of TCP MTU probing can considerably
>> underestimate the MTU on lossy connections allowing the MSS to get down to
>> 48. We have found that in almost all of these cases on our networks these
>> paths can handle much larger MTUs meaning the connections are being
>> artificially limited. Even though TCP MTU probing can raise the MSS back up
>> we have seen this not to be the case causing connections to be "stuck" with
>> an MSS of 48 when heavy loss is present.
>>
>> Prior to pushing out this change we could not keep TCP MTU probing enabled
>> b/c of the above reasons. Now with a reasonble floor set we've had it
>> enabled for the past 6 months.
> 
> I am still sad to see you do not share what is a reasonable value and let
> everybody guess.
> 
> It seems to be a top-secret value.

Haha, no sorry I didn't mean for it to come across like that.

We are currently setting tcp_base_mss to 1348 and tcp_mtu_probe_floor to 
1208. I thought I mentioned it in our earlier mails, but I guess I did 
not put the exact #s. 1348 was derived after analyzing common MTU we see 
across our networks and noticing that an MTU of around 1400 would cover 
a very large % (sorry I don't have the #s handy) of those paths. 1400 - 
20 - 20 - 12 = 1348. For the floor we based it off the v6 min MTU of 
1280 and subtracted out headers, etc, so 1280 - 40 - 20 - 12 = 1208. 
Using a floor of 1280 MTU matches what the RFC suggests in section 7.7 
for v6 connections, we're just applying that to v4 right now as well. I 
guess that brings up a good point, would per-IP proto floor sysctls be 
an option here for upstream (the RFC suggests different floors for v4 
and v6)? For now I'm not sure it makes sense b/c of the problems we see 
with lossy connections, but in the future if that can be resolved it 
seems like it would give some more flexibility.

I'd like to investigate this all further at some point to see if we can 
make it work better for lossy connections. It looks like one of the 
problems is there are a # of conditions which cause us to not probe in 
the upward direction. I'm not sure if any of those can be 
relaxed/changed and if so would help these connections out.

Thanks
Josh

^ permalink raw reply

* Re: [PATCH v2 01/15] net: phy: adin: add support for Analog Devices PHYs
From: Andrew Lunn @ 2019-08-08 15:13 UTC (permalink / raw)
  To: Alexandru Ardelean
  Cc: netdev, devicetree, linux-kernel, davem, robh+dt, mark.rutland,
	f.fainelli, hkallweit1
In-Reply-To: <20190808123026.17382-2-alexandru.ardelean@analog.com>

On Thu, Aug 08, 2019 at 03:30:12PM +0300, Alexandru Ardelean wrote:
> This change adds support for Analog Devices Industrial Ethernet PHYs.
> Particularly the PHYs this driver adds support for:
>  * ADIN1200 - Robust, Industrial, Low Power 10/100 Ethernet PHY
>  * ADIN1300 - Robust, Industrial, Low Latency 10/100/1000 Gigabit
>    Ethernet PHY
> 
> The 2 chips are pin & register compatible with one another. The main
> difference being that ADIN1200 doesn't operate in gigabit mode.
> 
> The chips can be operated by the Generic PHY driver as well via the
> standard IEEE PHY registers (0x0000 - 0x000F) which are supported by the
> kernel as well. This assumes that configuration of the PHY has been done
> completely in HW, according to spec.
> 
> Configuration can also be done via registers, which will be supported by
> this driver.
> 
> Datasheets:
>   https://www.analog.com/media/en/technical-documentation/data-sheets/ADIN1300.pdf
>   https://www.analog.com/media/en/technical-documentation/data-sheets/ADIN1200.pdf
> 
> Signed-off-by: Alexandru Ardelean <alexandru.ardelean@analog.com>

Reviewed-by: Andrew Lunn <andrew@lunn.ch>

    Andrew

^ permalink raw reply

* Re: [PATCH TEST] net: bridge: mcast: fix possible uses of stale pointers
From: kbuild test robot @ 2019-08-08 15:10 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: kbuild-all, Martin Weinelt, bridge, Roopa Prabhu, netdev
In-Reply-To: <908e9e90-70cc-7bbe-f83f-0810c9ef3925@cumulusnetworks.com>

Hi Nikolay,

I love your patch! Perhaps something to improve:

[auto build test WARNING on net/master]
[cannot apply to v5.3-rc3 next-20190808]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Nikolay-Aleksandrov/net-bridge-mcast-fix-possible-uses-of-stale-pointers/20190702-083354

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>


coccinelle warnings: (new ones prefixed by >>)

>> net/bridge/br_multicast.c:999:8-14: ERROR: application of sizeof to pointer

Please review and possibly fold the followup patch.

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

^ permalink raw reply

* [PATCH] fix noderef.cocci warnings
From: kbuild test robot @ 2019-08-08 15:10 UTC (permalink / raw)
  To: Nikolay Aleksandrov
  Cc: kbuild-all, Martin Weinelt, bridge, Roopa Prabhu, netdev
In-Reply-To: <908e9e90-70cc-7bbe-f83f-0810c9ef3925@cumulusnetworks.com>

From: kbuild test robot <lkp@intel.com>

net/bridge/br_multicast.c:999:8-14: ERROR: application of sizeof to pointer

 sizeof when applied to a pointer typed expression gives the size of
 the pointer

Generated by: scripts/coccinelle/misc/noderef.cocci

Fixes: 17c91348ed8b ("Use-after-free in br_multicast_rcv")
CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: kbuild test robot <lkp@intel.com>
---

url:    https://github.com/0day-ci/linux/commits/Nikolay-Aleksandrov/net-bridge-mcast-fix-possible-uses-of-stale-pointers/20190702-083354

 br_multicast.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -996,7 +996,7 @@ static int br_ip6_multicast_mld2_report(
 			return -EINVAL;
 
 		_nsrcs = skb_header_pointer(skb, nsrcs_offset,
-					   sizeof(_nsrcs), &__nsrcs);
+					   sizeof(*_nsrcs), &__nsrcs);
 		if (!_nsrcs)
 			return -EINVAL;
 

^ permalink raw reply

* memory leak in sctp_get_port_local (2)
From: syzbot @ 2019-08-08 14:58 UTC (permalink / raw)
  To: davem, linux-kernel, linux-sctp, marcelo.leitner, netdev, nhorman,
	syzkaller-bugs, vyasevich

Hello,

syzbot found the following crash on:

HEAD commit:    0eb0ce0a Merge tag 'spi-fix-v5.3-rc3' of git://git.kernel...
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=1234588c600000
kernel config:  https://syzkaller.appspot.com/x/.config?x=39113f5c48aea971
dashboard link: https://syzkaller.appspot.com/bug?extid=2d7ecdf99f15689032b3
compiler:       gcc (GCC) 9.0.0 20181231 (experimental)
syz repro:      https://syzkaller.appspot.com/x/repro.syz?x=160e1906600000
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=140ab906600000

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+2d7ecdf99f15689032b3@syzkaller.appspotmail.com

executing program
executing program
executing program
executing program
executing program
BUG: memory leak
unreferenced object 0xffff88810fa4b380 (size 64):
   comm "syz-executor900", pid 7117, jiffies 4294946947 (age 16.560s)
   hex dump (first 32 bytes):
     20 4e 00 00 89 e7 4c 8d 00 00 00 00 00 00 00 00   N....L.........
     58 40 dd 16 82 88 ff ff 00 00 00 00 00 00 00 00  X@..............
   backtrace:
     [<00000000f1461735>] kmemleak_alloc_recursive  
include/linux/kmemleak.h:43 [inline]
     [<00000000f1461735>] slab_post_alloc_hook mm/slab.h:522 [inline]
     [<00000000f1461735>] slab_alloc mm/slab.c:3319 [inline]
     [<00000000f1461735>] kmem_cache_alloc+0x13f/0x2c0 mm/slab.c:3483
     [<00000000ff3ccf22>] sctp_bucket_create net/sctp/socket.c:8374 [inline]
     [<00000000ff3ccf22>] sctp_get_port_local+0x189/0x5b0  
net/sctp/socket.c:8121
     [<00000000eed41612>] sctp_do_bind+0xcc/0x1e0 net/sctp/socket.c:402
     [<000000002bf65239>] sctp_bind+0x44/0x70 net/sctp/socket.c:302
     [<00000000b1aaaf57>] inet_bind+0x40/0xc0 net/ipv4/af_inet.c:441
     [<00000000db36b917>] __sys_bind+0x11c/0x140 net/socket.c:1647
     [<00000000679cfe3c>] __do_sys_bind net/socket.c:1658 [inline]
     [<00000000679cfe3c>] __se_sys_bind net/socket.c:1656 [inline]
     [<00000000679cfe3c>] __x64_sys_bind+0x1e/0x30 net/socket.c:1656
     [<000000002aac3ac2>] do_syscall_64+0x76/0x1a0  
arch/x86/entry/common.c:296
     [<000000000c38e074>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

BUG: memory leak
unreferenced object 0xffff88810fa4b380 (size 64):
   comm "syz-executor900", pid 7117, jiffies 4294946947 (age 19.260s)
   hex dump (first 32 bytes):
     20 4e 00 00 89 e7 4c 8d 00 00 00 00 00 00 00 00   N....L.........
     58 40 dd 16 82 88 ff ff 00 00 00 00 00 00 00 00  X@..............
   backtrace:
     [<00000000f1461735>] kmemleak_alloc_recursive  
include/linux/kmemleak.h:43 [inline]
     [<00000000f1461735>] slab_post_alloc_hook mm/slab.h:522 [inline]
     [<00000000f1461735>] slab_alloc mm/slab.c:3319 [inline]
     [<00000000f1461735>] kmem_cache_alloc+0x13f/0x2c0 mm/slab.c:3483
     [<00000000ff3ccf22>] sctp_bucket_create net/sctp/socket.c:8374 [inline]
     [<00000000ff3ccf22>] sctp_get_port_local+0x189/0x5b0  
net/sctp/socket.c:8121
     [<00000000eed41612>] sctp_do_bind+0xcc/0x1e0 net/sctp/socket.c:402
     [<000000002bf65239>] sctp_bind+0x44/0x70 net/sctp/socket.c:302
     [<00000000b1aaaf57>] inet_bind+0x40/0xc0 net/ipv4/af_inet.c:441
     [<00000000db36b917>] __sys_bind+0x11c/0x140 net/socket.c:1647
     [<00000000679cfe3c>] __do_sys_bind net/socket.c:1658 [inline]
     [<00000000679cfe3c>] __se_sys_bind net/socket.c:1656 [inline]
     [<00000000679cfe3c>] __x64_sys_bind+0x1e/0x30 net/socket.c:1656
     [<000000002aac3ac2>] do_syscall_64+0x76/0x1a0  
arch/x86/entry/common.c:296
     [<000000000c38e074>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

BUG: memory leak
unreferenced object 0xffff88810fa4b380 (size 64):
   comm "syz-executor900", pid 7117, jiffies 4294946947 (age 21.990s)
   hex dump (first 32 bytes):
     20 4e 00 00 89 e7 4c 8d 00 00 00 00 00 00 00 00   N....L.........
     58 40 dd 16 82 88 ff ff 00 00 00 00 00 00 00 00  X@..............
   backtrace:
     [<00000000f1461735>] kmemleak_alloc_recursive  
include/linux/kmemleak.h:43 [inline]
     [<00000000f1461735>] slab_post_alloc_hook mm/slab.h:522 [inline]
     [<00000000f1461735>] slab_alloc mm/slab.c:3319 [inline]
     [<00000000f1461735>] kmem_cache_alloc+0x13f/0x2c0 mm/slab.c:3483
     [<00000000ff3ccf22>] sctp_bucket_create net/sctp/socket.c:8374 [inline]
     [<00000000ff3ccf22>] sctp_get_port_local+0x189/0x5b0  
net/sctp/socket.c:8121
     [<00000000eed41612>] sctp_do_bind+0xcc/0x1e0 net/sctp/socket.c:402
     [<000000002bf65239>] sctp_bind+0x44/0x70 net/sctp/socket.c:302
     [<00000000b1aaaf57>] inet_bind+0x40/0xc0 net/ipv4/af_inet.c:441
     [<00000000db36b917>] __sys_bind+0x11c/0x140 net/socket.c:1647
     [<00000000679cfe3c>] __do_sys_bind net/socket.c:1658 [inline]
     [<00000000679cfe3c>] __se_sys_bind net/socket.c:1656 [inline]
     [<00000000679cfe3c>] __x64_sys_bind+0x1e/0x30 net/socket.c:1656
     [<000000002aac3ac2>] do_syscall_64+0x76/0x1a0  
arch/x86/entry/common.c:296
     [<000000000c38e074>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

BUG: memory leak
unreferenced object 0xffff88810fa4b380 (size 64):
   comm "syz-executor900", pid 7117, jiffies 4294946947 (age 22.940s)
   hex dump (first 32 bytes):
     20 4e 00 00 89 e7 4c 8d 00 00 00 00 00 00 00 00   N....L.........
     58 40 dd 16 82 88 ff ff 00 00 00 00 00 00 00 00  X@..............
   backtrace:
     [<00000000f1461735>] kmemleak_alloc_recursive  
include/linux/kmemleak.h:43 [inline]
     [<00000000f1461735>] slab_post_alloc_hook mm/slab.h:522 [inline]
     [<00000000f1461735>] slab_alloc mm/slab.c:3319 [inline]
     [<00000000f1461735>] kmem_cache_alloc+0x13f/0x2c0 mm/slab.c:3483
     [<00000000ff3ccf22>] sctp_bucket_create net/sctp/socket.c:8374 [inline]
     [<00000000ff3ccf22>] sctp_get_port_local+0x189/0x5b0  
net/sctp/socket.c:8121
     [<00000000eed41612>] sctp_do_bind+0xcc/0x1e0 net/sctp/socket.c:402
     [<000000002bf65239>] sctp_bind+0x44/0x70 net/sctp/socket.c:302
     [<00000000b1aaaf57>] inet_bind+0x40/0xc0 net/ipv4/af_inet.c:441
     [<00000000db36b917>] __sys_bind+0x11c/0x140 net/socket.c:1647
     [<00000000679cfe3c>] __do_sys_bind net/socket.c:1658 [inline]
     [<00000000679cfe3c>] __se_sys_bind net/socket.c:1656 [inline]
     [<00000000679cfe3c>] __x64_sys_bind+0x1e/0x30 net/socket.c:1656
     [<000000002aac3ac2>] do_syscall_64+0x76/0x1a0  
arch/x86/entry/common.c:296
     [<000000000c38e074>] entry_SYSCALL_64_after_hwframe+0x44/0xa9

executing program
executing program
executing program
executing program


---
This bug is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this bug report. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.
syzbot can test patches for this bug, for details see:
https://goo.gl/tpsmEJ#testing-patches

^ permalink raw reply

* RE: [PATCH] tipc: set addr_trail_end when using explicit node addresses
From: Jon Maloy @ 2019-08-08 14:49 UTC (permalink / raw)
  To: Chris Packham, ying.xue@windriver.com, davem@davemloft.net
  Cc: netdev@vger.kernel.org, tipc-discussion@lists.sourceforge.net,
	linux-kernel@vger.kernel.org
In-Reply-To: <20190807045543.28373-1-chris.packham@alliedtelesis.co.nz>

You should rather set this one unconditionally in tipc_set_node_addr().
The problems is not about the state machine, but that jiffies is close to the wrap-around time, so that it is perceived as being before the time "0".

BR
///jon


> -----Original Message-----
> From: netdev-owner@vger.kernel.org <netdev-owner@vger.kernel.org> On
> Behalf Of Chris Packham
> Sent: 7-Aug-19 00:56
> To: Jon Maloy <jon.maloy@ericsson.com>; ying.xue@windriver.com;
> davem@davemloft.net
> Cc: netdev@vger.kernel.org; tipc-discussion@lists.sourceforge.net; linux-
> kernel@vger.kernel.org; Chris Packham <chris.packham@alliedtelesis.co.nz>
> Subject: [PATCH] tipc: set addr_trail_end when using explicit node addresses
> 
> When tipc uses auto-generated node addresses it goes through a duplicate
> address detection phase to ensure the address is unique.
> 
> When using explicitly configured node names the DAD phase is skipped.
> However addr_trail_end was being left set to 0 which causes parts of the tipc
> state machine to assume that the address is not yet valid and unnecessarily
> delays the discovery phase. By setting addr_trail_end to jiffies when using
> explicit addresses we ensure that we move straight to discovery.
> 
> Signed-off-by: Chris Packham <chris.packham@alliedtelesis.co.nz>
> ---
>  net/tipc/discover.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/net/tipc/discover.c b/net/tipc/discover.c index
> c138d68e8a69..f83bfe8c9443 100644
> --- a/net/tipc/discover.c
> +++ b/net/tipc/discover.c
> @@ -361,6 +361,8 @@ int tipc_disc_create(struct net *net, struct
> tipc_bearer *b,
>  	if (!tipc_own_addr(net)) {
>  		tn->addr_trial_end = jiffies + msecs_to_jiffies(1000);
>  		msg_set_type(buf_msg(d->skb), DSC_TRIAL_MSG);
> +	} else {
> +		tn->addr_trial_end = jiffies;
>  	}
>  	memcpy(&d->dest, dest, sizeof(*dest));
>  	d->net = net;
> --
> 2.22.0


^ permalink raw reply

* Re: [PATCH net-next] r8169: make use of xmit_more
From: Holger Hoffstätte @ 2019-08-08 14:37 UTC (permalink / raw)
  To: Heiner Kallweit, Realtek linux nic maintainers, David Miller
  Cc: netdev@vger.kernel.org, Sander Eikelenboom, Eric Dumazet
In-Reply-To: <2950b2f7-7460-cce0-d964-ad654d897295@gmail.com>


Hello Heiner -

On 7/28/19 11:25 AM, Heiner Kallweit wrote:
> There was a previous attempt to use xmit_more, but the change had to be
> reverted because under load sometimes a transmit timeout occurred [0].
> Maybe this was caused by a missing memory barrier, the new attempt
> keeps the memory barrier before the call to netif_stop_queue like it
> is used by the driver as of today. The new attempt also changes the
> order of some calls as suggested by Eric.
> 
> [0] https://lkml.org/lkml/2019/2/10/39
> 
> Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>

I decided to take one for the team and merged this into my 5.2.x tree (just
fixing up the path) and it has been working fine for the last 2 weeks in two
machines..until today, when for the first time in forever some random NFS traffic
made this old friend come out from under the couch:

[Aug 8 14:13] ------------[ cut here ]------------
[  +0.000006] NETDEV WATCHDOG: eth0 (r8169): transmit queue 0 timed out
[  +0.000021] WARNING: CPU: 3 PID: 0 at net/sched/sch_generic.c:442 dev_watchdog+0x21f/0x230
[  +0.000001] Modules linked in: lz4 lz4_compress lz4_decompress nfsd auth_rpcgss oid_registry lockd grace sunrpc sch_fq_codel btrfs xor zstd_compress raid6_pq zstd_decompress bfq jitterentropy_rng nct6775 hwmon_vid coretemp hwmon x86_pkg_temp_thermal aesni_intel aes_x86_64 i915 glue_helper crypto_simd cryptd i2c_i801 intel_gtt i2c_algo_bit iosf_mbi drm_kms_helper syscopyarea usbhid sysfillrect r8169 sysimgblt fb_sys_fops realtek drm libphy drm_panel_orientation_quirks i2c_core video backlight mq_deadline
[  +0.000026] CPU: 3 PID: 0 Comm: swapper/3 Not tainted 5.2.7 #1
[  +0.000001] Hardware name: System manufacturer System Product Name/P8Z68-V LX, BIOS 4105 07/01/2013
[  +0.000004] RIP: 0010:dev_watchdog+0x21f/0x230
[  +0.000002] Code: 3b 00 75 ea eb ad 4c 89 ef c6 05 1c 45 bd 00 01 e8 66 35 fc ff 44 89 e1 4c 89 ee 48 c7 c7 e8 5e fc 81 48 89 c2 e8 90 df 92 ff <0f> 0b eb 8e 66 66 2e 0f 1f 84 00 00 00 00 00 66 90 66 66 66 66 90
[  +0.000002] RSP: 0018:ffffc90000118e68 EFLAGS: 00010286
[  +0.000002] RAX: 0000000000000000 RBX: ffff8887f7837600 RCX: 0000000000000303
[  +0.000001] RDX: 0000000000000001 RSI: 0000000000000092 RDI: ffffffff827a488c
[  +0.000001] RBP: ffff8887f9fbc440 R08: 0000000000000303 R09: 0000000000000003
[  +0.000001] R10: 000000000001004c R11: 0000000000000001 R12: 0000000000000000
[  +0.000009] R13: ffff8887f9fbc000 R14: ffffffff8173aa20 R15: dead000000000200
[  +0.000001] FS:  0000000000000000(0000) GS:ffff8887ff580000(0000) knlGS:0000000000000000
[  +0.000000] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  +0.000001] CR2: 00007f8d1c04d000 CR3: 0000000002209001 CR4: 00000000000606e0
[  +0.000000] Call Trace:
[  +0.000002]  <IRQ>
[  +0.000005]  call_timer_fn+0x2b/0x120
[  +0.000002]  expire_timers+0xa4/0x100
[  +0.000001]  run_timer_softirq+0x8c/0x170
[  +0.000002]  ? __hrtimer_run_queues+0x13a/0x290
[  +0.000003]  ? sched_clock_cpu+0xe/0x130
[  +0.000003]  __do_softirq+0xeb/0x2de
[  +0.000003]  irq_exit+0x9d/0xe0
[  +0.000002]  smp_apic_timer_interrupt+0x60/0x110
[  +0.000003]  apic_timer_interrupt+0xf/0x20
[  +0.000001]  </IRQ>
[  +0.000003] RIP: 0010:cpuidle_enter_state+0xad/0x930
[  +0.000001] Code: c5 66 66 66 66 90 31 ff e8 90 99 9e ff 80 7c 24 0b 00 74 12 9c 58 f6 c4 02 0f 85 39 08 00 00 31 ff e8 e7 26 a2 ff fb 45 85 e4 <0f> 88 34 02 00 00 49 63 cc 4c 2b 2c 24 48 8d 04 49 48 c1 e0 05 8b
[  +0.000000] RSP: 0018:ffffc9000008be50 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13
[  +0.000001] RAX: ffff8887ff5a9180 RBX: ffffffff822b6c40 RCX: 000000000000001f
[  +0.000001] RDX: 0000000000000000 RSI: 0000000033087154 RDI: 0000000000000000
[  +0.000001] RBP: ffff8887ff5b1310 R08: 000030d021fae397 R09: ffff8887ff59c8c0
[  +0.000000] R10: ffff8887ff59c8c0 R11: 0000000000000006 R12: 0000000000000004
[  +0.000001] R13: 000030d021fae397 R14: 0000000000000004 R15: ffff8887fc281600
[  +0.000001]  cpuidle_enter+0x29/0x40
[  +0.000002]  do_idle+0x1e5/0x280
[  +0.000001]  cpu_startup_entry+0x19/0x20
[  +0.000002]  start_secondary+0x186/0x1c0
[  +0.000001]  secondary_startup_64+0xa4/0xb0
[  +0.000001] ---[ end trace 99493c768580f4fd ]---

The device is:

Aug  7 23:19:09 tux kernel: libphy: r8169: probed
Aug  7 23:19:09 tux kernel: r8169 0000:04:00.0 eth0: RTL8168evl/8111evl, c8:60:00:68:33:cc, XID 2c9, IRQ 36
Aug  7 23:19:09 tux kernel: r8169 0000:04:00.0 eth0: jumbo features [frames: 9200 bytes, tx checksumming: ko]
Aug  7 23:19:12 tux kernel: RTL8211E Gigabit Ethernet r8169-400:00: attached PHY driver [RTL8211E Gigabit Ethernet] (mii_bus:phy_addr=r8169-400:00, irq=IGNORE)
Aug  7 23:19:13 tux kernel: r8169 0000:04:00.0 eth0: No native access to PCI extended config space, falling back to CSI

and using fq_codel, of course.

This cpuidle hiccup used to be completely gone without xmit_more and this was
the first (and so far only) time since merging it (regardless of load).
Also, while I'm using BMQ as CPU scheduler, that hasn't made a difference for
this particular problem in the past (with MuQSS/PDS) either; way back when I had
Eric's previous attempt(s) it also hiccupped with CFS.

Revert or wait for more reports when -next is merged in 5.4?

thanks,
Holger

^ permalink raw reply

* Re: Realtek r8822be wireless card fails to work with new rtw88 kernel module
From: 고준 @ 2019-08-08 14:26 UTC (permalink / raw)
  To: Tony Chuang
  Cc: Brian Norris, linux-wireless, <netdev@vger.kernel.org>,
	Linux Kernel
In-Reply-To: <F7CD281DE3E379468C6D07993EA72F84D1889B04@RTITMBSVM04.realtek.com.tw>

Hello,

Thanks for sharing the patch, Brian. I am seeing some progress when
building 5.3.0-rc1+ with
the wireless-drivers-next patch for the rtw88 kernel module. Before
the patch, my realtek r8822be
was not recognized at all.

After the patch, Realtek ethernet as well as wireless card r8822be are
recognized and I can see
a list of wireless access points. But for some reason, ping to my
local gateway servers (both
Ethernet and wireless) fail. Running tcpdump on my wireless and
ethernet interfaces shows
that ARP requests are showing up, but dns resolution doesn't work. I
can associate with a
wireless access point with wpa_supplicant and my ethernet port is
getting a DHCP lease from
my dhcp server, however.

And for YH~
Here is a dropbox link to debug info containing the output of dmesg,
lsmod, and journalctl -b0 zipped
up into a tarball:

https://www.dropbox.com/s/pl85ob09y6q2qky/debug_5.3.0-rc1%2B_with_rtw88_patch.tar.gz?dl=0

Thanks for your help!
Jun


Link to GPG Public Key:
https://keybase.io/gojun077#show-public


Backup link:
https://keys.openpgp.org/vks/v1/by-fingerprint/79F173A93EB3623D32F86309A56930CF7235138D


Link to GPG Public Key:

https://keybase.io/gojun077#show-public


Backup link:

https://keys.openpgp.org/vks/v1/by-fingerprint/79F173A93EB3623D32F86309A56930CF7235138D



On Wed, Aug 7, 2019 at 11:33 AM Tony Chuang <yhchuang@realtek.com> wrote:
>
> > + yhchuang
> >
> > On Tue, Aug 6, 2019 at 7:32 AM 고준 <gojun077@gmail.com> wrote:
> > >
> > > Hello,
> > >
> > > I recently reported a bug to Ubuntu regarding a regression in wireless
> > > driver support for the Realtek r8822be wireless chipset. The issue
> > > link on launchpad is:
> > >
> > > https://bugs.launchpad.net/bugs/1838133
> > >
> > > After Canonical developers triaged the bug they determined that the
> > > problem lies upstream, and instructed me to send mails to the relevant
> > > kernel module maintainers at Realtek and to the general kernel.org
> > > mailing list.
> > >
> > > I built kernel 5.3.0-rc1+ with the latest realtek drivers from
> > > wireless-drivers-next but my Realtek r8822be doesn't work with
> > > rtw88/rtwpci kernel modules.
> > >
> > > Please let me know if there is any additional information I can
> > > provide that would help in debugging this issue.
> >
> > Any chance this would help you?
> >
> > https://patchwork.kernel.org/patch/11065631/
> >
> > Somebody else was complaining about 8822be regressions that were fixed
> > with that.
> >
>
> I hope it could fix it.
>
> And as "r8822be" was dropped, it is preferred to use "rtw88" instead.
> I have received two kinds of failures that cause driver stop working.
> One is the MSI interrupt should be enabled on certain platforms.
> Another is the RFE type of the card, could you send more dmesg to me?
>
> Yan-Hsuan
>
>

^ permalink raw reply

* [PATCH net-next] taprio: remove unused variable 'entry_list_policy'
From: YueHaibing @ 2019-08-08 14:26 UTC (permalink / raw)
  To: davem, jhs, xiyou.wangcong, jiri, vinicius.gomes
  Cc: linux-kernel, netdev, YueHaibing

net/sched/sch_taprio.c:680:32: warning:
 entry_list_policy defined but not used [-Wunused-const-variable=]

It is not used since commit a3d43c0d56f1 ("taprio: Add
support adding an admin schedule")

Reported-by: Hulk Robot <hulkci@huawei.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
---
 net/sched/sch_taprio.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index c39db50..046fd2c 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -677,10 +677,6 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = {
 	[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]  = { .type = NLA_U32 },
 };
 
-static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = {
-	[TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED },
-};
-
 static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = {
 	[TCA_TAPRIO_ATTR_PRIOMAP]	       = {
 		.len = sizeof(struct tc_mqprio_qopt)
-- 
2.7.4



^ permalink raw reply related

* [PATCH net-next v2 4/9] net: introduce MACsec ops and add a reference in net_device
From: Antoine Tenart @ 2019-08-08 14:05 UTC (permalink / raw)
  To: davem, sd, andrew, f.fainelli, hkallweit1
  Cc: Antoine Tenart, netdev, linux-kernel, thomas.petazzoni,
	alexandre.belloni, allan.nielsen, camelia.groza, Simon.Edelhaus
In-Reply-To: <20190808140600.21477-1-antoine.tenart@bootlin.com>

This patch introduces MACsec ops for drivers to support offloading
MACsec operations. A reference to those ops is added in net_device.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 include/linux/netdevice.h | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 88292953aa6f..59ff123d62e3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -53,6 +53,7 @@ struct netpoll_info;
 struct device;
 struct phy_device;
 struct dsa_port;
+struct macsec_context;
 
 struct sfp_bus;
 /* 802.11 specific */
@@ -910,6 +911,29 @@ struct xfrmdev_ops {
 };
 #endif
 
+#if defined(CONFIG_MACSEC)
+struct macsec_ops {
+	/* Device wide */
+	int (*mdo_dev_open)(struct macsec_context *ctx);
+	int (*mdo_dev_stop)(struct macsec_context *ctx);
+	/* SecY */
+	int (*mdo_add_secy)(struct macsec_context *ctx);
+	int (*mdo_upd_secy)(struct macsec_context *ctx);
+	int (*mdo_del_secy)(struct macsec_context *ctx);
+	/* Security channels */
+	int (*mdo_add_rxsc)(struct macsec_context *ctx);
+	int (*mdo_upd_rxsc)(struct macsec_context *ctx);
+	int (*mdo_del_rxsc)(struct macsec_context *ctx);
+	/* Security associations */
+	int (*mdo_add_rxsa)(struct macsec_context *ctx);
+	int (*mdo_upd_rxsa)(struct macsec_context *ctx);
+	int (*mdo_del_rxsa)(struct macsec_context *ctx);
+	int (*mdo_add_txsa)(struct macsec_context *ctx);
+	int (*mdo_upd_txsa)(struct macsec_context *ctx);
+	int (*mdo_del_txsa)(struct macsec_context *ctx);
+};
+#endif
+
 struct dev_ifalias {
 	struct rcu_head rcuhead;
 	char ifalias[];
@@ -1755,6 +1779,8 @@ enum netdev_priv_flags {
  *
  *	@wol_enabled:	Wake-on-LAN is enabled
  *
+ *	@macsec_ops:    MACsec offloading ops
+ *
  *	FIXME: cleanup struct net_device such that network protocol info
  *	moves out.
  */
@@ -2036,6 +2062,11 @@ struct net_device {
 	struct lock_class_key	*qdisc_running_key;
 	bool			proto_down;
 	unsigned		wol_enabled:1;
+
+#if IS_ENABLED(CONFIG_MACSEC)
+	/* MACsec management functions */
+	const struct macsec_ops *macsec_ops;
+#endif
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v2 0/9] net: macsec: initial support for hardware offloading
From: Antoine Tenart @ 2019-08-08 14:05 UTC (permalink / raw)
  To: davem, sd, andrew, f.fainelli, hkallweit1
  Cc: Antoine Tenart, netdev, linux-kernel, thomas.petazzoni,
	alexandre.belloni, allan.nielsen, camelia.groza, Simon.Edelhaus

Hello,

This series intends to add support for offloading MACsec transformations
in hardware enabled devices. The series is divided in two parts: the
first 6 patches add the infrastructure support to offload a MACsec
configuration to hardware drivers; and the last 3 patches introduce the
MACsec offloading support in the Microsemi Ocelot networking PHY, making
it the first driver to support the MACsec hardware offloading feature.

The series can also be found at:
https://github.com/atenart/linux/tree/net-next/macsec

MACsec hardware offloading infrastructure
-----------------------------------------

Linux has a software implementation of the MACsec standard and so far no
hardware offloading feature was developed and submitted. Some hardware
engines can perform MACsec operations, such as the Intel ixgbe NIC and
the Microsemi Ocelot PHY (the one we use in this series). This means the
MACsec offloading infrastructure should support networking PHY and
Ethernet drivers. A preliminary email[1] was sent about this.

The main idea here is to re-use the logic and data structures of the
software MACsec implementation. This allows not to duplicate definitions
and structure storing the same kind of information. It also allows to
use a unified genlink interface for both MACsec implementations (so that
the same userspace tool, `ip macsec`, is used with the same arguments).
The MACsec offloading support cannot be disabled if an interface
supports it at the moment, but this could be implemented later on if
this is a need (we could think of something like
`ip macsec set macsec0 offloading off`).

Because we do reuse the software implementation logic and because the
choice was made to expose the exact same interface to the user, a
virtual interface is created exactly as if the MACsec software
implementation was used. This was a big question when doing this work,
and another approach would have been to register the genl helpers for
all MACsec implementations and to have the software one a provider (such
as the h/w offloading device drivers are). This would mean there would
be no way to switch between implementations in the future at runtime.
I'm open to discuss this point as I think this is really important and
I'm not sure what is the best solution here.

The MACsec configuration is passed to device drivers supporting it
through MACsec ops which are called (indirectly) from the MACsec
genl helpers. This function calls the MACsec ops of PHY and Ethernet
drivers in two steps: a preparation one, and a commit one. The first
step is allowed to fail and should be used to check if a provided
configuration is compatible with the features provided by a MACsec
engine, while the second step is not allowed to fail and should only be
used to enable a given MACsec configuration. Two extra calls are made:
when a virtual MACsec interface is created and when it is deleted, so
that the hardware driver can stay in sync.

The Rx and TX handlers are modified to take in account the special case
were the MACsec transformation happens in the hardware, whether in a PHY
or in a MAC, as the packets seen by the networking stack on both the
physical and MACsec virtual interface are exactly the same. This leads
to some limitations: the hardware and software implementations can't be
used on the same physical interface, as the policies would be impossible
to fulfill (such as strict validation of the frames). Also only a single
virtual MACsec interface can be attached to a physical port supporting
hardware offloading as it would be impossible to guess onto which
interface a given packet should go (for ingress traffic).

Another limitation as of now is that the counters and statistics are not
reported back from the hardware to the software MACsec implementation.
This isn't an issue when using offloaded MACsec transformations, but it
should be added in the future so that the MACsec state can be reported
to the user (which would also improve the debug).

[1] https://www.spinics.net/lists/netdev/msg513047.html

Microsemi Ocelot PHY MACsec support
-----------------------------------

In order to add support for the MACsec offloading feature in the
Microsemi Ocelot driver, the __phy_read_page and __phy_write_page
helpers had to be exported. This is because the initialization of the
PHY is done while holding the MDIO bus lock, and we need to change the
page to configure the MACsec block.

The support itself is then added in two patches. The first one adds
support for configuring the MACsec block within the PHY, so that it is
up, running and available for future configuration, but is not doing any
modification on the traffic passing through the PHY. The second patch
implements the phy_device MACsec ops in the Microsemi Ocelot PHY driver,
and introduce helpers to configure MACsec transformations and flows to
match specific packets.

Thanks!
Antoine

Since v1:
  - Reworked the MACsec offloading API, moving from a single helper
    called for all MACsec configuration operations, to a per-operation
    function that is provided by the underlying hardware drivers.
  - Those functions now contain a verb to describe the configuration
    action they're offloading.
  - Improved the error handling in the MACsec genl helpers to revert
    the configuration to its previous state when the offloading call
    failed.
  - Reworked the file inclusions.

Antoine Tenart (9):
  net: introduce the MACSEC netdev feature
  net: macsec: move some definitions in a dedicated header
  net: macsec: introduce the macsec_context structure
  net: introduce MACsec ops and add a reference in net_device
  net: phy: add MACsec ops in phy_device
  net: macsec: hardware offloading infrastructure
  net: phy: export __phy_read_page/__phy_write_page
  net: phy: mscc: macsec initialization
  net: phy: mscc: macsec support

 drivers/net/macsec.c             |  542 ++++++++++------
 drivers/net/phy/Kconfig          |    2 +
 drivers/net/phy/mscc.c           | 1024 ++++++++++++++++++++++++++++++
 drivers/net/phy/mscc_fc_buffer.h |   64 ++
 drivers/net/phy/mscc_mac.h       |  159 +++++
 drivers/net/phy/mscc_macsec.h    |  258 ++++++++
 drivers/net/phy/phy-core.c       |    6 +-
 include/linux/netdev_features.h  |    3 +
 include/linux/netdevice.h        |   31 +
 include/linux/phy.h              |   13 +
 include/net/macsec.h             |  203 ++++++
 include/uapi/linux/if_macsec.h   |    3 +-
 net/core/ethtool.c               |    1 +
 13 files changed, 2125 insertions(+), 184 deletions(-)
 create mode 100644 drivers/net/phy/mscc_fc_buffer.h
 create mode 100644 drivers/net/phy/mscc_mac.h
 create mode 100644 drivers/net/phy/mscc_macsec.h
 create mode 100644 include/net/macsec.h

-- 
2.21.0

^ permalink raw reply

* [PATCH net-next v2 5/9] net: phy: add MACsec ops in phy_device
From: Antoine Tenart @ 2019-08-08 14:05 UTC (permalink / raw)
  To: davem, sd, andrew, f.fainelli, hkallweit1
  Cc: Antoine Tenart, netdev, linux-kernel, thomas.petazzoni,
	alexandre.belloni, allan.nielsen, camelia.groza, Simon.Edelhaus
In-Reply-To: <20190808140600.21477-1-antoine.tenart@bootlin.com>

This patch adds a reference to MACsec ops in the phy_device, to allow
PHYs to support offloading MACsec operations. The phydev lock will be
held while calling those helpers.

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 include/linux/phy.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/linux/phy.h b/include/linux/phy.h
index 462b90b73f93..6947a19587e4 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -22,6 +22,10 @@
 #include <linux/workqueue.h>
 #include <linux/mod_devicetable.h>
 
+#ifdef CONFIG_MACSEC
+#include <net/macsec.h>
+#endif
+
 #include <linux/atomic.h>
 
 #define PHY_DEFAULT_FEATURES	(SUPPORTED_Autoneg | \
@@ -345,6 +349,7 @@ struct phy_c45_device_ids {
  * attached_dev: The attached enet driver's device instance ptr
  * adjust_link: Callback for the enet controller to respond to
  * changes in the link state.
+ * macsec_ops: MACsec offloading ops.
  *
  * speed, duplex, pause, supported, advertising, lp_advertising,
  * and autoneg are used like in mii_if_info
@@ -438,6 +443,11 @@ struct phy_device {
 
 	void (*phy_link_change)(struct phy_device *, bool up, bool do_carrier);
 	void (*adjust_link)(struct net_device *dev);
+
+#if defined(CONFIG_MACSEC)
+	/* MACsec management functions */
+	const struct macsec_ops *macsec_ops;
+#endif
 };
 #define to_phy_device(d) container_of(to_mdio_device(d), \
 				      struct phy_device, mdio)
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v2 6/9] net: macsec: hardware offloading infrastructure
From: Antoine Tenart @ 2019-08-08 14:05 UTC (permalink / raw)
  To: davem, sd, andrew, f.fainelli, hkallweit1
  Cc: Antoine Tenart, netdev, linux-kernel, thomas.petazzoni,
	alexandre.belloni, allan.nielsen, camelia.groza, Simon.Edelhaus
In-Reply-To: <20190808140600.21477-1-antoine.tenart@bootlin.com>

This patch introduces the MACsec hardware offloading infrastructure.

The main idea here is to re-use the logic and data structures of the
software MACsec implementation. This allows not to duplicate definitions
and structure storing the same kind of information. It also allows to
use a unified genlink interface for both MACsec implementations (so that
the same userspace tool, `ip macsec`, is used with the same arguments).
The MACsec offloading support cannot be disabled if an interface
supports it at the moment.

The MACsec configuration is passed to device drivers supporting it
through macsec_hw_offload() which is called from the MACsec genl
helpers. This function calls the macsec ops of PHY and Ethernet
drivers in two steps: a preparation one, and a commit one. The first
step is allowed to fail and should be used to check if a provided
configuration is compatible with the features provided by a MACsec
engine, while the second step is not allowed to fail and should only be
used to enable a given MACsec configuration. Two extra calls are made:
when a virtual MACsec interface is created and when it is deleted, so
that the hardware driver can stay in sync.

The Rx and TX handlers are modified to take in account the special case
were the MACsec transformation happens in the hardware, whether in a PHY
or in a MAC, as the packets seen by the networking stack on both the
physical and MACsec virtual interface are exactly the same. This leads
to some limitations: the hardware and software implementations can't be
used on the same physical interface, as the policies would be impossible
to fulfill (such as strict validation of the frames). Also only a single
virtual MACsec interface can be attached to a physical port supporting
hardware offloading as it would be impossible to guess onto which
interface a given packet should go (for ingress traffic).

Another limitation as of now is that the counters and statistics are not
reported back from the hardware to the software MACsec implementation.
This isn't an issue when using offloaded MACsec transformations, but it
should be added in the future so that the MACsec state can be reported
to the user (which would also improve the debug).

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 drivers/net/macsec.c | 379 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 362 insertions(+), 17 deletions(-)

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 3815cb6e9bf2..74f0e06a9fc2 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -11,11 +11,13 @@
 #include <linux/module.h>
 #include <crypto/aead.h>
 #include <linux/etherdevice.h>
+#include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/refcount.h>
 #include <net/genetlink.h>
 #include <net/sock.h>
 #include <net/gro_cells.h>
+#include <linux/phy.h>
 
 #include <uapi/linux/if_macsec.h>
 
@@ -318,6 +320,44 @@ static void macsec_set_shortlen(struct macsec_eth_header *h, size_t data_len)
 		h->short_length = data_len;
 }
 
+/* Checks if underlying layers implement MACsec offloading functions
+ * and returns a pointer to the MACsec ops struct if any (also updates
+ * the MACsec context device reference if provided).
+ */
+static const struct macsec_ops *macsec_get_ops(struct macsec_dev *dev,
+					       struct macsec_context *ctx)
+{
+	struct phy_device *phydev;
+
+	if (!dev || !dev->real_dev)
+		return NULL;
+
+	/* Check if the PHY device provides MACsec ops */
+	phydev = dev->real_dev->phydev;
+	if (phydev && phydev->macsec_ops) {
+		if (ctx) {
+			memset(ctx, 0, sizeof(*ctx));
+			ctx->phydev = phydev;
+			ctx->is_phy = 1;
+		}
+
+		return phydev->macsec_ops;
+	}
+
+	/* Check if the net device provides MACsec ops */
+	if (dev->real_dev->features & NETIF_F_HW_MACSEC &&
+	    dev->real_dev->macsec_ops) {
+		if (ctx) {
+			memset(ctx, 0, sizeof(*ctx));
+			ctx->netdev = dev->real_dev;
+		}
+
+		return dev->real_dev->macsec_ops;
+	}
+
+	return NULL;
+}
+
 /* validate MACsec packet according to IEEE 802.1AE-2006 9.12 */
 static bool macsec_validate_skb(struct sk_buff *skb, u16 icv_len)
 {
@@ -867,8 +907,10 @@ static struct macsec_rx_sc *find_rx_sc_rtnl(struct macsec_secy *secy, sci_t sci)
 	return NULL;
 }
 
-static void handle_not_macsec(struct sk_buff *skb)
+static enum rx_handler_result handle_not_macsec(struct sk_buff *skb)
 {
+	/* Deliver to the uncontrolled port by default */
+	enum rx_handler_result ret = RX_HANDLER_PASS;
 	struct macsec_rxh_data *rxd;
 	struct macsec_dev *macsec;
 
@@ -883,7 +925,8 @@ static void handle_not_macsec(struct sk_buff *skb)
 		struct sk_buff *nskb;
 		struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats);
 
-		if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) {
+		if (!macsec_get_ops(macsec, NULL) &&
+		    macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) {
 			u64_stats_update_begin(&secy_stats->syncp);
 			secy_stats->stats.InPktsNoTag++;
 			u64_stats_update_end(&secy_stats->syncp);
@@ -902,9 +945,17 @@ static void handle_not_macsec(struct sk_buff *skb)
 			secy_stats->stats.InPktsUntagged++;
 			u64_stats_update_end(&secy_stats->syncp);
 		}
+
+		if (netif_running(macsec->secy.netdev) &&
+		    macsec_get_ops(macsec, NULL)) {
+			ret = RX_HANDLER_EXACT;
+			goto out;
+		}
 	}
 
+out:
 	rcu_read_unlock();
+	return ret;
 }
 
 static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb)
@@ -929,12 +980,8 @@ static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb)
 		goto drop_direct;
 
 	hdr = macsec_ethhdr(skb);
-	if (hdr->eth.h_proto != htons(ETH_P_MACSEC)) {
-		handle_not_macsec(skb);
-
-		/* and deliver to the uncontrolled port */
-		return RX_HANDLER_PASS;
-	}
+	if (hdr->eth.h_proto != htons(ETH_P_MACSEC))
+		return handle_not_macsec(skb);
 
 	skb = skb_unshare(skb, GFP_ATOMIC);
 	*pskb = skb;
@@ -1439,6 +1486,40 @@ static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = {
 				 .len = MACSEC_MAX_KEY_LEN, },
 };
 
+/* Offloads an operation to a device driver */
+static int macsec_offload(int (* const func)(struct macsec_context *),
+			  struct macsec_context *ctx)
+{
+	int ret;
+
+	if (unlikely(!func))
+		return 0;
+
+	if (ctx->is_phy)
+		mutex_lock(&ctx->phydev->lock);
+
+	/* Phase I: prepare. The drive should fail here if there are going to be
+	 * issues in the commit phase.
+	 */
+	ctx->prepare = true;
+	ret = (*func)(ctx);
+	if (ret)
+		goto phy_unlock;
+
+	/* Phase II: commit. This step cannot fail. */
+	ctx->prepare = false;
+	ret = (*func)(ctx);
+	/* This should never happen: commit is not allowed to fail */
+	if (unlikely(ret))
+		WARN(1, "MACsec offloading commit failed (%d)\n", ret);
+
+phy_unlock:
+	if (ctx->is_phy)
+		mutex_unlock(&ctx->phydev->lock);
+
+	return ret;
+}
+
 static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa)
 {
 	if (!attrs[MACSEC_ATTR_SA_CONFIG])
@@ -1490,11 +1571,14 @@ static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info)
 	struct net_device *dev;
 	struct nlattr **attrs = info->attrs;
 	struct macsec_secy *secy;
-	struct macsec_rx_sc *rx_sc;
+	struct macsec_rx_sc *rx_sc, *prev_sc;
 	struct macsec_rx_sa *rx_sa;
+	const struct macsec_ops *ops;
+	struct macsec_context ctx;
 	unsigned char assoc_num;
 	struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
 	struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
+	bool was_active;
 	int err;
 
 	if (!attrs[MACSEC_ATTR_IFINDEX])
@@ -1551,11 +1635,32 @@ static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info)
 		spin_unlock_bh(&rx_sa->lock);
 	}
 
+	was_active = rx_sa->active;
 	if (tb_sa[MACSEC_SA_ATTR_ACTIVE])
 		rx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]);
 
-	nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN);
+	prev_sc = rx_sa->sc;
 	rx_sa->sc = rx_sc;
+
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.sa.assoc_num = assoc_num;
+		ctx.sa.rx_sa = rx_sa;
+		memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]),
+		       MACSEC_KEYID_LEN);
+
+		err = macsec_offload(ops->mdo_add_rxsa, &ctx);
+		if (err) {
+			rx_sa->active = was_active;
+			rx_sa->sc = prev_sc;
+			kfree(rx_sa);
+			rtnl_unlock();
+			return err;
+		}
+	}
+
+	nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN);
 	rcu_assign_pointer(rx_sc->sa[assoc_num], rx_sa);
 
 	rtnl_unlock();
@@ -1583,6 +1688,10 @@ static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info)
 	struct nlattr **attrs = info->attrs;
 	struct macsec_rx_sc *rx_sc;
 	struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
+	const struct macsec_ops *ops;
+	struct macsec_context ctx;
+	bool was_active;
+	int ret;
 
 	if (!attrs[MACSEC_ATTR_IFINDEX])
 		return -EINVAL;
@@ -1608,9 +1717,22 @@ static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info)
 		return PTR_ERR(rx_sc);
 	}
 
+	was_active = rx_sc->active;
 	if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE])
 		rx_sc->active = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]);
 
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.rx_sc = rx_sc;
+
+		ret = macsec_offload(ops->mdo_add_rxsc, &ctx);
+		if (ret) {
+			rx_sc->active = was_active;
+			rtnl_unlock();
+			return ret;
+		}
+	}
+
 	rtnl_unlock();
 
 	return 0;
@@ -1648,8 +1770,12 @@ static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info)
 	struct macsec_secy *secy;
 	struct macsec_tx_sc *tx_sc;
 	struct macsec_tx_sa *tx_sa;
+	const struct macsec_ops *ops;
+	struct macsec_context ctx;
 	unsigned char assoc_num;
 	struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
+	bool was_operational, was_active;
+	u32 prev_pn;
 	int err;
 
 	if (!attrs[MACSEC_ATTR_IFINDEX])
@@ -1700,18 +1826,42 @@ static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info)
 		return err;
 	}
 
-	nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN);
-
 	spin_lock_bh(&tx_sa->lock);
+	prev_pn = tx_sa->next_pn;
 	tx_sa->next_pn = nla_get_u32(tb_sa[MACSEC_SA_ATTR_PN]);
 	spin_unlock_bh(&tx_sa->lock);
 
+	was_active = tx_sa->active;
 	if (tb_sa[MACSEC_SA_ATTR_ACTIVE])
 		tx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]);
 
+	was_operational = secy->operational;
 	if (assoc_num == tx_sc->encoding_sa && tx_sa->active)
 		secy->operational = true;
 
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.sa.assoc_num = assoc_num;
+		ctx.sa.tx_sa = tx_sa;
+		memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]),
+		       MACSEC_KEYID_LEN);
+
+		err = macsec_offload(ops->mdo_add_txsa, &ctx);
+		if (err) {
+			spin_lock_bh(&tx_sa->lock);
+			tx_sa->next_pn = prev_pn;
+			spin_unlock_bh(&tx_sa->lock);
+
+			tx_sa->active = was_active;
+			secy->operational = was_operational;
+			kfree(tx_sa);
+			rtnl_unlock();
+			return err;
+		}
+	}
+
+	nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN);
 	rcu_assign_pointer(tx_sc->sa[assoc_num], tx_sa);
 
 	rtnl_unlock();
@@ -1726,9 +1876,12 @@ static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info)
 	struct macsec_secy *secy;
 	struct macsec_rx_sc *rx_sc;
 	struct macsec_rx_sa *rx_sa;
+	const struct macsec_ops *ops;
+	struct macsec_context ctx;
 	u8 assoc_num;
 	struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
 	struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
+	int ret;
 
 	if (!attrs[MACSEC_ATTR_IFINDEX])
 		return -EINVAL;
@@ -1752,6 +1905,19 @@ static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info)
 		return -EBUSY;
 	}
 
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.sa.assoc_num = assoc_num;
+		ctx.sa.rx_sa = rx_sa;
+
+		ret = macsec_offload(ops->mdo_del_rxsa, &ctx);
+		if (ret) {
+			rtnl_unlock();
+			return ret;
+		}
+	}
+
 	RCU_INIT_POINTER(rx_sc->sa[assoc_num], NULL);
 	clear_rx_sa(rx_sa);
 
@@ -1766,8 +1932,11 @@ static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info)
 	struct net_device *dev;
 	struct macsec_secy *secy;
 	struct macsec_rx_sc *rx_sc;
+	const struct macsec_ops *ops;
+	struct macsec_context ctx;
 	sci_t sci;
 	struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
+	int ret;
 
 	if (!attrs[MACSEC_ATTR_IFINDEX])
 		return -EINVAL;
@@ -1794,6 +1963,17 @@ static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info)
 		return -ENODEV;
 	}
 
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.rx_sc = rx_sc;
+		ret = macsec_offload(ops->mdo_del_rxsc, &ctx);
+		if (ret) {
+			rtnl_unlock();
+			return ret;
+		}
+	}
+
 	free_rx_sc(rx_sc);
 	rtnl_unlock();
 
@@ -1807,8 +1987,11 @@ static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info)
 	struct macsec_secy *secy;
 	struct macsec_tx_sc *tx_sc;
 	struct macsec_tx_sa *tx_sa;
+	const struct macsec_ops *ops;
+	struct macsec_context ctx;
 	u8 assoc_num;
 	struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
+	int ret;
 
 	if (!attrs[MACSEC_ATTR_IFINDEX])
 		return -EINVAL;
@@ -1829,6 +2012,19 @@ static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info)
 		return -EBUSY;
 	}
 
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.sa.assoc_num = assoc_num;
+		ctx.sa.tx_sa = tx_sa;
+
+		ret = macsec_offload(ops->mdo_del_txsa, &ctx);
+		if (ret) {
+			rtnl_unlock();
+			return ret;
+		}
+	}
+
 	RCU_INIT_POINTER(tx_sc->sa[assoc_num], NULL);
 	clear_tx_sa(tx_sa);
 
@@ -1865,8 +2061,13 @@ static int macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info)
 	struct macsec_secy *secy;
 	struct macsec_tx_sc *tx_sc;
 	struct macsec_tx_sa *tx_sa;
+	const struct macsec_ops *ops;
+	struct macsec_context ctx;
 	u8 assoc_num;
 	struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
+	bool was_operational, was_active;
+	u32 prev_pn = 0;
+	int ret = 0;
 
 	if (!attrs[MACSEC_ATTR_IFINDEX])
 		return -EINVAL;
@@ -1887,19 +2088,41 @@ static int macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info)
 
 	if (tb_sa[MACSEC_SA_ATTR_PN]) {
 		spin_lock_bh(&tx_sa->lock);
+		prev_pn = tx_sa->next_pn;
 		tx_sa->next_pn = nla_get_u32(tb_sa[MACSEC_SA_ATTR_PN]);
 		spin_unlock_bh(&tx_sa->lock);
 	}
 
+	was_active = tx_sa->active;
 	if (tb_sa[MACSEC_SA_ATTR_ACTIVE])
 		tx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]);
 
+	was_operational = secy->operational;
 	if (assoc_num == tx_sc->encoding_sa)
 		secy->operational = tx_sa->active;
 
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.sa.assoc_num = assoc_num;
+		ctx.sa.tx_sa = tx_sa;
+
+		ret = macsec_offload(ops->mdo_upd_txsa, &ctx);
+		if (ret) {
+			if (tb_sa[MACSEC_SA_ATTR_PN]) {
+				spin_lock_bh(&tx_sa->lock);
+				tx_sa->next_pn = prev_pn;
+				spin_unlock_bh(&tx_sa->lock);
+			}
+
+			tx_sa->active = was_active;
+			secy->operational = was_operational;
+		}
+	}
+
 	rtnl_unlock();
 
-	return 0;
+	return ret;
 }
 
 static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info)
@@ -1909,9 +2132,14 @@ static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info)
 	struct macsec_secy *secy;
 	struct macsec_rx_sc *rx_sc;
 	struct macsec_rx_sa *rx_sa;
+	const struct macsec_ops *ops;
+	struct macsec_context ctx;
 	u8 assoc_num;
 	struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
 	struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1];
+	bool was_active;
+	u32 prev_pn = 0;
+	int ret = 0;
 
 	if (!attrs[MACSEC_ATTR_IFINDEX])
 		return -EINVAL;
@@ -1935,15 +2163,35 @@ static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info)
 
 	if (tb_sa[MACSEC_SA_ATTR_PN]) {
 		spin_lock_bh(&rx_sa->lock);
+		prev_pn = rx_sa->next_pn;
 		rx_sa->next_pn = nla_get_u32(tb_sa[MACSEC_SA_ATTR_PN]);
 		spin_unlock_bh(&rx_sa->lock);
 	}
 
+	was_active = rx_sa->active;
 	if (tb_sa[MACSEC_SA_ATTR_ACTIVE])
 		rx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]);
 
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.sa.assoc_num = assoc_num;
+		ctx.sa.rx_sa = rx_sa;
+
+		ret = macsec_offload(ops->mdo_upd_rxsa, &ctx);
+		if (ret) {
+			if (tb_sa[MACSEC_SA_ATTR_PN]) {
+				spin_lock_bh(&rx_sa->lock);
+				rx_sa->next_pn = prev_pn;
+				spin_unlock_bh(&rx_sa->lock);
+			}
+
+			rx_sa->active = was_active;
+		}
+	}
+
 	rtnl_unlock();
-	return 0;
+	return ret;
 }
 
 static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info)
@@ -1953,6 +2201,11 @@ static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info)
 	struct macsec_secy *secy;
 	struct macsec_rx_sc *rx_sc;
 	struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1];
+	const struct macsec_ops *ops;
+	struct macsec_context ctx;
+	unsigned int prev_n_rx_sc;
+	bool was_active;
+	int ret;
 
 	if (!attrs[MACSEC_ATTR_IFINDEX])
 		return -EINVAL;
@@ -1970,6 +2223,8 @@ static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info)
 		return PTR_ERR(rx_sc);
 	}
 
+	was_active = rx_sc->active;
+	prev_n_rx_sc = secy->n_rx_sc;
 	if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) {
 		bool new = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]);
 
@@ -1979,6 +2234,19 @@ static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info)
 		rx_sc->active = new;
 	}
 
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.rx_sc = rx_sc;
+
+		ret = macsec_offload(ops->mdo_upd_rxsc, &ctx);
+		if (ret) {
+			secy->n_rx_sc = prev_n_rx_sc;
+			rx_sc->active = was_active;
+			rtnl_unlock();
+			return 0;
+		}
+	}
+
 	rtnl_unlock();
 
 	return 0;
@@ -2546,11 +2814,15 @@ static netdev_tx_t macsec_start_xmit(struct sk_buff *skb,
 {
 	struct macsec_dev *macsec = netdev_priv(dev);
 	struct macsec_secy *secy = &macsec->secy;
+	struct macsec_tx_sc *tx_sc = &secy->tx_sc;
 	struct pcpu_secy_stats *secy_stats;
+	struct macsec_tx_sa *tx_sa;
 	int ret, len;
 
+	tx_sa = macsec_txsa_get(tx_sc->sa[tx_sc->encoding_sa]);
+
 	/* 10.5 */
-	if (!secy->protect_frames) {
+	if (!secy->protect_frames || macsec_get_ops(netdev_priv(dev), NULL)) {
 		secy_stats = this_cpu_ptr(macsec->stats);
 		u64_stats_update_begin(&secy_stats->syncp);
 		secy_stats->stats.OutPktsUntagged++;
@@ -2645,6 +2917,8 @@ static int macsec_dev_open(struct net_device *dev)
 {
 	struct macsec_dev *macsec = macsec_priv(dev);
 	struct net_device *real_dev = macsec->real_dev;
+	const struct macsec_ops *ops;
+	struct macsec_context ctx;
 	int err;
 
 	err = dev_uc_add(real_dev, dev->dev_addr);
@@ -2663,6 +2937,14 @@ static int macsec_dev_open(struct net_device *dev)
 			goto clear_allmulti;
 	}
 
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		err = macsec_offload(ops->mdo_dev_open, &ctx);
+		if (err)
+			goto clear_allmulti;
+	}
+
 	if (netif_carrier_ok(real_dev))
 		netif_carrier_on(dev);
 
@@ -2680,9 +2962,16 @@ static int macsec_dev_stop(struct net_device *dev)
 {
 	struct macsec_dev *macsec = macsec_priv(dev);
 	struct net_device *real_dev = macsec->real_dev;
+	const struct macsec_ops *ops;
+	struct macsec_context ctx;
 
 	netif_carrier_off(dev);
 
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops)
+		macsec_offload(ops->mdo_dev_stop, &ctx);
+
 	dev_mc_unsync(real_dev, dev);
 	dev_uc_unsync(real_dev, dev);
 
@@ -2922,6 +3211,11 @@ static int macsec_changelink(struct net_device *dev, struct nlattr *tb[],
 			     struct nlattr *data[],
 			     struct netlink_ext_ack *extack)
 {
+	struct macsec_dev *macsec = macsec_priv(dev);
+	struct macsec_context ctx;
+	const struct macsec_ops *ops;
+	int ret;
+
 	if (!data)
 		return 0;
 
@@ -2931,7 +3225,18 @@ static int macsec_changelink(struct net_device *dev, struct nlattr *tb[],
 	    data[IFLA_MACSEC_PORT])
 		return -EINVAL;
 
-	return macsec_changelink_common(dev, data);
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.secy = &macsec->secy;
+		return macsec_offload(ops->mdo_upd_secy, &ctx);
+	}
+
+	ret = macsec_changelink_common(dev, data);
+	if (ret)
+		return ret;
+
+	return 0;
 }
 
 static void macsec_del_dev(struct macsec_dev *macsec)
@@ -2973,6 +3278,15 @@ static void macsec_dellink(struct net_device *dev, struct list_head *head)
 	struct macsec_dev *macsec = macsec_priv(dev);
 	struct net_device *real_dev = macsec->real_dev;
 	struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev);
+	struct macsec_context ctx;
+	const struct macsec_ops *ops;
+
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.secy = &macsec->secy;
+		macsec_offload(ops->mdo_del_secy, &ctx);
+	}
 
 	macsec_common_dellink(dev, head);
 
@@ -3069,7 +3383,10 @@ static int macsec_newlink(struct net *net, struct net_device *dev,
 			  struct netlink_ext_ack *extack)
 {
 	struct macsec_dev *macsec = macsec_priv(dev);
-	struct net_device *real_dev;
+	struct net_device *real_dev, *loop_dev;
+	struct macsec_context ctx;
+	const struct macsec_ops *ops;
+	struct net *loop_net;
 	int err;
 	sci_t sci;
 	u8 icv_len = DEFAULT_ICV_LEN;
@@ -3081,6 +3398,25 @@ static int macsec_newlink(struct net *net, struct net_device *dev,
 	if (!real_dev)
 		return -ENODEV;
 
+	for_each_net(loop_net) {
+		for_each_netdev(loop_net, loop_dev) {
+			struct macsec_dev *priv;
+
+			if (!netif_is_macsec(loop_dev))
+				continue;
+
+			priv = macsec_priv(loop_dev);
+
+			/* A limitation of the MACsec h/w offloading is only a
+			 * single MACsec interface can be created for a given
+			 * real interface.
+			 */
+			if (macsec_get_ops(netdev_priv(dev), NULL) &&
+			    priv->real_dev == real_dev)
+				return -EBUSY;
+		}
+	}
+
 	dev->priv_flags |= IFF_MACSEC;
 
 	macsec->real_dev = real_dev;
@@ -3134,6 +3470,15 @@ static int macsec_newlink(struct net *net, struct net_device *dev,
 			goto del_dev;
 	}
 
+	/* If h/w offloading is available, propagate to the device */
+	ops = macsec_get_ops(netdev_priv(dev), &ctx);
+	if (ops) {
+		ctx.secy = &macsec->secy;
+		err = macsec_offload(ops->mdo_add_secy, &ctx);
+		if (err)
+			goto del_dev;
+	}
+
 	err = register_macsec_dev(real_dev, dev);
 	if (err < 0)
 		goto del_dev;
-- 
2.21.0


^ permalink raw reply related

* [PATCH net-next v2 3/9] net: macsec: introduce the macsec_context structure
From: Antoine Tenart @ 2019-08-08 14:05 UTC (permalink / raw)
  To: davem, sd, andrew, f.fainelli, hkallweit1
  Cc: Antoine Tenart, netdev, linux-kernel, thomas.petazzoni,
	alexandre.belloni, allan.nielsen, camelia.groza, Simon.Edelhaus
In-Reply-To: <20190808140600.21477-1-antoine.tenart@bootlin.com>

This patch introduces the macsec_context structure. It will be used
in the kernel to exchange information between the common MACsec
implementation (macsec.c) and the MACsec hardware offloading
implementations. This structure contains pointers to MACsec specific
structures which contain the actual MACsec configuration, and to the
underlying device (netdev or phydev).

Signed-off-by: Antoine Tenart <antoine.tenart@bootlin.com>
---
 include/net/macsec.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/include/net/macsec.h b/include/net/macsec.h
index 5db18a272ffd..1c82026dc17e 100644
--- a/include/net/macsec.h
+++ b/include/net/macsec.h
@@ -176,4 +176,28 @@ struct macsec_secy {
 	struct macsec_rx_sc __rcu *rx_sc;
 };
 
+/**
+ * struct macsec_context - MACsec context for hardware offloading
+ */
+struct macsec_context {
+	union {
+		struct net_device *netdev;
+		struct phy_device *phydev;
+	};
+
+	const struct macsec_secy *secy;
+	const struct macsec_rx_sc *rx_sc;
+	struct {
+		unsigned char assoc_num;
+		u8 key[MACSEC_KEYID_LEN];
+		union {
+			const struct macsec_rx_sa *rx_sa;
+			const struct macsec_tx_sa *tx_sa;
+		};
+	} sa;
+
+	u8 prepare:1;
+	u8 is_phy:1;
+};
+
 #endif /* _NET_MACSEC_H_ */
-- 
2.21.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox