Netdev List

Netdev List
 help / color / mirror / Atom feed

* [RFCv2 bpf-next 1/7] bpf: xfrm: Add bpf_xdp_get_xfrm_state() kfunc
From: Daniel Xu @ 2023-11-01 21:57 UTC (permalink / raw)
  To: kuba, hawk, edumazet, steffen.klassert, daniel, Herbert Xu, ast,
	john.fastabend, pabeni, davem, antony.antony
  Cc: linux-kernel, netdev, bpf, devel
In-Reply-To: <cover.1698875025.git.dxu@dxuuu.xyz>

This commit adds an unstable kfunc helper to access internal xfrm_state
associated with an SA. This is intended to be used for the upcoming
IPsec pcpu work to assign special pcpu SAs to a particular CPU. In other
words: for custom software RSS.

That being said, the function that this kfunc wraps is fairly generic
and used for a lot of xfrm tasks. I'm sure people will find uses
elsewhere over time.

Co-developed-by: Antony Antony <antony.antony@secunet.com>
Signed-off-by: Antony Antony <antony.antony@secunet.com>
Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 include/net/xfrm.h        |   9 ++++
 net/xfrm/Makefile         |   1 +
 net/xfrm/xfrm_policy.c    |   2 +
 net/xfrm/xfrm_state_bpf.c | 105 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 117 insertions(+)
 create mode 100644 net/xfrm/xfrm_state_bpf.c

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index c9bb0f892f55..1d107241b901 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -2190,4 +2190,13 @@ static inline int register_xfrm_interface_bpf(void)
 
 #endif
 
+#if IS_ENABLED(CONFIG_DEBUG_INFO_BTF)
+int register_xfrm_state_bpf(void);
+#else
+static inline int register_xfrm_state_bpf(void)
+{
+	return 0;
+}
+#endif
+
 #endif	/* _NET_XFRM_H */
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index cd47f88921f5..547cec77ba03 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -21,3 +21,4 @@ obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o
 obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
 obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
 obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
+obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index c13dc3ef7910..1b7e75159727 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -4218,6 +4218,8 @@ void __init xfrm_init(void)
 #ifdef CONFIG_XFRM_ESPINTCP
 	espintcp_init();
 #endif
+
+	register_xfrm_state_bpf();
 }
 
 #ifdef CONFIG_AUDITSYSCALL
diff --git a/net/xfrm/xfrm_state_bpf.c b/net/xfrm/xfrm_state_bpf.c
new file mode 100644
index 000000000000..4aaac134b97a
--- /dev/null
+++ b/net/xfrm/xfrm_state_bpf.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Unstable XFRM state BPF helpers.
+ *
+ * Note that it is allowed to break compatibility for these functions since the
+ * interface they are exposed through to BPF programs is explicitly unstable.
+ */
+
+#include <linux/bpf.h>
+#include <linux/btf_ids.h>
+#include <net/xdp.h>
+#include <net/xfrm.h>
+
+/* bpf_xfrm_state_opts - Options for XFRM state lookup helpers
+ *
+ * Members:
+ * @error      - Out parameter, set for any errors encountered
+ *		 Values:
+ *		   -EINVAL - netns_id is less than -1
+ *		   -EINVAL - Passed NULL for opts
+ *		   -EINVAL - opts__sz isn't BPF_XFRM_STATE_OPTS_SZ
+ *		   -ENONET - No network namespace found for netns_id
+ * @netns_id	- Specify the network namespace for lookup
+ *		 Values:
+ *		   BPF_F_CURRENT_NETNS (-1)
+ *		     Use namespace associated with ctx
+ *		   [0, S32_MAX]
+ *		     Network Namespace ID
+ * @mark	- XFRM mark to match on
+ * @daddr	- Destination address to match on
+ * @spi		- Security parameter index to match on
+ * @proto	- L3 protocol to match on
+ * @family	- L3 protocol family to match on
+ */
+struct bpf_xfrm_state_opts {
+	s32 error;
+	s32 netns_id;
+	u32 mark;
+	xfrm_address_t daddr;
+	__be32 spi;
+	u8 proto;
+	u16 family;
+};
+
+enum {
+	BPF_XFRM_STATE_OPTS_SZ = sizeof(struct bpf_xfrm_state_opts),
+};
+
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+		  "Global functions as their definitions will be in xfrm_state BTF");
+
+/* bpf_xdp_get_xfrm_state - Get XFRM state
+ *
+ * Parameters:
+ * @ctx 	- Pointer to ctx (xdp_md) in XDP program
+ *		    Cannot be NULL
+ * @opts	- Options for lookup (documented above)
+ *		    Cannot be NULL
+ * @opts__sz	- Length of the bpf_xfrm_state_opts structure
+ *		    Must be BPF_XFRM_STATE_OPTS_SZ
+ */
+__bpf_kfunc struct xfrm_state *
+bpf_xdp_get_xfrm_state(struct xdp_md *ctx, struct bpf_xfrm_state_opts *opts, u32 opts__sz)
+{
+	struct xdp_buff *xdp = (struct xdp_buff *)ctx;
+	struct net *net = dev_net(xdp->rxq->dev);
+
+	if (!opts || opts__sz != BPF_XFRM_STATE_OPTS_SZ) {
+		opts->error = -EINVAL;
+		return NULL;
+	}
+
+	if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) {
+		opts->error = -EINVAL;
+		return NULL;
+	}
+
+	if (opts->netns_id >= 0) {
+		net = get_net_ns_by_id(net, opts->netns_id);
+		if (unlikely(!net)) {
+			opts->error = -ENONET;
+			return NULL;
+		}
+	}
+
+	return xfrm_state_lookup(net, opts->mark, &opts->daddr, opts->spi,
+				 opts->proto, opts->family);
+}
+
+__diag_pop()
+
+BTF_SET8_START(xfrm_state_kfunc_set)
+BTF_ID_FLAGS(func, bpf_xdp_get_xfrm_state, KF_RET_NULL | KF_ACQUIRE)
+BTF_SET8_END(xfrm_state_kfunc_set)
+
+static const struct btf_kfunc_id_set xfrm_state_xdp_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set   = &xfrm_state_kfunc_set,
+};
+
+int __init register_xfrm_state_bpf(void)
+{
+	return register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP,
+					 &xfrm_state_xdp_kfunc_set);
+}
-- 
2.42.0


^ permalink raw reply related

* [RFCv2 bpf-next 0/7] Add bpf_xdp_get_xfrm_state() kfunc
From: Daniel Xu @ 2023-11-01 21:57 UTC (permalink / raw)
  To: linux-kselftest, netdev, linux-kernel, bpf, steffen.klassert,
	antony.antony
  Cc: devel

This patchset adds two kfunc helpers, bpf_xdp_get_xfrm_state() and
bpf_xdp_xfrm_state_release() that wrap xfrm_state_lookup() and
xfrm_state_put(). The intent is to support software RSS (via XDP) for
the ongoing/upcoming ipsec pcpu work [0]. Recent experiments performed
on (hopefully) reproducible AWS testbeds indicate that single tunnel
pcpu ipsec can reach line rate on 100G ENA nics.

Note this patchset only tests/shows generic xfrm_state access. The
"secret sauce" (if you can really even call it that) involves accessing
a soon-to-be-upstreamed pcpu_num field in xfrm_state. Early example is
available here [1].

[0]: https://datatracker.ietf.org/doc/html/draft-ietf-ipsecme-multi-sa-performance-02
[1]: https://github.com/danobi/xdp-tools/blob/e89a1c617aba3b50d990f779357d6ce2863ecb27/xdp-bench/xdp_redirect_cpumap.bpf.c#L385-L406

Changes from RFCv1:
* Add Antony's commit tags
* Add KF_ACQUIRE and KF_RELEASE semantics

Daniel Xu (7):
  bpf: xfrm: Add bpf_xdp_get_xfrm_state() kfunc
  bpf: xfrm: Add bpf_xdp_xfrm_state_release() kfunc
  bpf: selftests: test_tunnel: Use ping -6 over ping6
  bpf: selftests: test_tunnel: Mount bpffs if necessary
  bpf: selftests: test_tunnel: Use vmlinux.h declarations
  bpf: selftests: test_tunnel: Disable CO-RE relocations
  bpf: xfrm: Add selftest for bpf_xdp_get_xfrm_state()

 include/net/xfrm.h                            |   9 ++
 net/xfrm/Makefile                             |   1 +
 net/xfrm/xfrm_policy.c                        |   2 +
 net/xfrm/xfrm_state_bpf.c                     | 121 ++++++++++++++++++
 .../selftests/bpf/progs/bpf_tracing_net.h     |   1 +
 .../selftests/bpf/progs/test_tunnel_kern.c    |  98 ++++++++------
 tools/testing/selftests/bpf/test_tunnel.sh    |  43 +++++--
 7 files changed, 221 insertions(+), 54 deletions(-)
 create mode 100644 net/xfrm/xfrm_state_bpf.c

-- 
2.42.0


^ permalink raw reply

* Re: [PATCH] rxrpc_find_service_conn_rcu: use read_seqbegin() rather than read_seqbegin_or_lock()
From: Oleg Nesterov @ 2023-11-01 21:52 UTC (permalink / raw)
  To: Al Viro
  Cc: David Howells, Marc Dionne, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Chuck Lever, linux-afs, netdev,
	linux-kernel
In-Reply-To: <20231101205238.GI1957730@ZenIV>

On 11/01, Al Viro wrote:
>
> On Wed, Nov 01, 2023 at 09:23:03PM +0100, Oleg Nesterov wrote:
>
> > Yes this is confusing. Again, even the documentation is wrong! That is why
> > I am trying to remove the misuse of read_seqbegin_or_lock(), then I am going
> > to change the semantics of need_seqretry() to enforce the locking on the 2nd
> > pass.
>
> What for?  Sure, documentation needs to be fixed,

So do you agree that the current usage of read_seqbegin_or_lock() in
rxrpc_find_service_conn_rcu() is misleading ? Do you agree it can use
read_seqbegin/read_seqretry without changing the current behaviour?

> but *not* in direction you
> suggested in that patch.

Hmm. then how do you think the doc should be changed? To describe the
current behaviour.

> Why would you want to force that "switch to locked on the second pass" policy
> on every possible caller?

Because this is what (I think) read_seqbegin_or_lock() is supposed to do.
It should take the lock for writing if the lockless access failed. At least
according to the documentation.

This needs another discussion and perhaps this makes no sense. But I'd
like to turn need_seqretry(seq) into something like

	static inline int need_seqretry(seqlock_t *lock, int *seq)
	{
		int ret = !(*seq & 1) && read_seqretry(lock, *seq);

		if (ret)
			*seq = 1; /* make this counter odd */

		return ret;
	}

and update the users which actually want read_seqlock_excl() on the 2nd pass.
thread_group_cputime(), fs/d_path.c and fs/dcache.c.

For example, __dentry_path()

	--- a/fs/d_path.c
	+++ b/fs/d_path.c
	@@ -349,10 +349,9 @@ static char *__dentry_path(const struct dentry *d, struct prepend_buffer *p)
		}
		if (!(seq & 1))
			rcu_read_unlock();
	-	if (need_seqretry(&rename_lock, seq)) {
	-		seq = 1;
	+	if (need_seqretry(&rename_lock, &seq))
			goto restart;
	-	}
	+
		done_seqretry(&rename_lock, seq);
		if (b.len == p->len)
			prepend_char(&b, '/');

but again, this need another discussion.

Oleg.

^ permalink raw reply

* Re: [PATCH bpf-next v3 1/2] bpf: add skcipher API support to TC/XDP programs
From: Martin KaFai Lau @ 2023-11-01 21:49 UTC (permalink / raw)
  To: Vadim Fedorenko
  Cc: bpf, netdev, linux-crypto, Vadim Fedorenko, Jakub Kicinski,
	Andrii Nakryiko, Alexei Starovoitov, Mykola Lysenko
In-Reply-To: <20231031134900.1432945-1-vadfed@meta.com>

On 10/31/23 6:48 AM, Vadim Fedorenko wrote:
> --- /dev/null
> +++ b/kernel/bpf/crypto.c
> @@ -0,0 +1,280 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright (c) 2023 Meta, Inc */
> +#include <linux/bpf.h>
> +#include <linux/bpf_mem_alloc.h>
> +#include <linux/btf.h>
> +#include <linux/btf_ids.h>
> +#include <linux/filter.h>
> +#include <linux/scatterlist.h>
> +#include <linux/skbuff.h>
> +#include <crypto/skcipher.h>
> +
> +/**
> + * struct bpf_crypto_skcipher_ctx - refcounted BPF sync skcipher context structure
> + * @tfm:	The pointer to crypto_sync_skcipher struct.
> + * @rcu:	The RCU head used to free the crypto context with RCU safety.
> + * @usage:	Object reference counter. When the refcount goes to 0, the
> + *		memory is released back to the BPF allocator, which provides
> + *		RCU safety.
> + */
> +struct bpf_crypto_skcipher_ctx {
> +	struct crypto_sync_skcipher *tfm;
> +	struct rcu_head rcu;
> +	refcount_t usage;
> +};
> +
> +__diag_push();
> +__diag_ignore_all("-Wmissing-prototypes",
> +		  "Global kfuncs as their definitions will be in BTF");
> +
> +static void *__bpf_dynptr_data_ptr(const struct bpf_dynptr_kern *ptr)
> +{
> +	enum bpf_dynptr_type type;
> +
> +	if (!ptr->data)
> +		return NULL;
> +
> +	type = bpf_dynptr_get_type(ptr);
> +
> +	switch (type) {
> +	case BPF_DYNPTR_TYPE_LOCAL:
> +	case BPF_DYNPTR_TYPE_RINGBUF:
> +		return ptr->data + ptr->offset;
> +	case BPF_DYNPTR_TYPE_SKB:
> +		return skb_pointer_if_linear(ptr->data, ptr->offset, __bpf_dynptr_size(ptr));
> +	case BPF_DYNPTR_TYPE_XDP:
> +	{
> +		void *xdp_ptr = bpf_xdp_pointer(ptr->data, ptr->offset, __bpf_dynptr_size(ptr));

I suspect what it is doing here (for skb and xdp in particular) is very similar 
to bpf_dynptr_slice. Please check if bpf_dynptr_slice(ptr, 0, NULL, sz) will work.


> +		if (!IS_ERR_OR_NULL(xdp_ptr))
> +			return xdp_ptr;
> +
> +		return NULL;
> +	}
> +	default:
> +		WARN_ONCE(true, "unknown dynptr type %d\n", type);
> +		return NULL;
> +	}
> +}
> +
> +/**
> + * bpf_crypto_skcipher_ctx_create() - Create a mutable BPF crypto context.
> + *
> + * Allocates a crypto context that can be used, acquired, and released by
> + * a BPF program. The crypto context returned by this function must either
> + * be embedded in a map as a kptr, or freed with bpf_crypto_skcipher_ctx_release().
> + *
> + * bpf_crypto_skcipher_ctx_create() allocates memory using the BPF memory
> + * allocator, and will not block. It may return NULL if no memory is available.
> + * @algo: bpf_dynptr which holds string representation of algorithm.
> + * @key:  bpf_dynptr which holds cipher key to do crypto.
> + */
> +__bpf_kfunc struct bpf_crypto_skcipher_ctx *
> +bpf_crypto_skcipher_ctx_create(const struct bpf_dynptr_kern *palgo,

Song's patch on __const_str should help on the palgo (which is a string) also:
https://lore.kernel.org/bpf/20231024235551.2769174-4-song@kernel.org/

> +			       const struct bpf_dynptr_kern *pkey, int *err)
> +{
> +	struct bpf_crypto_skcipher_ctx *ctx;
> +	char *algo;
> +
> +	if (__bpf_dynptr_size(palgo) > CRYPTO_MAX_ALG_NAME) {
> +		*err = -EINVAL;
> +		return NULL;
> +	}
> +
> +	algo = __bpf_dynptr_data_ptr(palgo);
> +
> +	if (!crypto_has_skcipher(algo, CRYPTO_ALG_TYPE_SKCIPHER, CRYPTO_ALG_TYPE_MASK)) {
> +		*err = -EOPNOTSUPP;
> +		return NULL;
> +	}
> +
> +	ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
> +	if (!ctx) {
> +		*err = -ENOMEM;
> +		return NULL;
> +	}
> +
> +	memset(ctx, 0, sizeof(*ctx));

nit. kzalloc()

> +
> +	ctx->tfm = crypto_alloc_sync_skcipher(algo, 0, 0);
> +	if (IS_ERR(ctx->tfm)) {
> +		*err = PTR_ERR(ctx->tfm);
> +		ctx->tfm = NULL;
> +		goto err;
> +	}
> +
> +	*err = crypto_sync_skcipher_setkey(ctx->tfm, __bpf_dynptr_data_ptr(pkey),
> +					   __bpf_dynptr_size(pkey));
> +	if (*err)
> +		goto err;
> +
> +	refcount_set(&ctx->usage, 1);
> +
> +	return ctx;
> +err:
> +	if (ctx->tfm)
> +		crypto_free_sync_skcipher(ctx->tfm);
> +	kfree(ctx);
> +
> +	return NULL;
> +}

[ ... ]

> +static int bpf_crypto_skcipher_crypt(struct crypto_sync_skcipher *tfm,
> +				     const struct bpf_dynptr_kern *src,
> +				     struct bpf_dynptr_kern *dst,
> +				     const struct bpf_dynptr_kern *iv,
> +				     bool decrypt)
> +{
> +	struct skcipher_request *req = NULL;
> +	struct scatterlist sgin, sgout;
> +	int err;
> +
> +	if (crypto_sync_skcipher_get_flags(tfm) & CRYPTO_TFM_NEED_KEY)
> +		return -EINVAL;
> +
> +	if (__bpf_dynptr_is_rdonly(dst))
> +		return -EINVAL;
> +
> +	if (!__bpf_dynptr_size(dst) || !__bpf_dynptr_size(src))
> +		return -EINVAL;
> +
> +	if (__bpf_dynptr_size(iv) != crypto_sync_skcipher_ivsize(tfm))
> +		return -EINVAL;
> +
> +	req = skcipher_request_alloc(&tfm->base, GFP_ATOMIC);

Doing alloc per packet may kill performance. Is it possible to optimize it 
somehow? What is the usual size of the req (e.g. the example in the selftest)?

> +	if (!req)
> +		return -ENOMEM;
> +
> +	sg_init_one(&sgin, __bpf_dynptr_data_ptr(src), __bpf_dynptr_size(src));
> +	sg_init_one(&sgout, __bpf_dynptr_data_ptr(dst), __bpf_dynptr_size(dst));
> +
> +	skcipher_request_set_crypt(req, &sgin, &sgout, __bpf_dynptr_size(src),
> +				   __bpf_dynptr_data_ptr(iv));
> +
> +	err = decrypt ? crypto_skcipher_decrypt(req) : crypto_skcipher_encrypt(req);

I am hitting this with the selftest in patch 2:

[   39.806675] =============================
[   39.807243] WARNING: suspicious RCU usage
[   39.807825] 6.6.0-rc7-02091-g17e023652cc1 #336 Tainted: G           O
[   39.808798] -----------------------------
[   39.809368] kernel/sched/core.c:10149 Illegal context switch in RCU-bh 
read-side critical section!
[   39.810589]
[   39.810589] other info that might help us debug this:
[   39.810589]
[   39.811696]
[   39.811696] rcu_scheduler_active = 2, debug_locks = 1
[   39.812616] 2 locks held by test_progs/1769:
[   39.813249]  #0: ffffffff84dce140 (rcu_read_lock){....}-{1:3}, at: 
ip6_finish_output2+0x625/0x1b70
[   39.814539]  #1: ffffffff84dce1a0 (rcu_read_lock_bh){....}-{1:3}, at: 
__dev_queue_xmit+0x1df/0x2c40
[   39.815813]
[   39.815813] stack backtrace:
[   39.816441] CPU: 1 PID: 1769 Comm: test_progs Tainted: G           O 
6.6.0-rc7-02091-g17e023652cc1 #336
[   39.817774] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
[   39.819312] Call Trace:
[   39.819655]  <TASK>
[   39.819967]  dump_stack_lvl+0x130/0x1d0
[   39.822669]  dump_stack+0x14/0x20
[   39.823136]  lockdep_rcu_suspicious+0x220/0x350
[   39.823777]  __might_resched+0xe0/0x660
[   39.827915]  __might_sleep+0x89/0xf0
[   39.828423]  skcipher_walk_virt+0x55/0x120
[   39.828990]  crypto_ecb_decrypt+0x159/0x310
[   39.833846]  crypto_skcipher_decrypt+0xa0/0xd0
[   39.834481]  bpf_crypto_skcipher_crypt+0x29a/0x3c0
[   39.837100]  bpf_crypto_skcipher_decrypt+0x56/0x70
[   39.837770]  bpf_prog_fa576505ab32d186_decrypt_sanity+0x180/0x185
[   39.838627]  cls_bpf_classify+0x3b6/0xf80
[   39.839807]  tcf_classify+0x2f4/0x550

> +
> +	skcipher_request_free(req);
> +
> +	return err;
> +}
> +



^ permalink raw reply

* Re: [PATCH] net: phy: broadcom: Wire suspend/resume for BCM54612E
From: Marco von Rosenberg @ 2023-11-01 21:42 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: Florian Fainelli, Broadcom internal kernel review list,
	Heiner Kallweit, Russell King, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, netdev, linux-kernel
In-Reply-To: <9cb4f059-edea-4c81-9ee4-e6020cccb8a5@lunn.ch>

On Tuesday, October 31, 2023 1:31:11 AM CET Andrew Lunn wrote:
> Are we talking about a device which as been suspended? The PHY has
> been left running because there is no suspend callback? Something then
> triggers a resume. The bootloader then suspends the active PHY? Linux
> then boots, detects its a resume, so does not touch the hardware
> because there is no resume callback? The suspended PHY is then
> useless.

Hi Andrew,

thanks for your feedback. I guess a bit of context is missing here. The issue 
has nothing to do with an ordinary suspension of the OS. The main point is 
that on initial power-up, the bootloader suspends the PHY before booting 
Linux. With a resume callback defined, Linux would call it on boot and make the 
PHY usable. However, since there is no resume callback defined for this PHY, 
Linux doesn't touch the hardware and thus the PHY is not usable.
So this specific issue is primarily solved by adding the resume callback. The 
suspend callback is just added for completeness.

Does this clarify the issue? If so, I'll adjust the commit message and submit
an updated patch.

	Marco

^ permalink raw reply

* Re: [PATCH] rxrpc_find_service_conn_rcu: use read_seqbegin() rather than read_seqbegin_or_lock()
From: David Howells @ 2023-11-01 21:22 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: dhowells, Marc Dionne, Alexander Viro, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Chuck Lever, linux-afs,
	netdev, linux-kernel
In-Reply-To: <20231101204023.GC32034@redhat.com>

Oleg Nesterov <oleg@redhat.com> wrote:

> Just none of read_seqbegin_or_lock/need_seqretry/done_seqretry
> helpers make any sense in this code.

I disagree.  I think in at least a couple of cases I do want a locked second
path - ideally locked shared if seqlock can be made to use an rwlock instead
of a spinlock.

David


^ permalink raw reply

* Re: [PATCH] rxrpc_find_service_conn_rcu: use read_seqbegin() rather than read_seqbegin_or_lock()
From: David Howells @ 2023-11-01 21:20 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: dhowells, Marc Dionne, Alexander Viro, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Chuck Lever, linux-afs,
	netdev, linux-kernel
In-Reply-To: <20231101202302.GB32034@redhat.com>

Oleg Nesterov <oleg@redhat.com> wrote:

> > 	1	0	need_seqretry() [seq=even; sequence!=seq: retry]
>
> Yes, if CPU_1 races with write_seqlock() need_seqretry() returns true,
>
> > 	1	1	read_seqbegin_or_lock() [exclusive]
>
> No. "seq" is still even, so read_seqbegin_or_lock() won't do read_seqlock_excl(),
> it will do

Yeah, you're right.  I missed the fact that I got in the second example that
read_seqbegin_or_lock() spins until it sees a positive seq.

However, I think just changing all of these to always-lockless isn't
necessarily the most optimal way.  Yes, it will work... eventually.  But the
point is to limit the number of iterations.

So I have the following:

 (1) rxrpc_find_service_conn_rcu().

     I want to move the first part of the reaper to the I/O thread at some
     point, then the locking here can go away entirely.  However, this is
     drivable by external events, so I would prefer to limit the number of
     passes to just two and take a lock on the second pass.  Holding up the
     reaper thread for the moment is fine; holding up the I/O thread is not.

 (2) afs_lookup_volume_rcu().

     There can be a lot of volumes known by a system.  A thousand would
     require a 10-step walk and this is drivable by remote operation, so I
     think this should probably take a lock on the second pass too.

 (3) afs_check_validity().
 (4) afs_get_attr().

     These are both pretty short, so your solution is probably good for them.
     That said, afs_vnode_commit_status() can spend a long time under the
     write lock - and pretty much every file RPC op returns a status update.

 (5) afs_find_server().

     There could be a lot of servers in the list and each server can have
     multiple addresses, so I think this would be better with an exclusive
     second pass.

     The server list isn't likely to change all that often, but when it does
     change, there's a good chance several servers are going to be
     added/removed one after the other.  Further, this is only going to be
     used for incoming cache management/callback requests from the server,
     which hopefully aren't going to happen too often - but it is remotely
     drivable.

 (6) afs_find_server_by_uuid().

     Similarly to (5), there could be a lot of servers to search through, but
     they are in a tree not a flat list, so it should be faster to process.
     Again, it's not likely to change that often and, again, when it does
     change it's likely to involve multiple changes.  This can be driven
     remotely by an incoming cache management request but is mostly going to
     be driven by setting up or reconfiguring a volume's server list -
     something that also isn't likely to happen often.

I wonder if struct seqlock would make more sense with an rwlock rather than a
spinlock.  As it is, it does an exclusive spinlock for the readpath which is
kind of overkill.

David


^ permalink raw reply

* Re: [PATCH 2/2] tg3: Fix the TX ring stall
From: Michael Chan @ 2023-11-01 21:08 UTC (permalink / raw)
  To: alexey.pakhunov
  Cc: siva.kallam, vincent.wong2, netdev, linux-kernel, prashant, mchan
In-Reply-To: <20231101191858.2611154-3-alexey.pakhunov@spacex.com>

[-- Attachment #1: Type: text/plain, Size: 1462 bytes --]

On Wed, Nov 1, 2023 at 12:19 PM <alexey.pakhunov@spacex.com> wrote:
>
> From: Alex Pakhunov <alexey.pakhunov@spacex.com>
>
> The TX ring maintained by the tg3 driver can end up in a state, when it
> has packets queued for sending but the NIC hardware is not informed, so no
> progress is made. This leads to a multi-second interruption in network
> traffic followed by dev_watchdog() firing and resetting the queue.
>
> The specific sequence of steps is:
>
> 1. tg3_start_xmit() is called at least once and queues packet(s) without
>    updating tnapi->prodmbox (netdev_xmit_more() returns true)
> 2. tg3_start_xmit() is called with an SKB which causes tg3_tso_bug() to be
>    called.
> 3. tg3_tso_bug() determines that the SKB is too large, ...
>
>         if (unlikely(tg3_tx_avail(tnapi) <= frag_cnt_est)) {
>
>    ... stops the queue, and returns NETDEV_TX_BUSY:
>
>         netif_tx_stop_queue(txq);
>         ...
>         if (tg3_tx_avail(tnapi) <= frag_cnt_est)
>                 return NETDEV_TX_BUSY;
>
> 4. Since all tg3_tso_bug() call sites directly return, the code updating
>    tnapi->prodmbox is skipped.

Thanks for the patch.  An alternative fix that may be simpler is to
add a goto after calling tg3_tso_bug().  Something like this:

        tg3_tso_bug();
        goto update_tx_mbox;
...

update_tx_mbox:
        if (!netdev_xmit_more() || netif_xmit_stopped())
                tw32_tx_mbox();
...

[-- Attachment #2: S/MIME Cryptographic Signature --]
[-- Type: application/pkcs7-signature, Size: 4209 bytes --]

^ permalink raw reply

* Re: [PATCH] rxrpc_find_service_conn_rcu: use read_seqbegin() rather than read_seqbegin_or_lock()
From: Al Viro @ 2023-11-01 20:52 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: David Howells, Marc Dionne, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Chuck Lever, linux-afs, netdev,
	linux-kernel
In-Reply-To: <20231101202302.GB32034@redhat.com>

On Wed, Nov 01, 2023 at 09:23:03PM +0100, Oleg Nesterov wrote:

> Yes this is confusing. Again, even the documentation is wrong! That is why
> I am trying to remove the misuse of read_seqbegin_or_lock(), then I am going
> to change the semantics of need_seqretry() to enforce the locking on the 2nd
> pass.

What for?  Sure, documentation needs to be fixed, but *not* in direction you
suggested in that patch.

Why would you want to force that "switch to locked on the second pass" policy
on every possible caller?

^ permalink raw reply

* [PATCH bpf-next v9 09/12] bpf, net: switch to dynamic registration
From: thinker.li @ 2023-11-01 20:45 UTC (permalink / raw)
  To: bpf, ast, martin.lau, song, kernel-team, andrii, drosen
  Cc: sinquersw, kuifeng, Kui-Feng Lee, netdev
In-Reply-To: <20231101204519.677870-1-thinker.li@gmail.com>

From: Kui-Feng Lee <thinker.li@gmail.com>

Replace the static list of struct_ops types with per-btf struct_ops_tab to
enable dynamic registration.

Both bpf_dummy_ops and bpf_tcp_ca now utilize the registration function
instead of being listed in bpf_struct_ops_types.h.

Cc: netdev@vger.kernel.org
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
---
 include/linux/bpf.h               | 24 +++++++--
 include/linux/btf.h               |  2 +
 kernel/bpf/bpf_struct_ops.c       | 90 ++++++++++---------------------
 kernel/bpf/bpf_struct_ops_types.h | 12 -----
 kernel/bpf/btf.c                  | 38 +++++++++++--
 net/bpf/bpf_dummy_struct_ops.c    | 14 ++++-
 net/ipv4/bpf_tcp_ca.c             | 16 ++++--
 7 files changed, 107 insertions(+), 89 deletions(-)
 delete mode 100644 kernel/bpf/bpf_struct_ops_types.h

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 2a9bc482d85e..785d53b5b8c7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1643,7 +1643,6 @@ struct bpf_struct_ops_desc {
 #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
 #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
 const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *btf, u32 type_id);
-void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log);
 bool bpf_struct_ops_get(const void *kdata);
 void bpf_struct_ops_put(const void *kdata);
 int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key,
@@ -1689,10 +1688,6 @@ static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(struct btf *
 {
 	return NULL;
 }
-static inline void bpf_struct_ops_init(struct btf *btf,
-				       struct bpf_verifier_log *log)
-{
-}
 static inline bool bpf_try_module_get(const void *data, struct module *owner)
 {
 	return try_module_get(owner);
@@ -3231,6 +3226,8 @@ static inline bool bpf_is_subprog(const struct bpf_prog *prog)
 	return prog->aux->func_idx != 0;
 }
 
+int register_bpf_struct_ops(struct bpf_struct_ops *st_ops);
+
 enum bpf_struct_ops_state {
 	BPF_STRUCT_OPS_STATE_INIT,
 	BPF_STRUCT_OPS_STATE_INUSE,
@@ -3243,4 +3240,21 @@ struct bpf_struct_ops_common_value {
 	enum bpf_struct_ops_state state;
 };
 
+/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
+ * the map's value exposed to the userspace and its btf-type-id is
+ * stored at the map->btf_vmlinux_value_type_id.
+ *
+ */
+#define DEFINE_STRUCT_OPS_VALUE_TYPE(_name)			\
+extern struct bpf_struct_ops bpf_##_name;			\
+								\
+struct bpf_struct_ops_##_name {					\
+	struct bpf_struct_ops_common_value common;		\
+	struct _name data ____cacheline_aligned_in_smp;		\
+}
+
+extern int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
+				    struct btf *btf,
+				    struct bpf_verifier_log *log);
+
 #endif /* _LINUX_BPF_H */
diff --git a/include/linux/btf.h b/include/linux/btf.h
index e613b6b45dc4..91472d41ded7 100644
--- a/include/linux/btf.h
+++ b/include/linux/btf.h
@@ -12,6 +12,8 @@
 #include <uapi/linux/bpf.h>
 
 #define BTF_TYPE_EMIT(type) ((void)(type *)0)
+#define BTF_STRUCT_OPS_TYPE_EMIT(type) {((void)(struct type *)0);	\
+		((void)(struct bpf_struct_ops_##type *)0); }
 #define BTF_TYPE_EMIT_ENUM(enum_val) ((void)enum_val)
 
 /* These need to be macros, as the expressions are used in assembler input */
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 2d853431bf09..caacc251655a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -61,35 +61,6 @@ static DEFINE_MUTEX(update_mutex);
 #define VALUE_PREFIX "bpf_struct_ops_"
 #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1)
 
-/* bpf_struct_ops_##_name (e.g. bpf_struct_ops_tcp_congestion_ops) is
- * the map's value exposed to the userspace and its btf-type-id is
- * stored at the map->btf_vmlinux_value_type_id.
- *
- */
-#define BPF_STRUCT_OPS_TYPE(_name)				\
-extern struct bpf_struct_ops bpf_##_name;			\
-								\
-struct bpf_struct_ops_##_name {					\
-	struct bpf_struct_ops_common_value common;		\
-	struct _name data ____cacheline_aligned_in_smp;		\
-};
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-
-enum {
-#define BPF_STRUCT_OPS_TYPE(_name) BPF_STRUCT_OPS_TYPE_##_name,
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-	__NR_BPF_STRUCT_OPS_TYPE,
-};
-
-static struct bpf_struct_ops_desc bpf_struct_ops[] = {
-#define BPF_STRUCT_OPS_TYPE(_name)				\
-	[BPF_STRUCT_OPS_TYPE_##_name] = { .st_ops = &bpf_##_name },
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-};
-
 const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
 };
 
@@ -144,9 +115,9 @@ static bool is_valid_value_type(struct btf *btf, s32 value_id,
 	return true;
 }
 
-static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
-				     struct btf *btf,
-				     struct bpf_verifier_log *log)
+int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
+			     struct btf *btf,
+			     struct bpf_verifier_log *log)
 {
 	struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
 	const struct btf_member *member;
@@ -160,7 +131,7 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 	    sizeof(value_name)) {
 		pr_warn("struct_ops name %s is too long\n",
 			st_ops->name);
-		return;
+		return -EINVAL;
 	}
 	sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name);
 
@@ -169,13 +140,13 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 	if (type_id < 0) {
 		pr_warn("Cannot find struct %s in btf_vmlinux\n",
 			st_ops->name);
-		return;
+		return -EINVAL;
 	}
 	t = btf_type_by_id(btf, type_id);
 	if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) {
 		pr_warn("Cannot support #%u members in struct %s\n",
 			btf_type_vlen(t), st_ops->name);
-		return;
+		return -EINVAL;
 	}
 
 	value_id = btf_find_by_name_kind(btf, value_name,
@@ -183,10 +154,10 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 	if (value_id < 0) {
 		pr_warn("Cannot find struct %s in btf_vmlinux\n",
 			value_name);
-		return;
+		return -EINVAL;
 	}
 	if (!is_valid_value_type(btf, value_id, t, value_name))
-		return;
+		return -EINVAL;
 
 	for_each_member(i, t, member) {
 		const struct btf_type *func_proto;
@@ -195,13 +166,13 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 		if (!*mname) {
 			pr_warn("anon member in struct %s is not supported\n",
 				st_ops->name);
-			break;
+			return -EOPNOTSUPP;
 		}
 
 		if (__btf_member_bitfield_size(t, member)) {
 			pr_warn("bit field member %s in struct %s is not supported\n",
 				mname, st_ops->name);
-			break;
+			return -EOPNOTSUPP;
 		}
 
 		func_proto = btf_type_resolve_func_ptr(btf,
@@ -213,7 +184,7 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 					   &st_ops->func_models[i])) {
 			pr_warn("Error in parsing func ptr %s in struct %s\n",
 				mname, st_ops->name);
-			break;
+			return -EINVAL;
 		}
 	}
 
@@ -221,6 +192,7 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 		if (st_ops->init(btf)) {
 			pr_warn("Error in init bpf_struct_ops %s\n",
 				st_ops->name);
+			return -EINVAL;
 		} else {
 			st_ops_desc->type_id = type_id;
 			st_ops_desc->type = t;
@@ -229,35 +201,24 @@ static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
 								 value_id);
 		}
 	}
-}
 
-void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
-{
-	struct bpf_struct_ops_desc *st_ops_desc;
-	u32 i;
-
-	/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
-#define BPF_STRUCT_OPS_TYPE(_name) BTF_TYPE_EMIT(struct bpf_struct_ops_##_name);
-#include "bpf_struct_ops_types.h"
-#undef BPF_STRUCT_OPS_TYPE
-
-	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		st_ops_desc = &bpf_struct_ops[i];
-		bpf_struct_ops_desc_init(st_ops_desc, btf, log);
-	}
+	return 0;
 }
 
 static const struct bpf_struct_ops_desc *
 bpf_struct_ops_find_value(struct btf *btf, u32 value_id)
 {
+	const struct bpf_struct_ops_desc *st_ops_list;
 	unsigned int i;
+	u32 cnt = 0;
 
-	if (!value_id || !btf)
+	if (!value_id)
 		return NULL;
 
-	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		if (bpf_struct_ops[i].value_id == value_id)
-			return &bpf_struct_ops[i];
+	st_ops_list = btf_get_struct_ops(btf, &cnt);
+	for (i = 0; i < cnt; i++) {
+		if (st_ops_list[i].value_id == value_id)
+			return &st_ops_list[i];
 	}
 
 	return NULL;
@@ -266,14 +227,17 @@ bpf_struct_ops_find_value(struct btf *btf, u32 value_id)
 const struct bpf_struct_ops_desc *
 bpf_struct_ops_find(struct btf *btf, u32 type_id)
 {
+	const struct bpf_struct_ops_desc *st_ops_list;
 	unsigned int i;
+	u32 cnt;
 
-	if (!type_id || !btf)
+	if (!type_id)
 		return NULL;
 
-	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		if (bpf_struct_ops[i].type_id == type_id)
-			return &bpf_struct_ops[i];
+	st_ops_list = btf_get_struct_ops(btf, &cnt);
+	for (i = 0; i < cnt; i++) {
+		if (st_ops_list[i].type_id == type_id)
+			return &st_ops_list[i];
 	}
 
 	return NULL;
diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
deleted file mode 100644
index 5678a9ddf817..000000000000
--- a/kernel/bpf/bpf_struct_ops_types.h
+++ /dev/null
@@ -1,12 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* internal file - do not include directly */
-
-#ifdef CONFIG_BPF_JIT
-#ifdef CONFIG_NET
-BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
-#endif
-#ifdef CONFIG_INET
-#include <net/tcp.h>
-BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
-#endif
-#endif
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 2fd6fa0ea1f4..c8a1bdbe7d9a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -5790,8 +5790,6 @@ struct btf *btf_parse_vmlinux(void)
 	/* btf_parse_vmlinux() runs under bpf_verifier_lock */
 	bpf_ctx_convert.t = btf_type_by_id(btf, bpf_ctx_convert_btf_id[0]);
 
-	bpf_struct_ops_init(btf, log);
-
 	refcount_set(&btf->refcnt, 1);
 
 	err = btf_alloc_id(btf);
@@ -8620,10 +8618,11 @@ bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log,
 }
 
 static int
-btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops)
+btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops,
+		   struct bpf_verifier_log *log)
 {
 	struct btf_struct_ops_tab *tab, *new_tab;
-	int i;
+	int i, err;
 
 	if (!btf)
 		return -ENOENT;
@@ -8660,6 +8659,10 @@ btf_add_struct_ops(struct btf *btf, struct bpf_struct_ops *st_ops)
 
 	tab->ops[btf->struct_ops_tab->cnt].st_ops = st_ops;
 
+	err = bpf_struct_ops_desc_init(&tab->ops[btf->struct_ops_tab->cnt], btf, log);
+	if (err)
+		return err;
+
 	btf->struct_ops_tab->cnt++;
 
 	return 0;
@@ -8676,3 +8679,30 @@ const struct bpf_struct_ops_desc *btf_get_struct_ops(struct btf *btf, u32 *ret_c
 	return (const struct bpf_struct_ops_desc *)btf->struct_ops_tab->ops;
 }
 
+int register_bpf_struct_ops(struct bpf_struct_ops *st_ops)
+{
+	struct bpf_verifier_log *log;
+	struct btf *btf;
+	int err = 0;
+
+	btf = btf_get_module_btf(st_ops->owner);
+	if (!btf)
+		return -EINVAL;
+
+	log = kzalloc(sizeof(*log), GFP_KERNEL | __GFP_NOWARN);
+	if (!log) {
+		err = -ENOMEM;
+		goto errout;
+	}
+
+	log->level = BPF_LOG_KERNEL;
+
+	err = btf_add_struct_ops(btf, st_ops, log);
+
+errout:
+	kfree(log);
+	btf_put(btf);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(register_bpf_struct_ops);
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index ffa224053a6c..148a5851c4fa 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -7,7 +7,7 @@
 #include <linux/bpf.h>
 #include <linux/btf.h>
 
-extern struct bpf_struct_ops bpf_bpf_dummy_ops;
+static struct bpf_struct_ops bpf_bpf_dummy_ops;
 
 /* A common type for test_N with return value in bpf_dummy_ops */
 typedef int (*dummy_ops_test_ret_fn)(struct bpf_dummy_ops_state *state, ...);
@@ -223,11 +223,13 @@ static int bpf_dummy_reg(void *kdata)
 	return -EOPNOTSUPP;
 }
 
+DEFINE_STRUCT_OPS_VALUE_TYPE(bpf_dummy_ops);
+
 static void bpf_dummy_unreg(void *kdata)
 {
 }
 
-struct bpf_struct_ops bpf_bpf_dummy_ops = {
+static struct bpf_struct_ops bpf_bpf_dummy_ops = {
 	.verifier_ops = &bpf_dummy_verifier_ops,
 	.init = bpf_dummy_init,
 	.check_member = bpf_dummy_ops_check_member,
@@ -235,4 +237,12 @@ struct bpf_struct_ops bpf_bpf_dummy_ops = {
 	.reg = bpf_dummy_reg,
 	.unreg = bpf_dummy_unreg,
 	.name = "bpf_dummy_ops",
+	.owner = THIS_MODULE,
 };
+
+static int __init bpf_dummy_struct_ops_init(void)
+{
+	BTF_STRUCT_OPS_TYPE_EMIT(bpf_dummy_ops);
+	return register_bpf_struct_ops(&bpf_bpf_dummy_ops);
+}
+late_initcall(bpf_dummy_struct_ops_init);
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 3c8b76578a2a..b36a19274e5b 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -12,7 +12,7 @@
 #include <net/bpf_sk_storage.h>
 
 /* "extern" is to avoid sparse warning.  It is only used in bpf_struct_ops.c. */
-extern struct bpf_struct_ops bpf_tcp_congestion_ops;
+static struct bpf_struct_ops bpf_tcp_congestion_ops;
 
 static u32 unsupported_ops[] = {
 	offsetof(struct tcp_congestion_ops, get_info),
@@ -277,7 +277,9 @@ static int bpf_tcp_ca_validate(void *kdata)
 	return tcp_validate_congestion_control(kdata);
 }
 
-struct bpf_struct_ops bpf_tcp_congestion_ops = {
+DEFINE_STRUCT_OPS_VALUE_TYPE(tcp_congestion_ops);
+
+static struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.verifier_ops = &bpf_tcp_ca_verifier_ops,
 	.reg = bpf_tcp_ca_reg,
 	.unreg = bpf_tcp_ca_unreg,
@@ -287,10 +289,18 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = {
 	.init = bpf_tcp_ca_init,
 	.validate = bpf_tcp_ca_validate,
 	.name = "tcp_congestion_ops",
+	.owner = THIS_MODULE,
 };
 
 static int __init bpf_tcp_ca_kfunc_init(void)
 {
-	return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
+	int ret;
+
+	BTF_STRUCT_OPS_TYPE_EMIT(tcp_congestion_ops);
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set);
+	ret = ret ?: register_bpf_struct_ops(&bpf_tcp_congestion_ops);
+
+	return ret;
 }
 late_initcall(bpf_tcp_ca_kfunc_init);
-- 
2.34.1


^ permalink raw reply related

* [PATCH bpf-next v9 03/12] bpf, net: introduce bpf_struct_ops_desc.
From: thinker.li @ 2023-11-01 20:45 UTC (permalink / raw)
  To: bpf, ast, martin.lau, song, kernel-team, andrii, drosen
  Cc: sinquersw, kuifeng, Kui-Feng Lee, netdev
In-Reply-To: <20231101204519.677870-1-thinker.li@gmail.com>

From: Kui-Feng Lee <thinker.li@gmail.com>

Move some of members of bpf_struct_ops to bpf_struct_ops_desc.  When we
introduce the APIs to register new bpf_struct_ops types from modules,
bpf_struct_ops may destroyed when the module is unloaded.  Moving these
members to bpf_struct_ops_desc make these data available even the module is
unloaded.

Cc: netdev@vger.kernel.org
Signed-off-by: Kui-Feng Lee <thinker.li@gmail.com>
---
 include/linux/bpf.h            | 13 ++++--
 kernel/bpf/bpf_struct_ops.c    | 80 +++++++++++++++++-----------------
 kernel/bpf/verifier.c          |  8 ++--
 net/bpf/bpf_dummy_struct_ops.c |  9 +++-
 net/ipv4/bpf_tcp_ca.c          |  8 +++-
 5 files changed, 70 insertions(+), 48 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b4825d3cdb29..b55e27162df0 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1626,17 +1626,22 @@ struct bpf_struct_ops {
 	void (*unreg)(void *kdata);
 	int (*update)(void *kdata, void *old_kdata);
 	int (*validate)(void *kdata);
-	const struct btf_type *type;
-	const struct btf_type *value_type;
 	const char *name;
 	struct btf_func_model func_models[BPF_STRUCT_OPS_MAX_NR_MEMBERS];
+};
+
+struct bpf_struct_ops_desc {
+	struct bpf_struct_ops *st_ops;
+
+	const struct btf_type *type;
+	const struct btf_type *value_type;
 	u32 type_id;
 	u32 value_id;
 };
 
 #if defined(CONFIG_BPF_JIT) && defined(CONFIG_BPF_SYSCALL)
 #define BPF_MODULE_OWNER ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
-const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id);
+const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id);
 void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log);
 bool bpf_struct_ops_get(const void *kdata);
 void bpf_struct_ops_put(const void *kdata);
@@ -1679,7 +1684,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 			    union bpf_attr __user *uattr);
 #endif
 #else
-static inline const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
+static inline const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id)
 {
 	return NULL;
 }
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 4ca4ca4998e0..d804801c7864 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -32,7 +32,7 @@ struct bpf_struct_ops_value {
 struct bpf_struct_ops_map {
 	struct bpf_map map;
 	struct rcu_head rcu;
-	const struct bpf_struct_ops *st_ops;
+	const struct bpf_struct_ops_desc *st_ops_desc;
 	/* protect map_update */
 	struct mutex lock;
 	/* link has all the bpf_links that is populated
@@ -92,9 +92,9 @@ enum {
 	__NR_BPF_STRUCT_OPS_TYPE,
 };
 
-static struct bpf_struct_ops * const bpf_struct_ops[] = {
+static struct bpf_struct_ops_desc bpf_struct_ops[] = {
 #define BPF_STRUCT_OPS_TYPE(_name)				\
-	[BPF_STRUCT_OPS_TYPE_##_name] = &bpf_##_name,
+	[BPF_STRUCT_OPS_TYPE_##_name] = { .st_ops = &bpf_##_name },
 #include "bpf_struct_ops_types.h"
 #undef BPF_STRUCT_OPS_TYPE
 };
@@ -115,10 +115,11 @@ enum {
 	IDX_MODULE_ID,
 };
 
-static void bpf_struct_ops_init_one(struct bpf_struct_ops *st_ops,
-				    struct btf *btf,
-				    struct bpf_verifier_log *log)
+static void bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
+				     struct btf *btf,
+				     struct bpf_verifier_log *log)
 {
+	struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
 	const struct btf_member *member;
 	const struct btf_type *t;
 	s32 type_id, value_id;
@@ -190,18 +191,18 @@ static void bpf_struct_ops_init_one(struct bpf_struct_ops *st_ops,
 			pr_warn("Error in init bpf_struct_ops %s\n",
 				st_ops->name);
 		} else {
-			st_ops->type_id = type_id;
-			st_ops->type = t;
-			st_ops->value_id = value_id;
-			st_ops->value_type = btf_type_by_id(btf,
-							    value_id);
+			st_ops_desc->type_id = type_id;
+			st_ops_desc->type = t;
+			st_ops_desc->value_id = value_id;
+			st_ops_desc->value_type = btf_type_by_id(btf,
+								 value_id);
 		}
 	}
 }
 
 void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
 {
-	struct bpf_struct_ops *st_ops;
+	struct bpf_struct_ops_desc *st_ops_desc;
 	u32 i;
 
 	/* Ensure BTF type is emitted for "struct bpf_struct_ops_##_name" */
@@ -210,14 +211,14 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
 #undef BPF_STRUCT_OPS_TYPE
 
 	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		st_ops = bpf_struct_ops[i];
-		bpf_struct_ops_init_one(st_ops, btf, log);
+		st_ops_desc = &bpf_struct_ops[i];
+		bpf_struct_ops_desc_init(st_ops_desc, btf, log);
 	}
 }
 
 extern struct btf *btf_vmlinux;
 
-static const struct bpf_struct_ops *
+static const struct bpf_struct_ops_desc *
 bpf_struct_ops_find_value(u32 value_id)
 {
 	unsigned int i;
@@ -226,14 +227,14 @@ bpf_struct_ops_find_value(u32 value_id)
 		return NULL;
 
 	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		if (bpf_struct_ops[i]->value_id == value_id)
-			return bpf_struct_ops[i];
+		if (bpf_struct_ops[i].value_id == value_id)
+			return &bpf_struct_ops[i];
 	}
 
 	return NULL;
 }
 
-const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
+const struct bpf_struct_ops_desc *bpf_struct_ops_find(u32 type_id)
 {
 	unsigned int i;
 
@@ -241,8 +242,8 @@ const struct bpf_struct_ops *bpf_struct_ops_find(u32 type_id)
 		return NULL;
 
 	for (i = 0; i < ARRAY_SIZE(bpf_struct_ops); i++) {
-		if (bpf_struct_ops[i]->type_id == type_id)
-			return bpf_struct_ops[i];
+		if (bpf_struct_ops[i].type_id == type_id)
+			return &bpf_struct_ops[i];
 	}
 
 	return NULL;
@@ -302,7 +303,7 @@ static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key)
 
 static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map)
 {
-	const struct btf_type *t = st_map->st_ops->type;
+	const struct btf_type *t = st_map->st_ops_desc->type;
 	u32 i;
 
 	for (i = 0; i < btf_type_vlen(t); i++) {
@@ -376,11 +377,12 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 					   void *value, u64 flags)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
-	const struct bpf_struct_ops *st_ops = st_map->st_ops;
+	const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+	const struct bpf_struct_ops *st_ops = st_ops_desc->st_ops;
 	struct bpf_struct_ops_value *uvalue, *kvalue;
 	const struct btf_type *module_type;
 	const struct btf_member *member;
-	const struct btf_type *t = st_ops->type;
+	const struct btf_type *t = st_ops_desc->type;
 	struct bpf_tramp_links *tlinks;
 	void *udata, *kdata;
 	int prog_fd, err;
@@ -393,7 +395,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 	if (*(u32 *)key != 0)
 		return -E2BIG;
 
-	err = check_zero_holes(st_ops->value_type, value);
+	err = check_zero_holes(st_ops_desc->value_type, value);
 	if (err)
 		return err;
 
@@ -486,7 +488,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
 		}
 
 		if (prog->type != BPF_PROG_TYPE_STRUCT_OPS ||
-		    prog->aux->attach_btf_id != st_ops->type_id ||
+		    prog->aux->attach_btf_id != st_ops_desc->type_id ||
 		    prog->expected_attach_type != i) {
 			bpf_prog_put(prog);
 			err = -EINVAL;
@@ -582,7 +584,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
 			     BPF_STRUCT_OPS_STATE_TOBEFREE);
 	switch (prev_state) {
 	case BPF_STRUCT_OPS_STATE_INUSE:
-		st_map->st_ops->unreg(&st_map->kvalue.data);
+		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
 		bpf_map_put(map);
 		return 0;
 	case BPF_STRUCT_OPS_STATE_TOBEFREE:
@@ -663,22 +665,22 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr)
 
 static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 {
-	const struct bpf_struct_ops *st_ops;
+	const struct bpf_struct_ops_desc *st_ops_desc;
 	size_t st_map_size;
 	struct bpf_struct_ops_map *st_map;
 	const struct btf_type *t, *vt;
 	struct bpf_map *map;
 	int ret;
 
-	st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
-	if (!st_ops)
+	st_ops_desc = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id);
+	if (!st_ops_desc)
 		return ERR_PTR(-ENOTSUPP);
 
-	vt = st_ops->value_type;
+	vt = st_ops_desc->value_type;
 	if (attr->value_size != vt->size)
 		return ERR_PTR(-EINVAL);
 
-	t = st_ops->type;
+	t = st_ops_desc->type;
 
 	st_map_size = sizeof(*st_map) +
 		/* kvalue stores the
@@ -690,7 +692,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 	if (!st_map)
 		return ERR_PTR(-ENOMEM);
 
-	st_map->st_ops = st_ops;
+	st_map->st_ops_desc = st_ops_desc;
 	map = &st_map->map;
 
 	ret = bpf_jit_charge_modmem(PAGE_SIZE);
@@ -728,8 +730,8 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr)
 static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
-	const struct bpf_struct_ops *st_ops = st_map->st_ops;
-	const struct btf_type *vt = st_ops->value_type;
+	const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc;
+	const struct btf_type *vt = st_ops_desc->value_type;
 	u64 usage;
 
 	usage = sizeof(*st_map) +
@@ -803,7 +805,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
 		/* st_link->map can be NULL if
 		 * bpf_struct_ops_link_create() fails to register.
 		 */
-		st_map->st_ops->unreg(&st_map->kvalue.data);
+		st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
 		bpf_map_put(&st_map->map);
 	}
 	kfree(st_link);
@@ -850,7 +852,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
 	if (!bpf_struct_ops_valid_to_reg(new_map))
 		return -EINVAL;
 
-	if (!st_map->st_ops->update)
+	if (!st_map->st_ops_desc->st_ops->update)
 		return -EOPNOTSUPP;
 
 	mutex_lock(&update_mutex);
@@ -863,12 +865,12 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
 
 	old_st_map = container_of(old_map, struct bpf_struct_ops_map, map);
 	/* The new and old struct_ops must be the same type. */
-	if (st_map->st_ops != old_st_map->st_ops) {
+	if (st_map->st_ops_desc != old_st_map->st_ops_desc) {
 		err = -EINVAL;
 		goto err_out;
 	}
 
-	err = st_map->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
+	err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
 	if (err)
 		goto err_out;
 
@@ -919,7 +921,7 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
 	if (err)
 		goto err_out;
 
-	err = st_map->st_ops->reg(st_map->kvalue.data);
+	err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data);
 	if (err) {
 		bpf_link_cleanup(&link_primer);
 		link = NULL;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 857d76694517..290e3a7ee72f 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -20081,6 +20081,7 @@ static void print_verification_stats(struct bpf_verifier_env *env)
 static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 {
 	const struct btf_type *t, *func_proto;
+	const struct bpf_struct_ops_desc *st_ops_desc;
 	const struct bpf_struct_ops *st_ops;
 	const struct btf_member *member;
 	struct bpf_prog *prog = env->prog;
@@ -20093,14 +20094,15 @@ static int check_struct_ops_btf_id(struct bpf_verifier_env *env)
 	}
 
 	btf_id = prog->aux->attach_btf_id;
-	st_ops = bpf_struct_ops_find(btf_id);
-	if (!st_ops) {
+	st_ops_desc = bpf_struct_ops_find(btf_id);
+	if (!st_ops_desc) {
 		verbose(env, "attach_btf_id %u is not a supported struct\n",
 			btf_id);
 		return -ENOTSUPP;
 	}
+	st_ops = st_ops_desc->st_ops;
 
-	t = st_ops->type;
+	t = st_ops_desc->type;
 	member_idx = prog->expected_attach_type;
 	if (member_idx >= btf_type_vlen(t)) {
 		verbose(env, "attach to invalid member idx %u of struct %s\n",
diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c
index 5918d1b32e19..ffa224053a6c 100644
--- a/net/bpf/bpf_dummy_struct_ops.c
+++ b/net/bpf/bpf_dummy_struct_ops.c
@@ -17,6 +17,8 @@ struct bpf_dummy_ops_test_args {
 	struct bpf_dummy_ops_state state;
 };
 
+static struct btf *bpf_dummy_ops_btf;
+
 static struct bpf_dummy_ops_test_args *
 dummy_ops_init_args(const union bpf_attr *kattr, unsigned int nr)
 {
@@ -85,9 +87,13 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 	void *image = NULL;
 	unsigned int op_idx;
 	int prog_ret;
+	u32 type_id;
 	int err;
 
-	if (prog->aux->attach_btf_id != st_ops->type_id)
+	type_id = btf_find_by_name_kind(bpf_dummy_ops_btf,
+					bpf_bpf_dummy_ops.name,
+					BTF_KIND_STRUCT);
+	if (prog->aux->attach_btf_id != type_id)
 		return -EOPNOTSUPP;
 
 	func_proto = prog->aux->attach_func_proto;
@@ -143,6 +149,7 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr,
 
 static int bpf_dummy_init(struct btf *btf)
 {
+	bpf_dummy_ops_btf = btf;
 	return 0;
 }
 
diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
index 39dcccf0f174..3c8b76578a2a 100644
--- a/net/ipv4/bpf_tcp_ca.c
+++ b/net/ipv4/bpf_tcp_ca.c
@@ -20,6 +20,7 @@ static u32 unsupported_ops[] = {
 
 static const struct btf_type *tcp_sock_type;
 static u32 tcp_sock_id, sock_id;
+static const struct btf_type *tcp_congestion_ops_type;
 
 static int bpf_tcp_ca_init(struct btf *btf)
 {
@@ -36,6 +37,11 @@ static int bpf_tcp_ca_init(struct btf *btf)
 	tcp_sock_id = type_id;
 	tcp_sock_type = btf_type_by_id(btf, tcp_sock_id);
 
+	type_id = btf_find_by_name_kind(btf, "tcp_congestion_ops", BTF_KIND_STRUCT);
+	if (type_id < 0)
+		return -EINVAL;
+	tcp_congestion_ops_type = btf_type_by_id(btf, type_id);
+
 	return 0;
 }
 
@@ -149,7 +155,7 @@ static u32 prog_ops_moff(const struct bpf_prog *prog)
 	u32 midx;
 
 	midx = prog->expected_attach_type;
-	t = bpf_tcp_congestion_ops.type;
+	t = tcp_congestion_ops_type;
 	m = &btf_type_member(t)[midx];
 
 	return __btf_member_bit_offset(t, m) / 8;
-- 
2.34.1


^ permalink raw reply related

* Re: [PATCH] rxrpc_find_service_conn_rcu: use read_seqbegin() rather than read_seqbegin_or_lock()
From: Oleg Nesterov @ 2023-11-01 20:40 UTC (permalink / raw)
  To: David Howells
  Cc: Marc Dionne, Alexander Viro, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Chuck Lever, linux-afs, netdev,
	linux-kernel
In-Reply-To: <20231101202302.GB32034@redhat.com>

In case I was not clear, I am not saying this code is buggy.

Just none of read_seqbegin_or_lock/need_seqretry/done_seqretry
helpers make any sense in this code. It can use read_seqbegin/
read_seqretry and this won't change the current behaviour.

On 11/01, Oleg Nesterov wrote:
>
> On 11/01, David Howells wrote:
> >
> > Oleg Nesterov <oleg@redhat.com> wrote:
> >
> > > read_seqbegin_or_lock() makes no sense unless you make "seq" odd
> > > after the lockless access failed.
> >
> > I think you're wrong.
>
> I think you missed the point ;)
>
> > write_seqlock() turns it odd.
>
> It changes seqcount_t->sequence but not "seq" so this doesn't matter.
>
> > For instance, if the read lock is taken first:
> >
> > 	sequence seq	CPU 1				CPU 2
> > 	======= =======	===============================	===============
> > 	0
> > 	0	0	seq = 0  MUST BE EVEN
>
> This is correct,
>
> > ACCORDING TO DOC
>
> documentation is wrong, please see
>
> 	[PATCH 1/2] seqlock: fix the wrong read_seqbegin_or_lock/need_seqretry documentation
> 	https://lore.kernel.org/all/20231024120808.GA15382@redhat.com/
>
> > 	0	0	read_seqbegin_or_lock() [lockless]
> > 			...
> > 	1	0					write_seqlock()
> > 	1	0	need_seqretry() [seq=even; sequence!=seq: retry]
>
> Yes, if CPU_1 races with write_seqlock() need_seqretry() returns true,
>
> > 	1	1	read_seqbegin_or_lock() [exclusive]
>
> No. "seq" is still even, so read_seqbegin_or_lock() won't do read_seqlock_excl(),
> it will do
>
> 	seq = read_seqbegin(lock);
>
> again.
>
> > Note that it spins in __read_seqcount_begin() until we get an even seq,
> > indicating that no write is currently in progress - at which point we can
> > perform a lockless pass.
>
> Exactly. And this means that "seq" is always even.
>
> > > See thread_group_cputime() as an example, note that it does nextseq = 1 for
> > > the 2nd round.
> >
> > That's not especially convincing.
>
> See also the usage of read_seqbegin_or_lock() in fs/dcache.c and fs/d_path.c.
> All other users are wrong.
>
> Lets start from the very beginning. This code does
>
>         int seq = 0;
>         do {
>                 read_seqbegin_or_lock(service_conn_lock, &seq);
>
>                 do_something();
>
>         } while (need_seqretry(service_conn_lock, seq));
>
>         done_seqretry(service_conn_lock, seq);
>
> Initially seq is even (it is zero), so read_seqbegin_or_lock(&seq) does
>
> 	*seq = read_seqbegin(lock);
>
> and returns. Note that "seq" is still even.
>
> Now. If need_seqretry(seq) detects the race with write_seqlock() it returns
> true but it does NOT change this "seq", it is still even. So on the next
> iteration read_seqbegin_or_lock() will do
>
> 	*seq = read_seqbegin(lock);
>
> again, it won't take this lock for writing. And again, seq will be even.
> And so on.
>
> And this means that the code above is equivalent to
>
> 	do {
> 		seq = read_seqbegin(service_conn_lock);
>
> 		do_something();
>
> 	} while (read_seqretry(service_conn_lock, seq));
>
> and this is what this patch does.
>
> Yes this is confusing. Again, even the documentation is wrong! That is why
> I am trying to remove the misuse of read_seqbegin_or_lock(), then I am going
> to change the semantics of need_seqretry() to enforce the locking on the 2nd
> pass.
>
> Oleg.


^ permalink raw reply

* Re: Does anyone use Appletalk?
From: Dan Williams @ 2023-11-01 20:27 UTC (permalink / raw)
  To: John Paul Adrian Glaubitz, Geert Uytterhoeven
  Cc: linux-m68k, Arnd Bergmann, Jakub Kicinski, netdev
In-Reply-To: <5e3e52a48ba9cc0109a98cf4c5371c3f80c4b4cc.camel@physik.fu-berlin.de>

On Wed, 2023-11-01 at 13:26 +0100, John Paul Adrian Glaubitz wrote:
> Hi Geert,
> 
> On Wed, 2023-11-01 at 13:19 +0100, Geert Uytterhoeven wrote:
> > > Isn't that a bit late?
> > 
> > It can always be reverted...
> 
> Sure, but I'd rather see such discussions before merging the removal
> patch. Best would have been to reach out to the netatalk project, for
> example and ask [1]. They just released version 3.1.18 of the
> netatalk
> server in October 2023.
> 
> It's an incredibly cool project because it allows you to replace the
> expensive Apple TimeMachine hardware with a cheap Raspberry Pi ;-).

But... Time Machine debuted with 10.5 and AppleTalk got removed in
10.6; did the actual TimeCapsules ever support AppleTalk, or were they
always TCP/IP-based?

(also TimeMachine-capable Airport Extremes [A1354] are like $15 on
eBay; that's cheaper than a Raspberry Pi)

This patch only removes the Linux-side ipddp driver (eg MacIP) so if
Time Capsules never supported AppleTalk, this patch is unrelated to
TimeMachine.

What this patch *may* break is Linux as a MacIP gateway, allowing
AppleTalk-only machines to talk TCP/IP to systems. But that's like
what, the 128/512/Plus and PowerBook Duo/1xx? Everything else had a
PDS/NuBus slot or onboard Ethernet and could do native
MacTCP/OpenTransport...

Dan

^ permalink raw reply

* Re: [PATCH net v2] net: dsa: tag_rtl4_a: Bump min packet size
From: Linus Walleij @ 2023-11-01 20:26 UTC (permalink / raw)
  To: Luiz Angelo Daros de Luca
  Cc: Vladimir Oltean, Andrew Lunn, Florian Fainelli, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, netdev, linux-kernel
In-Reply-To: <CAJq09z6QwLNEc5rEGvE3jujZ-vb+vtUQLS-fkOnrdnYqk5KvxA@mail.gmail.com>

On Wed, Nov 1, 2023 at 1:35 PM Luiz Angelo Daros de Luca
<luizluca@gmail.com> wrote:

> Sorry but I noticed no issues:

Don't be sorry about that, it's good news because now I know
where to look, i.e. in the ethernet controller.

> My device is using
> https://github.com/luizluca/openwrt/tree/ath79_dsa_prep%2Bdevices . In
> summary, kernel 6.1 with openwrt generic patches and the
> reset-controller patch I sent net-next recently.

Looking good to me.

> [    3.888540] realtek-smi switch: found an RTL8366RB switch
> [    3.952366] realtek-smi switch: RTL5937 ver 3 chip found

Same version as mine too.

> [    3.967086] realtek-smi switch: set MAC: 42:E4:F5:XX:XX:XX

Unrelated: I have seen other DSA switches "inherit" the MAC of the
conduit interface (BRCM). I wonder if we could do that too instead
of this random MAC number. Usually the conduit interface has a
properly configured MAC.

> [    3.976779] realtek-smi switch: missing child interrupt-controller node
> [    3.983455] realtek-smi switch: no interrupt support
> [    4.158891] realtek-smi switch: no LED for port 5

Are the LEDs working? My device has no LEDs so I have never
tested it, despite I coded it. (Also these days we can actually
support each LED individually configured from device tree using
the LED API, but that would be quite a bit of code, so only some
fun for the aspiring developer...)

> Maybe the ag71xx driver is doing something differently.

I'll look at it!

Thanks a lot Luiz,
Linus Walleij

^ permalink raw reply

* Re: [PATCH] rxrpc_find_service_conn_rcu: use read_seqbegin() rather than read_seqbegin_or_lock()
From: Oleg Nesterov @ 2023-11-01 20:23 UTC (permalink / raw)
  To: David Howells
  Cc: Marc Dionne, Alexander Viro, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Chuck Lever, linux-afs, netdev,
	linux-kernel
In-Reply-To: <1952182.1698853516@warthog.procyon.org.uk>

On 11/01, David Howells wrote:
>
> Oleg Nesterov <oleg@redhat.com> wrote:
>
> > read_seqbegin_or_lock() makes no sense unless you make "seq" odd
> > after the lockless access failed.
>
> I think you're wrong.

I think you missed the point ;)

> write_seqlock() turns it odd.

It changes seqcount_t->sequence but not "seq" so this doesn't matter.

> For instance, if the read lock is taken first:
>
> 	sequence seq	CPU 1				CPU 2
> 	======= =======	===============================	===============
> 	0
> 	0	0	seq = 0  MUST BE EVEN

This is correct,

> ACCORDING TO DOC

documentation is wrong, please see

	[PATCH 1/2] seqlock: fix the wrong read_seqbegin_or_lock/need_seqretry documentation
	https://lore.kernel.org/all/20231024120808.GA15382@redhat.com/

> 	0	0	read_seqbegin_or_lock() [lockless]
> 			...
> 	1	0					write_seqlock()
> 	1	0	need_seqretry() [seq=even; sequence!=seq: retry]

Yes, if CPU_1 races with write_seqlock() need_seqretry() returns true,

> 	1	1	read_seqbegin_or_lock() [exclusive]

No. "seq" is still even, so read_seqbegin_or_lock() won't do read_seqlock_excl(),
it will do

	seq = read_seqbegin(lock);

again.

> Note that it spins in __read_seqcount_begin() until we get an even seq,
> indicating that no write is currently in progress - at which point we can
> perform a lockless pass.

Exactly. And this means that "seq" is always even.

> > See thread_group_cputime() as an example, note that it does nextseq = 1 for
> > the 2nd round.
>
> That's not especially convincing.

See also the usage of read_seqbegin_or_lock() in fs/dcache.c and fs/d_path.c.
All other users are wrong.

Lets start from the very beginning. This code does

        int seq = 0;
        do {
                read_seqbegin_or_lock(service_conn_lock, &seq);

                do_something();

        } while (need_seqretry(service_conn_lock, seq));

        done_seqretry(service_conn_lock, seq);

Initially seq is even (it is zero), so read_seqbegin_or_lock(&seq) does

	*seq = read_seqbegin(lock);

and returns. Note that "seq" is still even.

Now. If need_seqretry(seq) detects the race with write_seqlock() it returns
true but it does NOT change this "seq", it is still even. So on the next
iteration read_seqbegin_or_lock() will do

	*seq = read_seqbegin(lock);

again, it won't take this lock for writing. And again, seq will be even.
And so on.

And this means that the code above is equivalent to

	do {
		seq = read_seqbegin(service_conn_lock);

		do_something();

	} while (read_seqretry(service_conn_lock, seq));

and this is what this patch does.

Yes this is confusing. Again, even the documentation is wrong! That is why
I am trying to remove the misuse of read_seqbegin_or_lock(), then I am going
to change the semantics of need_seqretry() to enforce the locking on the 2nd
pass.

Oleg.

^ permalink raw reply

* Re: [PATCH net v3] net: dsa: tag_rtl4_a: Bump min packet size
From: Linus Walleij @ 2023-11-01 20:18 UTC (permalink / raw)
  To: Andrew Lunn, Florian Fainelli, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Luiz Angelo Daros de Luca
  Cc: netdev, linux-kernel
In-Reply-To: <20231031-fix-rtl8366rb-v3-1-04dfc4e7d90e@linaro.org>

On Tue, Oct 31, 2023 at 11:45 PM Linus Walleij <linus.walleij@linaro.org> wrote:

> It was reported that the "LuCI" web UI was not working properly
> with a device using the RTL8366RB switch. Disabling the egress
> port tagging code made the switch work again, but this is not
> a good solution as we want to be able to direct traffic to a
> certain port.

Luiz is not seeing this on his ethernet controller so:

pw-bot: cr

(I've seen Vladmir do this, I don't know what it means, but seems
to be how to hold back patches.)

Yours,
Linus Walleij

^ permalink raw reply

* [PATCH net] macsec: Abort MACSec Rx offload datapath when skb is not marked with MACSec metadata
From: Rahul Rameshbabu @ 2023-11-01 20:02 UTC (permalink / raw)
  To: netdev
  Cc: Jakub Kicinski, Paolo Abeni, Eric Dumazet, David S. Miller,
	Rahul Rameshbabu, Sabrina Dubroca, Saeed Mahameed

When MACSec is configured on an outer netdev, traffic received directly
through the underlying netdev should not be processed by the MACSec Rx
datapath. When using MACSec offload on an outer netdev, traffic with no
metadata indicator in the skb is mistakenly considered as MACSec traffic
and incorrectly handled in the handle_not_macsec function. Treat skbs with
no metadata type as non-MACSec packets rather than assuming they are MACSec
packets.

Fixes: 860ead89b851 ("net/macsec: Add MACsec skb_metadata_dst Rx Data path support")
Cc: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
Reviewed-by: Saeed Mahameed <saeed@kernel.org>
---
 drivers/net/macsec.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index 9663050a852d..102200ce87d3 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -996,10 +996,12 @@ static enum rx_handler_result handle_not_macsec(struct sk_buff *skb)
 	struct metadata_dst *md_dst;
 	struct macsec_rxh_data *rxd;
 	struct macsec_dev *macsec;
+	bool is_macsec_md_skb;
 
 	rcu_read_lock();
 	rxd = macsec_data_rcu(skb->dev);
 	md_dst = skb_metadata_dst(skb);
+	is_macsec_md_skb = !md_dst || md_dst->type != METADATA_MACSEC;
 
 	list_for_each_entry_rcu(macsec, &rxd->secys, secys) {
 		struct sk_buff *nskb;
@@ -1012,10 +1014,11 @@ static enum rx_handler_result handle_not_macsec(struct sk_buff *skb)
 		if (macsec_is_offloaded(macsec) && netif_running(ndev)) {
 			struct macsec_rx_sc *rx_sc = NULL;
 
-			if (md_dst && md_dst->type == METADATA_MACSEC)
-				rx_sc = find_rx_sc(&macsec->secy, md_dst->u.macsec_info.sci);
+			if (is_macsec_md_skb)
+				continue;
 
-			if (md_dst && md_dst->type == METADATA_MACSEC && !rx_sc)
+			rx_sc = find_rx_sc(&macsec->secy, md_dst->u.macsec_info.sci);
+			if (!rx_sc)
 				continue;
 
 			if (ether_addr_equal_64bits(hdr->h_dest,
-- 
2.40.1


^ permalink raw reply related

* Re: [PATCH] [PATCH net] tg3: power down device only on SYSTEM_POWER_OFF
From: George Shuklin @ 2023-11-01 19:58 UTC (permalink / raw)
  To: Pavan Chebbi; +Cc: netdev, Andrew Gospodarek, Michael Chan
In-Reply-To: <CALs4sv37sniGKkYADvHwwMjFzp5tBbBnpfOnyK-peM=rnp63Bw@mail.gmail.com>

On 01/11/2023 17:20, Pavan Chebbi wrote:
> On Wed, Nov 1, 2023 at 6:34 PM George Shuklin <george.shuklin@gmail.com> wrote:
>> Dell R650xs servers hangs if tg3 driver calls tg3_power_down.
>>
>> This happens only if network adapters (BCM5720 for R650xs) were
>> initialized using SNP (e.g. by booting ipxe.efi).
>>
>> This is partial revert of commit 2ca1c94ce0b.
>>
>> The actual problem is on Dell side, but this fix allow servers
>> to come back alive after reboot.
> How are you sure that the problem solved by 2ca1c94ce0b is not
> reintroduced with this change?

I contacted the author of original patch, no reply yet (1st day). Also, 
I tested it on few generations of available Dell servers (R330, R340, 
R350 and R650sx, for which this fix should help). It does produce log 
message from 
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1917471, but, at 
least, it reboots without issues.

Actually, original patch is regression: 5.19 rebooting just fine, 6.0 
start to hang. I also reported it to dell support forum, but I'm not 
sure if they pick it up or not.

What would be the proper course of actions for such problem (outside of 
fixing UEFI SNP, for which I don't have access to sources)?

^ permalink raw reply

* Re: [PATCH net-next v2 3/3] net: dsa: realtek: support reset controller
From: Luiz Angelo Daros de Luca @ 2023-11-01 19:55 UTC (permalink / raw)
  To: Vladimir Oltean
  Cc: netdev, linus.walleij, alsi, andrew, vivien.didelot, f.fainelli,
	davem, kuba, pabeni, robh+dt, krzk+dt, arinc.unal
In-Reply-To: <CAJq09z6KV-Oz_8tt4QHKxMx1fjb_81C+XpvFRjLu5vXJHNWKOQ@mail.gmail.com>

Hi Vladimir,

> realtek-mdio is an MDIO driver while realtek-smi is a platform driver
> implementing a bitbang protocol. They might never be used together in
> a system to share RAM and not even installed together in small
> systems. If I do need to share the code, I would just link it twice.
> Would something like this be acceptable?
>
> drivers/net/dsa/realtek/Makefile
> -obj-$(CONFIG_NET_DSA_REALTEK_MDIO)     += realtek-mdio.o
> -obj-$(CONFIG_NET_DSA_REALTEK_SMI)      += realtek-smi.o
> +obj-$(CONFIG_NET_DSA_REALTEK_MDIO)     += realtek-mdio.o realtek_common.o
> +obj-$(CONFIG_NET_DSA_REALTEK_SMI)      += realtek-smi.o realtek_common.o

Just a follow up.

It is not that simple to include a .c file into an existing single
file module. It looks like you need to rename the original file as all
linked objects must not conflict with the module name. The kernel
build seems to create a new object file for each module. Is there a
clearer way? I think #include a common .c file would not be
acceptable.

I tested your shared module suggestion. It is the clearest one but it
also increased the overall size quite a bit. Even linking two objects
seems to use the double of space used by the functions alone. These
are some values (mips)

drivers/net/dsa/realtek/realtek_common.o  5744  without exports
drivers/net/dsa/realtek/realtek_common.o 33472  exporting the two
reset functions (assert/deassert)

drivers/net/dsa/realtek/realtek-mdio.o   18756  without the reset
funcs (to be used as a module)
drivers/net/dsa/realtek/realtek-mdio.o   20480  including the
realtek_common.c (#include <realtek_common.c>)
drivers/net/dsa/realtek/realtek-mdio.o   22696  linking the realtek_common.o

drivers/net/dsa/realtek/realtek-smi.o    30712  without the reset
funcs (to be used as a module)
drivers/net/dsa/realtek/realtek-smi.o    34604  linking the realtek_common.o

drivers/net/dsa/realtek/realtek-mdio.ko  28800  without the reset
funcs (it will use realtek_common.ko)
drivers/net/dsa/realtek/realtek-mdio.ko  32736  linking the realtek_common.o

drivers/net/dsa/realtek/realtek-smi.ko   40708  without the reset
funcs (it will use realtek_common.ko)
drivers/net/dsa/realtek/realtek-smi.ko   44612  linking the realtek_common.o

In summary, we get about 1.5kb of code with the extra functions,
almost 4kb if we link a common object containing the functions and
33kb if we use a module for those two functions.

I can go with any option. I just need to know which one would be
accepted to update my patches.
1) keep duplicated functions on each file
2) share the code including the .c on both
3) share the code linking a common object in both modules (and
renaming existing .c files)
4) create a new module used by both modules.

The devices that would use this driver have very restricted storage
space. Every kbyte counts.

Regards,

Luiz

^ permalink raw reply

* Re: [net-next PATCH v2 1/2] net: phy: aquantia: add firmware load support
From: Andrew Lunn @ 2023-11-01 19:46 UTC (permalink / raw)
  To: Christian Marangi
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Rob Herring, Krzysztof Kozlowski, Conor Dooley, Heiner Kallweit,
	Russell King, netdev, devicetree, linux-kernel, Robert Marko
In-Reply-To: <65428629.050a0220.b2431.1edc@mx.google.com>

> Do we have API to check this? Or I think I should just check the iram
> and dram size and see if iram_size % sizeof(u32) is zero and return
> error otherwise.

Yes, that sounds correct.

     Andrew

^ permalink raw reply

* [PATCH] rfkill: return ENOTTY on invalid ioctl
From: Thomas Weißschuh @ 2023-11-01 19:41 UTC (permalink / raw)
  To: Johannes Berg, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni
  Cc: linux-wireless, netdev, linux-kernel, Thomas Weißschuh

For unknown ioctls the correct error is
ENOTTY "Inappropriate ioctl for device".

ENOSYS as returned before should only be used to indicate that a syscall
is not available at all.

Signed-off-by: Thomas Weißschuh <linux@weissschuh.net>
---
 net/rfkill/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index 14cc8fe8584b..c3feb4f49d09 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -1351,11 +1351,11 @@ static long rfkill_fop_ioctl(struct file *file, unsigned int cmd,
 			     unsigned long arg)
 {
 	struct rfkill_data *data = file->private_data;
-	int ret = -ENOSYS;
+	int ret = -ENOTTY;
 	u32 size;
 
 	if (_IOC_TYPE(cmd) != RFKILL_IOC_MAGIC)
-		return -ENOSYS;
+		return -ENOTTY;
 
 	mutex_lock(&data->mtx);
 	switch (_IOC_NR(cmd)) {

---
base-commit: 7d461b291e65938f15f56fe58da2303b07578a76
change-id: 20231101-rfkill-ioctl-enosys-00a2bb0a4ab1

Best regards,
-- 
Thomas Weißschuh <linux@weissschuh.net>


^ permalink raw reply related

* [syzbot] [can?] WARNING in j1939_xtp_rx_rts
From: syzbot @ 2023-11-01 19:39 UTC (permalink / raw)
  To: davem, edumazet, kernel, kuba, linux-can, linux-kernel, mkl,
	netdev, o.rempel, pabeni, robin, socketcan, syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    2af9b20dbb39 Merge tag 'x86-urgent-2023-10-28' of git://gi..
git tree:       upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=12257ac3680000
kernel config:  https://syzkaller.appspot.com/x/.config?x=7d1f30869bb78ec6
dashboard link: https://syzkaller.appspot.com/bug?extid=daa36413a5cedf799ae4
compiler:       gcc (Debian 12.2.0-14) 12.2.0, GNU ld (GNU Binutils for Debian) 2.40
userspace arch: i386

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image (non-bootable): https://storage.googleapis.com/syzbot-assets/7bc7510fe41f/non_bootable_disk-2af9b20d.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/1e66f0c379e2/vmlinux-2af9b20d.xz
kernel image: https://storage.googleapis.com/syzbot-assets/3732b6e62161/bzImage-2af9b20d.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+daa36413a5cedf799ae4@syzkaller.appspotmail.com

WARNING: CPU: 2 PID: 5504 at net/can/j1939/transport.c:1656 j1939_xtp_rx_rts_session_new net/can/j1939/transport.c:1656 [inline]
WARNING: CPU: 2 PID: 5504 at net/can/j1939/transport.c:1656 j1939_xtp_rx_rts+0x1553/0x17e0 net/can/j1939/transport.c:1735
Modules linked in:
CPU: 2 PID: 5504 Comm: syz-executor.3 Not tainted 6.6.0-rc7-syzkaller-00195-g2af9b20dbb39 #0
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-debian-1.16.2-1 04/01/2014
RIP: 0010:j1939_xtp_rx_rts_session_new net/can/j1939/transport.c:1656 [inline]
RIP: 0010:j1939_xtp_rx_rts+0x1553/0x17e0 net/can/j1939/transport.c:1735
Code: 00 48 89 ef e8 fe d9 eb fa e9 42 ef ff ff e8 b4 1b 4e f8 be 01 00 00 00 48 89 ef e8 e7 d9 eb fa e9 a8 f9 ff ff e8 9d 1b 4e f8 <0f> 0b 48 8d 83 c0 00 00 00 4c 8d b3 c2 00 00 00 48 89 44 24 10 e9
RSP: 0018:ffffc90000520950 EFLAGS: 00010246

RAX: 0000000000000000 RBX: ffff8880267b8400 RCX: 0000000000000100
RDX: ffff88807058c800 RSI: ffffffff8939be93 RDI: 0000000000000005
RBP: 00000000fffffff5 R08: 0000000000000005 R09: 0000000000000000
R10: 00000000fffffff5 R11: dffffc0000000000 R12: ffff8880198ec000
R13: 0000000000000002 R14: ffff888074db3c1b R15: 0000000000000002
FS:  0000000000000000(0000) GS:ffff88802c800000(0063) knlGS:00000000f7fc4b40
CS:  0010 DS: 002b ES: 002b CR0: 0000000080050033
CR2: 000000002fa23000 CR3: 000000004a9cd000 CR4: 0000000000350ee0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <IRQ>
 j1939_tp_cmd_recv net/can/j1939/transport.c:2057 [inline]
 j1939_tp_recv+0x366/0xf50 net/can/j1939/transport.c:2144
 j1939_can_recv+0x78b/0xa70 net/can/j1939/main.c:112
 deliver net/can/af_can.c:572 [inline]
 can_rcv_filter+0x2a5/0x8e0 net/can/af_can.c:606
 can_receive+0x31b/0x5c0 net/can/af_can.c:663
 can_rcv+0x1dc/0x270 net/can/af_can.c:687
 __netif_receive_skb_one_core+0x115/0x180 net/core/dev.c:5552
 __netif_receive_skb+0x1f/0x1b0 net/core/dev.c:5666
 process_backlog+0x101/0x6b0 net/core/dev.c:5994
 __napi_poll.constprop.0+0xb4/0x530 net/core/dev.c:6556
 napi_poll net/core/dev.c:6623 [inline]
 net_rx_action+0x956/0xe90 net/core/dev.c:6756
 __do_softirq+0x218/0x965 kernel/softirq.c:553
 do_softirq kernel/softirq.c:454 [inline]
 do_softirq+0xaa/0xe0 kernel/softirq.c:441
 </IRQ>
 <TASK>
 __local_bh_enable_ip+0xf8/0x120 kernel/softirq.c:381
 j1939_sk_sendmsg+0xeb7/0x13a0 net/can/j1939/socket.c:1266
 sock_sendmsg_nosec net/socket.c:730 [inline]
 __sock_sendmsg+0xd5/0x180 net/socket.c:745
 ____sys_sendmsg+0x6ac/0x940 net/socket.c:2558
 ___sys_sendmsg+0x135/0x1d0 net/socket.c:2612
 __sys_sendmsg+0x117/0x1e0 net/socket.c:2641
 do_syscall_32_irqs_on arch/x86/entry/common.c:112 [inline]
 __do_fast_syscall_32+0x61/0xe0 arch/x86/entry/common.c:178
 do_fast_syscall_32+0x33/0x70 arch/x86/entry/common.c:203
 entry_SYSENTER_compat_after_hwframe+0x70/0x82
RIP: 0023:0xf7fc9579
Code: b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 00 00 00 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 8d b4 26 00 00 00 00 8d b4 26 00 00 00 00
RSP: 002b:00000000f7fc45ac EFLAGS: 00000292 ORIG_RAX: 0000000000000172
RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 0000000020000200
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000292 R12: 0000000000000000
R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
 </TASK>
----------------
Code disassembly (best guess), 2 bytes skipped:
   0:	10 06                	adc    %al,(%rsi)
   2:	03 74 b4 01          	add    0x1(%rsp,%rsi,4),%esi
   6:	10 07                	adc    %al,(%rdi)
   8:	03 74 b0 01          	add    0x1(%rax,%rsi,4),%esi
   c:	10 08                	adc    %cl,(%rax)
   e:	03 74 d8 01          	add    0x1(%rax,%rbx,8),%esi
  1e:	00 51 52             	add    %dl,0x52(%rcx)
  21:	55                   	push   %rbp
  22:	89 e5                	mov    %esp,%ebp
  24:	0f 34                	sysenter
  26:	cd 80                	int    $0x80
* 28:	5d                   	pop    %rbp <-- trapping instruction
  29:	5a                   	pop    %rdx
  2a:	59                   	pop    %rcx
  2b:	c3                   	ret
  2c:	90                   	nop
  2d:	90                   	nop
  2e:	90                   	nop
  2f:	90                   	nop
  30:	8d b4 26 00 00 00 00 	lea    0x0(%rsi,%riz,1),%esi
  37:	8d b4 26 00 00 00 00 	lea    0x0(%rsi,%riz,1),%esi


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* [PATCH 1/2] tg3: Increment tx_dropped in tg3_tso_bug()
From: alexey.pakhunov @ 2023-11-01 19:18 UTC (permalink / raw)
  To: siva.kallam
  Cc: vincent.wong2, netdev, linux-kernel, prashant, mchan,
	Alex Pakhunov
In-Reply-To: <20231101191858.2611154-1-alexey.pakhunov@spacex.com>

From: Alex Pakhunov <alexey.pakhunov@spacex.com>

tg3_tso_bug() drops a packet if it cannot be segmented for any reason.
The number of discarded frames should be incremeneted accordingly.

Signed-off-by: Alex Pakhunov <alexey.pakhunov@spacex.com>
Signed-off-by: Vincent Wong <vincent.wong2@spacex.com>
---
 drivers/net/ethernet/broadcom/tg3.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 14b311196b8f..99638e6c9e16 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -7874,8 +7874,10 @@ static int tg3_tso_bug(struct tg3 *tp, struct tg3_napi *tnapi,
 
 	segs = skb_gso_segment(skb, tp->dev->features &
 				    ~(NETIF_F_TSO | NETIF_F_TSO6));
-	if (IS_ERR(segs) || !segs)
+	if (IS_ERR(segs) || !segs) {
+		tp->tx_dropped++;
 		goto tg3_tso_bug_end;
+	}
 
 	skb_list_walk_safe(segs, seg, next) {
 		skb_mark_not_on_list(seg);
-- 
2.39.3


^ permalink raw reply related

* [PATCH 0/2] tg3: Fix the TX ring stall
From: alexey.pakhunov @ 2023-11-01 19:18 UTC (permalink / raw)
  To: siva.kallam
  Cc: vincent.wong2, netdev, linux-kernel, prashant, mchan,
	Alex Pakhunov

From: Alex Pakhunov <alexey.pakhunov@spacex.com>

This patch fixes the problem with tg3 driver we encountered on several
machines having Broadcom 5719 NIC. The problem showed up as a 10-20 seconds
interruption in network traffic and these dmegs message followed by the NIC
registers dump:

=== dmesg ===
NETDEV WATCHDOG: eth0 (tg3): transmit queue 0 timed out
...
RIP: 0010:dev_watchdog+0x21e/0x230
...
tg3 0000:02:00.2 eth0: transmit timed out, resetting
=== ===

The issue was observed with "4.15.0-52-lowlatency #56~16.04.1-Ubuntu" and
"4.15.0-161-lowlatency #169~16.04.1-Ubuntu" kernels.

Based on the state of the TX queue at the time of the reset and analysis of
dev_watchdog() it appeared that the NIC has not been notified about packets
accumulated in the TX ring for TG3_TX_TIMEOUT seconds and was reset:

=== dmesg ===
tg3 0000:02:00.2 eth0: 0: Host status block [00000001:000000a0:(0000:06d8:0000):(0000:01a0)]
tg3 0000:02:00.2 eth0: 0: NAPI info [000000a0:000000a0:(0188:01a0:01ff):0000:(06f2:0000:0000:0000)]
=== ===

tnapi->hw_status->idx[0].tx_consumer is the same as tnapi->tx_cons (0x1a0)
meaning that the driver has processed all TX descriptions released by
the NIC. tnapi->tx_prod (0x188) is ahead of 0x1a0 meaning that there are
more descriptors in the TX ring ready to be sent but the NIC does not know
about that yet.

Further analysis showed that tg3_start_xmit() can stop the TX queue and
not tell the NIC about already enqueued packets. The specific sequence
is:

1. tg3_start_xmit() is called at least once and queues packet(s) without
   updating tnapi->prodmbox (netdev_xmit_more() returns true)
2. tg3_start_xmit() is called with an SKB which causes tg3_tso_bug() to be
   called.
3. tg3_tso_bug() determines that the SKB is too large [L7860], ...

        if (unlikely(tg3_tx_avail(tnapi) <= frag_cnt_est)) {

   ... stops the queue [L7861], and returns NETDEV_TX_BUSY [L7870]:

        netif_tx_stop_queue(txq);
        ...
        if (tg3_tx_avail(tnapi) <= frag_cnt_est)
                return NETDEV_TX_BUSY;

4. Since all tg3_tso_bug() call sites directly return, the code updating
   tnapi->prodmbox [L8138] is skipped.

5. The queue is stuck now. tg3_start_xmit() is not called while the queue
   is stopped. The NIC is not processing new packets because
   tnapi->prodmbox wasn't updated. tg3_tx() is not called by
   tg3_poll_work() because the all TX descriptions that could be freed has
   been freed [L7159]:

        /* run TX completion thread */
        if (tnapi->hw_status->idx[0].tx_consumer != tnapi->tx_cons) {
                tg3_tx(tnapi);

6. Eventually, dev_watchdog() fires resetting the queue.

As far as I can tell this sequence is still possible in HEAD of master.

I could not reproduce this stall by generating traffic to match conditions
required for tg3_tso_bug() to be called. Based on the driver's code
the SKB must be a TSO or GSO skb; it should contain a VLAN tag or extra TCP
header options; and it should be queued at exactly the right time.
I believe that the last part is what makes reproducing it harder.

However I was able to reproduce the stall by mimicing the behavior of
tg3_tso_bug() in tg3_start_xmit(). I added the following lines to
tg3_start_xmit() before "would_hit_hwbug = 0;" [L8046]:

        if (...) {
                netif_tx_stop_queue(txq);
                return NETDEV_TX_BUSY;
        }

        would_hit_hwbug = 0;

The condition is not super relevant. It was used to control when the stall
is induced, so that the network is not completely broken dueing testing.
This approach reproduced the issue rather reliably.

The proposed fix makes sure that the tnapi->prodmbox update happens
regardless of the reason tg3_start_xmit() returned. It essentially moves
the code updating tnapi->prodmbox from tg3_start_xmit() (which is renamed
to __tg3_start_xmit()) to a new wrapper. This makes sure all retun paths
are covered.

I tested this fix with the code inducing the TX stall from above. The fix
eliminated stalls completely.

Note that initially, instead of adding a new wrapper function, I tried
adjusting all code paths exiting tg3_start_xmit() so that they go through
the branch updating tnapi->prodmbox. This approach seemed more fragile
than adding a wrapper.

Alex Pakhunov (2):
  tg3: Increment tx_dropped in tg3_tso_bug()
  tg3: Fix the TX ring stall

 drivers/net/ethernet/broadcom/tg3.c | 50 ++++++++++++++++++++++-------
 1 file changed, 38 insertions(+), 12 deletions(-)

base-commit: ffc253263a1375a65fa6c9f62a893e9767fbebfa
-- 
2.39.3

^ permalink raw reply

* [PATCH 2/2] tg3: Fix the TX ring stall
From: alexey.pakhunov @ 2023-11-01 19:18 UTC (permalink / raw)
  To: siva.kallam
  Cc: vincent.wong2, netdev, linux-kernel, prashant, mchan,
	Alex Pakhunov
In-Reply-To: <20231101191858.2611154-1-alexey.pakhunov@spacex.com>

From: Alex Pakhunov <alexey.pakhunov@spacex.com>

The TX ring maintained by the tg3 driver can end up in a state, when it
has packets queued for sending but the NIC hardware is not informed, so no
progress is made. This leads to a multi-second interruption in network
traffic followed by dev_watchdog() firing and resetting the queue.

The specific sequence of steps is:

1. tg3_start_xmit() is called at least once and queues packet(s) without
   updating tnapi->prodmbox (netdev_xmit_more() returns true)
2. tg3_start_xmit() is called with an SKB which causes tg3_tso_bug() to be
   called.
3. tg3_tso_bug() determines that the SKB is too large, ...

        if (unlikely(tg3_tx_avail(tnapi) <= frag_cnt_est)) {

   ... stops the queue, and returns NETDEV_TX_BUSY:

        netif_tx_stop_queue(txq);
        ...
        if (tg3_tx_avail(tnapi) <= frag_cnt_est)
                return NETDEV_TX_BUSY;

4. Since all tg3_tso_bug() call sites directly return, the code updating
   tnapi->prodmbox is skipped.

5. The queue is stuck now. tg3_start_xmit() is not called while the queue
   is stopped. The NIC is not processing new packets because
   tnapi->prodmbox wasn't updated. tg3_tx() is not called by
   tg3_poll_work() because the all TX descriptions that could be freed has
   been freed:

        /* run TX completion thread */
        if (tnapi->hw_status->idx[0].tx_consumer != tnapi->tx_cons) {
                tg3_tx(tnapi);

6. Eventually, dev_watchdog() fires triggering a reset of the queue.

This fix makes sure that the tnapi->prodmbox update happens regardless of
the reason tg3_start_xmit() returned.

Signed-off-by: Alex Pakhunov <alexey.pakhunov@spacex.com>
Signed-off-by: Vincent Wong <vincent.wong2@spacex.com>
---
 drivers/net/ethernet/broadcom/tg3.c | 46 ++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 99638e6c9e16..c3512409434e 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -6603,9 +6603,9 @@ static void tg3_tx(struct tg3_napi *tnapi)
 
 	tnapi->tx_cons = sw_idx;
 
-	/* Need to make the tx_cons update visible to tg3_start_xmit()
+	/* Need to make the tx_cons update visible to __tg3_start_xmit()
 	 * before checking for netif_queue_stopped().  Without the
-	 * memory barrier, there is a small possibility that tg3_start_xmit()
+	 * memory barrier, there is a small possibility that __tg3_start_xmit()
 	 * will miss it and cause the queue to be stopped forever.
 	 */
 	smp_mb();
@@ -7845,7 +7845,7 @@ static bool tg3_tso_bug_gso_check(struct tg3_napi *tnapi, struct sk_buff *skb)
 	return skb_shinfo(skb)->gso_segs < tnapi->tx_pending / 3;
 }
 
-static netdev_tx_t tg3_start_xmit(struct sk_buff *, struct net_device *);
+static netdev_tx_t __tg3_start_xmit(struct sk_buff *, struct net_device *);
 
 /* Use GSO to workaround all TSO packets that meet HW bug conditions
  * indicated in tg3_tx_frag_set()
@@ -7881,7 +7881,7 @@ static int tg3_tso_bug(struct tg3 *tp, struct tg3_napi *tnapi,
 
 	skb_list_walk_safe(segs, seg, next) {
 		skb_mark_not_on_list(seg);
-		tg3_start_xmit(seg, tp->dev);
+		__tg3_start_xmit(seg, tp->dev);
 	}
 
 tg3_tso_bug_end:
@@ -7891,7 +7891,7 @@ static int tg3_tso_bug(struct tg3 *tp, struct tg3_napi *tnapi,
 }
 
 /* hard_start_xmit for all devices */
-static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t __tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct tg3 *tp = netdev_priv(dev);
 	u32 len, entry, base_flags, mss, vlan = 0;
@@ -8135,11 +8135,6 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 			netif_tx_wake_queue(txq);
 	}
 
-	if (!netdev_xmit_more() || netif_xmit_stopped(txq)) {
-		/* Packets are ready, update Tx producer idx on card. */
-		tw32_tx_mbox(tnapi->prodmbox, entry);
-	}
-
 	return NETDEV_TX_OK;
 
 dma_error:
@@ -8152,6 +8147,35 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	return NETDEV_TX_OK;
 }
 
+static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	u16 skb_queue_mapping = skb_get_queue_mapping(skb);
+	struct netdev_queue *txq = netdev_get_tx_queue(dev, skb_queue_mapping);
+
+	netdev_tx_t ret = __tg3_start_xmit(skb, dev);
+
+	/* Notify the hardware that packets are ready by updating the TX ring
+	 * tail pointer. We respect netdev_xmit_more() thus avoiding poking
+	 * the hardware for every packet. To guarantee forward progress the TX
+	 * ring must be drained when it is full as indicated by
+	 * netif_xmit_stopped(). This needs to happen even when the current
+	 * skb was dropped or rejected with NETDEV_TX_BUSY. Otherwise packets
+	 * queued by previous __tg3_start_xmit() calls might get stuck in
+	 * the queue forever.
+	 */
+	if (!netdev_xmit_more() || netif_xmit_stopped(txq)) {
+		struct tg3 *tp = netdev_priv(dev);
+		struct tg3_napi *tnapi = &tp->napi[skb_queue_mapping];
+
+		if (tg3_flag(tp, ENABLE_TSS))
+			tnapi++;
+
+		tw32_tx_mbox(tnapi->prodmbox, tnapi->tx_prod);
+	}
+
+	return ret;
+}
+
 static void tg3_mac_loopback(struct tg3 *tp, bool enable)
 {
 	if (enable) {
@@ -17682,7 +17706,7 @@ static int tg3_init_one(struct pci_dev *pdev,
 	 * device behind the EPB cannot support DMA addresses > 40-bit.
 	 * On 64-bit systems with IOMMU, use 40-bit dma_mask.
 	 * On 64-bit systems without IOMMU, use 64-bit dma_mask and
-	 * do DMA address check in tg3_start_xmit().
+	 * do DMA address check in __tg3_start_xmit().
 	 */
 	if (tg3_flag(tp, IS_5788))
 		persist_dma_mask = dma_mask = DMA_BIT_MASK(32);
-- 
2.39.3


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox