Netdev List
 help / color / mirror / Atom feed
* [PATCH v1 net-next 07/10] net: fib_rules: Drop RTNL assertions.
From: Kuniyuki Iwashima @ 2026-06-29 18:10 UTC (permalink / raw)
  To: David Ahern, Ido Schimmel, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260629181226.1929658-1-kuniyu@google.com>

Now, fib_rule structs are protected by per-fib_rules_ops mutex.

Let's drop ASSERT_RTNL_NET() and rtnl_dereference().

Note that fib_rules_event() iterates over net->rules_ops without
net->rules_mod_lock, but this is fine because all fib_rule users
are built-in and concurrent fib_rules_unregister() does not happen.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/core/fib_rules.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 25a3fd997782..5eef5d6ace82 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -387,7 +387,6 @@ static int call_fib_rule_notifiers(struct net *net,
 		.rule = rule,
 	};
 
-	ASSERT_RTNL_NET(net);
 	lockdep_assert_held(&ops->lock);
 
 	/* Paired with READ_ONCE() in fib_rules_seq() */
@@ -955,7 +954,7 @@ int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 		list_for_each_entry(r, &ops->rules_list, list) {
 			if (r->action == FR_ACT_GOTO &&
 			    r->target == rule->pref &&
-			    rtnl_dereference(r->ctarget) == NULL) {
+			    !rcu_access_pointer(r->ctarget)) {
 				rcu_assign_pointer(r->ctarget, rule);
 				if (--ops->unresolved_rules == 0)
 					break;
@@ -1064,7 +1063,7 @@ int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	if (rule->action == FR_ACT_GOTO) {
 		ops->nr_goto_rules--;
-		if (rtnl_dereference(rule->ctarget) == NULL)
+		if (!rcu_access_pointer(rule->ctarget))
 			ops->unresolved_rules--;
 	}
 
@@ -1082,7 +1081,7 @@ int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 		if (&n->list == &ops->rules_list || n->pref != rule->pref)
 			n = NULL;
 		list_for_each_entry(r, &ops->rules_list, list) {
-			if (rtnl_dereference(r->ctarget) != rule)
+			if (rcu_access_pointer(r->ctarget) != rule)
 				continue;
 			rcu_assign_pointer(r->ctarget, n);
 			if (!n)
@@ -1400,8 +1399,6 @@ static int fib_rules_event(struct notifier_block *this, unsigned long event,
 	struct net *net = dev_net(dev);
 	struct fib_rules_ops *ops;
 
-	ASSERT_RTNL();
-
 	switch (event) {
 	case NETDEV_REGISTER:
 		list_for_each_entry(ops, &net->rules_ops, list) {
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related

* [PATCH v1 net-next 08/10] net: fib_rules: Use dev_get_by_name_rcu().
From: Kuniyuki Iwashima @ 2026-06-29 18:11 UTC (permalink / raw)
  To: David Ahern, Ido Schimmel, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260629181226.1929658-1-kuniyu@google.com>

We will no longer hold RTNL for RTM_NEWRULE and RTM_DELRULE
except for the first IPv4 RTM_NEWRULE.

Let's convert __dev_get_by_name() in fib_nl2rule_rtnl() to
dev_get_by_name_rcu() and rename it to fib_nl2rule_locked().

Note that dev_get_by_name_rcu() must be called inside ops->lock
to serialise fib_rules_event() by __dev_change_net_namespace().

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/core/fib_rules.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 5eef5d6ace82..2b652dd83241 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -734,10 +734,10 @@ static int fib_nl2rule(struct net *net, struct nlmsghdr *nlh,
 	return err;
 }
 
-static int fib_nl2rule_rtnl(struct fib_rule *nlrule,
-			    struct fib_rules_ops *ops,
-			    struct nlattr *tb[],
-			    struct netlink_ext_ack *extack)
+static int fib_nl2rule_locked(struct fib_rule *nlrule,
+			      struct fib_rules_ops *ops,
+			      struct nlattr *tb[],
+			      struct netlink_ext_ack *extack)
 {
 	if (!tb[FRA_PRIORITY])
 		nlrule->pref = fib_default_rule_pref(ops);
@@ -748,12 +748,14 @@ static int fib_nl2rule_rtnl(struct fib_rule *nlrule,
 		return -EINVAL;
 	}
 
+	rcu_read_lock();
+
 	if (tb[FRA_IIFNAME]) {
 		struct net_device *dev;
 
-		dev = __dev_get_by_name(nlrule->fr_net, nlrule->iifname);
+		dev = dev_get_by_name_rcu(nlrule->fr_net, nlrule->iifname);
 		if (dev) {
-			nlrule->iifindex = dev->ifindex;
+			nlrule->iifindex = READ_ONCE(dev->ifindex);
 			nlrule->iif_is_l3_master = netif_is_l3_master(dev);
 		}
 	}
@@ -761,13 +763,15 @@ static int fib_nl2rule_rtnl(struct fib_rule *nlrule,
 	if (tb[FRA_OIFNAME]) {
 		struct net_device *dev;
 
-		dev = __dev_get_by_name(nlrule->fr_net, nlrule->oifname);
+		dev = dev_get_by_name_rcu(nlrule->fr_net, nlrule->oifname);
 		if (dev) {
-			nlrule->oifindex = dev->ifindex;
+			nlrule->oifindex = READ_ONCE(dev->ifindex);
 			nlrule->oif_is_l3_master = netif_is_l3_master(dev);
 		}
 	}
 
+	rcu_read_unlock();
+
 	return 0;
 }
 
@@ -906,7 +910,7 @@ int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 		rtnl_net_lock(net);
 	mutex_lock(&ops->lock);
 
-	err = fib_nl2rule_rtnl(rule, ops, tb, extack);
+	err = fib_nl2rule_locked(rule, ops, tb, extack);
 	if (err)
 		goto errout_free;
 
@@ -1038,7 +1042,7 @@ int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 		rtnl_net_lock(net);
 	mutex_lock(&ops->lock);
 
-	err = fib_nl2rule_rtnl(nlrule, ops, tb, extack);
+	err = fib_nl2rule_locked(nlrule, ops, tb, extack);
 	if (err)
 		goto errout_free;
 
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related

* [PATCH v1 net-next 09/10] net: fib_rules: Only hold RTNL for the first IPv4 RTM_NEWRULE.
From: Kuniyuki Iwashima @ 2026-06-29 18:11 UTC (permalink / raw)
  To: David Ahern, Ido Schimmel, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260629181226.1929658-1-kuniyu@google.com>

Now, RTM_DELRULE no longer needs RTNL, and the only RTNL dependant
in RTM_NEWRULE is fib_unmerge(), which is called for the first
IPv4 rule.

Let's add fib_rules_ops.need_rtnl() and hold RTNL only for the
first IPv4 rule.

Tested:
The script below creates 1K rules in parallel in 4K netns, and
it got 20x/30x faster for IPv4/IPv6.

  #!/bin/bash
  N=4096
  F=rules.txt

  for i in $(seq $N); do ip netns add ns-$i; done
  printf 'rule add from all table %d\n' {1..1024} > $F

  for v in 4 6; do
  	echo "=== IPv${v} ==="
  	time { for i in $(seq $N); do nsenter \
  	--net=/var/run/netns/ns-$i ip -$v -batch $F & done; wait; }
  done

  for i in $(seq $N); do ip netns del ns-$i; done
  rm -f $F

Without this series:

  # ./test.sh
  === IPv4 ===

  real	0m22.752s
  user	0m7.834s
  sys	92m46.721s
  === IPv6 ===

  real	0m35.181s
  user	0m8.635s
  sys	142m30.479s

With this series:

  # ./test.sh
  === IPv4 ===

  real	0m0.918s
  user	0m5.675s
  sys	2m7.024s
  === IPv6 ===

  real	0m1.214s
  user	0m7.917s
  sys	4m19.489s

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 include/net/fib_rules.h |  1 +
 net/core/fib_rules.c    | 15 ++++++---------
 net/ipv4/fib_rules.c    |  6 ++++++
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index 7636ef4da5ad..c6b94790fa81 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -93,6 +93,7 @@ struct fib_rules_ops {
 	/* Called after modifications to the rules set, must flush
 	 * the route cache if one exists. */
 	void			(*flush_cache)(struct fib_rules_ops *ops);
+	bool			(*need_rtnl)(struct net *net);
 
 	int			nlgroup;
 	struct list_head	rules_list;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 2b652dd83241..22e5e5e1a9c4 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -881,6 +881,7 @@ int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct nlattr *tb[FRA_MAX + 1];
 	bool user_priority = false;
 	struct fib_rule_hdr *frh;
+	bool unlock_rtnl = false;
 
 	frh = nlmsg_payload(nlh, sizeof(*frh));
 	if (!frh) {
@@ -906,8 +907,10 @@ int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		goto errout;
 
-	if (!rtnl_held)
+	if (!rtnl_held && ops->need_rtnl && ops->need_rtnl(net)) {
+		unlock_rtnl = true;
 		rtnl_net_lock(net);
+	}
 	mutex_lock(&ops->lock);
 
 	err = fib_nl2rule_locked(rule, ops, tb, extack);
@@ -978,7 +981,7 @@ int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 	fib_rule_get(rule);
 
 	mutex_unlock(&ops->lock);
-	if (!rtnl_held)
+	if (unlock_rtnl)
 		rtnl_net_unlock(net);
 
 	notify_rule_change(RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
@@ -989,7 +992,7 @@ int fib_newrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 
 errout_free:
 	mutex_unlock(&ops->lock);
-	if (!rtnl_held)
+	if (unlock_rtnl)
 		rtnl_net_unlock(net);
 	kfree(rule);
 errout:
@@ -1038,8 +1041,6 @@ int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (err)
 		goto errout;
 
-	if (!rtnl_held)
-		rtnl_net_lock(net);
 	mutex_lock(&ops->lock);
 
 	err = fib_nl2rule_locked(nlrule, ops, tb, extack);
@@ -1096,8 +1097,6 @@ int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 	call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule, ops, NULL);
 
 	mutex_unlock(&ops->lock);
-	if (!rtnl_held)
-		rtnl_net_unlock(net);
 
 	notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).portid);
 	fib_rule_put(rule);
@@ -1108,8 +1107,6 @@ int fib_delrule(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh,
 
 errout_free:
 	mutex_unlock(&ops->lock);
-	if (!rtnl_held)
-		rtnl_net_unlock(net);
 	kfree(nlrule);
 errout:
 	rules_ops_put(ops);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 16d202246a36..4edb0dca7be8 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -460,6 +460,11 @@ static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
 	rt_cache_flush(ops->fro_net);
 }
 
+static bool fib4_rule_need_rtnl(struct net *net)
+{
+	return !net->ipv4.fib_has_custom_rules;
+}
+
 static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
 	.family		= AF_INET,
 	.rule_size	= sizeof(struct fib4_rule),
@@ -473,6 +478,7 @@ static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = {
 	.fill		= fib4_rule_fill,
 	.nlmsg_payload	= fib4_rule_nlmsg_payload,
 	.flush_cache	= fib4_rule_flush_cache,
+	.need_rtnl	= fib4_rule_need_rtnl,
 	.nlgroup	= RTNLGRP_IPV4_RULE,
 	.owner		= THIS_MODULE,
 };
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related

* [PATCH v1 net-next 10/10] ipv6: fib_rules: Convert fib6_rules_net_exit_batch() to ->exit().
From: Kuniyuki Iwashima @ 2026-06-29 18:11 UTC (permalink / raw)
  To: David Ahern, Ido Schimmel, David S . Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In-Reply-To: <20260629181226.1929658-1-kuniyu@google.com>

Now fib_rule is protected by per-ops mutex.

fib6_rules_net_exit_batch() no longer needs RTNL.

Let's convert it to ->exit() and drop RTNL.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
 net/ipv6/fib6_rules.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 5ab4dde07225..04dab9329d0c 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -635,21 +635,14 @@ static int __net_init fib6_rules_net_init(struct net *net)
 	goto out;
 }
 
-static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list)
+static void __net_exit fib6_rules_net_exit(struct net *net)
 {
-	struct net *net;
-
-	rtnl_lock();
-	list_for_each_entry(net, net_list, exit_list) {
-		fib_rules_unregister(net->ipv6.fib6_rules_ops);
-		cond_resched();
-	}
-	rtnl_unlock();
+	fib_rules_unregister(net->ipv6.fib6_rules_ops);
 }
 
 static struct pernet_operations fib6_rules_net_ops = {
 	.init = fib6_rules_net_init,
-	.exit_batch = fib6_rules_net_exit_batch,
+	.exit = fib6_rules_net_exit,
 };
 
 int __init fib6_rules_init(void)
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related

* Re: [PATCH] selftests/bpf: Mask socket type flags in mptcpify prog
From: Amery Hung @ 2026-06-29 18:12 UTC (permalink / raw)
  To: Guillaume @layus Maudoux
  Cc: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard Zingerman, Matthieu Baerts,
	Mat Martineau, Geliang Tang, Shuah Khan, bpf, mptcp, netdev,
	linux-kselftest, linux-kernel
In-Reply-To: <20260629125637.384923-1-layus.on@gmail.com>

On Mon, Jun 29, 2026 at 6:19 AM Guillaume @layus Maudoux
<layus.on@gmail.com> wrote:
>
> The mptcpify BPF prog hooks update_socket_protocol() to rewrite
> eligible TCP socket() calls to IPPROTO_MPTCP. It only does so when the
> socket type is exactly SOCK_STREAM:
>
>         type == SOCK_STREAM
>
> The problem is that update_socket_protocol() in __sys_socket() is
> called on the raw type argument as passed from userspace, before
> __sys_socket_create() strips the flag bits with
> "type &= SOCK_TYPE_MASK". The type argument may therefore carry
> SOCK_CLOEXEC and/or SOCK_NONBLOCK in its upper bits, and the equality
> check above then fails.
>
> As a result, any socket created with e.g.
> socket(AF_INET, SOCK_STREAM | SOCK_CLOEXEC, 0) -- which is what common
> libraries do by default -- is silently left as plain TCP instead of
> being upgraded to MPTCP. This was observed in practice with curl,
> whose connections were not upgraded to MPTCP despite the prog being
> attached.
>
> The impact reaches beyond the test, because mptcpify.c is referenced
> as example code for users who want to transparently enable MPTCP. The
> same mistake is therefore likely to be copied into real deployments,
> where it fails the same way and is hard to diagnose.
>
> The fix is to mask off the flag bits before comparing, mirroring what
> the socket core does:
>
>         (type & SOCK_TYPE_MASK) == SOCK_STREAM
>
> Since SOCK_TYPE_MASK is not exposed through vmlinux.h, define it in
> bpf_tracing_net.h.
>
> To exercise the regression directly, extend the mptcpify test to also
> create the server socket with SOCK_CLOEXEC and SOCK_NONBLOCK set.
> Routing a flagged type through start_server() then revealed a second
> instance of the same pattern: start_server_addr() compared the type
> against SOCK_STREAM for equality to decide whether to set SO_REUSEADDR
> and call listen(), and so would skip listening for a flagged type.
> Mask the type there as well. As SOCK_TYPE_MASK is not exposed by
> glibc's <sys/socket.h> either, define it in network_helpers.h,
> mirroring prog_tests/socket_helpers.h.
>
> Fixes: ddba122428a7 ("selftests/bpf: Add mptcpify test")
> Signed-off-by: Guillaume @layus Maudoux <layus.on@gmail.com>

Hello,

You will need to use your real name in the SOB. At least "@" caused
some problems when I downloaded the patch.

Please also include "bpf-next" in your subject prefix so that the CI
can properly test it.

> ---
>  tools/testing/selftests/bpf/network_helpers.c |  4 ++--
>  tools/testing/selftests/bpf/network_helpers.h |  5 +++++
>  .../testing/selftests/bpf/prog_tests/mptcp.c  | 20 +++++++++++++++----
>  .../selftests/bpf/progs/bpf_tracing_net.h     |  3 +++
>  tools/testing/selftests/bpf/progs/mptcpify.c  |  2 +-
>  5 files changed, 27 insertions(+), 7 deletions(-)
>
> diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
> index b82f572641b7..db935a9d9fc1 100644
> --- a/tools/testing/selftests/bpf/network_helpers.c
> +++ b/tools/testing/selftests/bpf/network_helpers.c
> @@ -111,7 +111,7 @@ int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t a
>         if (settimeo(fd, opts->timeout_ms))
>                 goto error_close;
>
> -       if (type == SOCK_STREAM &&
> +       if ((type & SOCK_TYPE_MASK) == SOCK_STREAM &&
>             setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on))) {
>                 log_err("Failed to enable SO_REUSEADDR");
>                 goto error_close;
> @@ -128,7 +128,7 @@ int start_server_addr(int type, const struct sockaddr_storage *addr, socklen_t a
>                 goto error_close;
>         }
>
> -       if (type == SOCK_STREAM) {
> +       if ((type & SOCK_TYPE_MASK) == SOCK_STREAM) {
>                 if (listen(fd, opts->backlog ? MAX(opts->backlog, 0) : 1) < 0) {
>                         log_err("Failed to listed on socket");
>                         goto error_close;
> diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
> index 79a010c88e11..75133119c04a 100644
> --- a/tools/testing/selftests/bpf/network_helpers.h
> +++ b/tools/testing/selftests/bpf/network_helpers.h
> @@ -25,6 +25,11 @@ typedef __u16 __sum16;
>  #define VIP_NUM 5
>  #define MAGIC_BYTES 123
>
> +/* include/linux/net.h */
> +#ifndef SOCK_TYPE_MASK
> +#define SOCK_TYPE_MASK 0xf
> +#endif
> +
>  struct network_helper_opts {
>         int timeout_ms;
>         int proto;
> diff --git a/tools/testing/selftests/bpf/prog_tests/mptcp.c b/tools/testing/selftests/bpf/prog_tests/mptcp.c
> index 8fade8bdc451..faa001ea84ab 100644
> --- a/tools/testing/selftests/bpf/prog_tests/mptcp.c
> +++ b/tools/testing/selftests/bpf/prog_tests/mptcp.c
> @@ -264,7 +264,7 @@ static int verify_mptcpify(int server_fd, int client_fd)
>         return err;
>  }
>
> -static int run_mptcpify(int cgroup_fd)
> +static int run_mptcpify(int cgroup_fd, int type)
>  {
>         int server_fd, client_fd, err = 0;
>         struct mptcpify *mptcpify_skel;
> @@ -280,7 +280,7 @@ static int run_mptcpify(int cgroup_fd)
>                 goto out;
>
>         /* without MPTCP */
> -       server_fd = start_server(AF_INET, SOCK_STREAM, NULL, 0, 0);
> +       server_fd = start_server(AF_INET, type, NULL, 0, 0);
>         if (!ASSERT_GE(server_fd, 0, "start_server")) {
>                 err = -EIO;
>                 goto out;
> @@ -307,7 +307,18 @@ static int run_mptcpify(int cgroup_fd)
>  static void test_mptcpify(void)
>  {
>         struct netns_obj *netns = NULL;
> -       int cgroup_fd;
> +       int cgroup_fd, i;
> +       int types[] = {
> +               SOCK_STREAM,
> +               /* userspace sets these flags together with the type, and the
> +                * BPF prog must still upgrade the socket to MPTCP. See
> +                * update_socket_protocol() in net/socket.c, which runs before
> +                * the type is masked with SOCK_TYPE_MASK.
> +                */
> +               SOCK_STREAM | SOCK_CLOEXEC,
> +               SOCK_STREAM | SOCK_NONBLOCK,
> +               SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK,
> +       };
>
>         cgroup_fd = test__join_cgroup("/mptcpify");
>         if (!ASSERT_GE(cgroup_fd, 0, "test__join_cgroup"))
> @@ -317,7 +328,8 @@ static void test_mptcpify(void)
>         if (!ASSERT_OK_PTR(netns, "netns_new"))
>                 goto fail;
>
> -       ASSERT_OK(run_mptcpify(cgroup_fd), "run_mptcpify");
> +       for (i = 0; i < ARRAY_SIZE(types); i++)
> +               ASSERT_OK(run_mptcpify(cgroup_fd, types[i]), "run_mptcpify");

nit: testing all four type combinations adds little value here. I'd
drop array+loop and keep two. For example:

  ASSERT_OK(run_mptcpify(cgroup_fd, SOCK_STREAM), "run_mptcpify");
  ASSERT_OK(run_mptcpify(cgroup_fd, SOCK_STREAM | SOCK_CLOEXEC),
"run_mptcpify_cloexec");

>
>  fail:
>         netns_free(netns);
> diff --git a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
> index d8dacef37c16..c4b438854565 100644
> --- a/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
> +++ b/tools/testing/selftests/bpf/progs/bpf_tracing_net.h
> @@ -8,6 +8,9 @@
>  #define AF_INET                        2
>  #define AF_INET6               10
>
> +/* include/linux/net.h */
> +#define SOCK_TYPE_MASK         0xf
> +
>  #define SOL_SOCKET             1
>  #define SO_REUSEADDR           2
>  #define SO_SNDBUF              7
> diff --git a/tools/testing/selftests/bpf/progs/mptcpify.c b/tools/testing/selftests/bpf/progs/mptcpify.c
> index cbdc730c3a47..e3f8cb54dbe9 100644
> --- a/tools/testing/selftests/bpf/progs/mptcpify.c
> +++ b/tools/testing/selftests/bpf/progs/mptcpify.c
> @@ -15,7 +15,7 @@ int BPF_PROG(mptcpify, int family, int type, int protocol)
>                 return protocol;
>
>         if ((family == AF_INET || family == AF_INET6) &&
> -           type == SOCK_STREAM &&
> +           (type & SOCK_TYPE_MASK) == SOCK_STREAM &&
>             (!protocol || protocol == IPPROTO_TCP)) {
>                 return IPPROTO_MPTCP;
>         }
> --
> 2.54.0
>
>

^ permalink raw reply

* Re: the confusing 10000base_CR. Shouldn't it be 10000_SFI_DA?
From: Michal Kubecek @ 2026-06-29 18:14 UTC (permalink / raw)
  To: Maxime Chevallier
  Cc: D H, Siddaraju, Andrew Lunn, netdev@vger.kernel.org, Das, Shubham,
	Chintalapalle, Balaji, Srinivasan, Vijay
In-Reply-To: <1b3975a8-788a-4b81-94ec-3ab5708b251b@bootlin.com>

[-- Attachment #1: Type: text/plain, Size: 3328 bytes --]

On Mon, Jun 29, 2026 at 11:30:43AM GMT, Maxime Chevallier wrote:
> > What about
> > "option-(b): create a new enum ETHTOOL_LINK_MODE_10G_SFI_DA_Full_BIT"?
> >   Idea is just to create a new enum, with same enum value of 10000baseCR.
> >   This will NOT consume a bit position in "ethtool_link_mode_bit_indices".
> >   It just helps those tech-savvy people who do not accept 10000baseCR
> >   and prefer 10000sfiDA for being explicit.
> 
> The thing is that even with a new enum value, that won't bring much to
> the table. It would likely be better to have a comment near the
> 10000baseCR definition explaining the SFF equivalency.
> 
> > 
> > At worst case, hope we agree for
> > "option-(c): ethtool.8 man page help strings to indicate 10G_SFI_DA"
> >   Something like
> >     "10000baseCR (10G_SFI_DA    SFF-8431 SFP+ DA)
> >   under "advertise" mask values.
> 
> In that case, let's add Michal in the loop as the ethtool maintainer. Even
> then it's not straightforward as some tooling relies on the JSON output
> from ethtool, so _if_ we change the output for that mode, it should only
> be in the non-json output.

The biggest problem I see here is that even if ethtool has its own link
mode tables, those are only for backward compatibility and as long as
reasonably new ethtool and kernel are in use, ethtool gets the link mode
names from kernel. There are multiple aspects:

1. The UAPI constants. These are part of kernel uAPI and ethtool just
gets a sanitized copy of kernel file. In other words, if we want any
change here (even adding a comment), it has to be done on kernel side
and ethtool gets the change on next uapi sync. Adding an alias here
would be quite simple.

2. Link mode names as shown in "ethtool <dev>" output. These are
normally retrieved from kernel via the ETH_SS_LINK_MODES string set.
In other words, if any change were desired, it would have to happen on
kernel side as well. And unless we add a specific exception to the code,
the same name is shown in both plain text and json output. (Personally
I don't think it would be a good idea to show different name in each.)

3. Link mode names that can be used in "ethtool -s" command line. These
are handled exactly the same way as names used for output. Here I could
imagine an alias implemented on ethtool side but it would have to be
hard coded in ethtool and it would only work in (new versions of)
ethtool. In general, names shown by "ethtool <dev>" should match those
expected on "ethtool -s" command line, any mismatch would be very
confusing and impractical.

4. Adding a text to ethtool(8) manual page is possible. While most of
the link mode table in the man page is (unsurprisingly) generated, it is
then reordered manually and adding a note would definitely be possible.
One just needs to be careful about the table formatting. (Yes, I'm aware
that this table has been getting out of hand for some time and I'm
thinking about ways to make it more readable and useful. Suggestions are
welcome.)

> My personal opinion would be that adding a comment in the enum definition
> near 10000baseCR is enough :/

That would certainly be the easiest solution but as I said above, it
would be a kernel code change, ethtool would just inherit the updated
header file.

Michal

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply

* [PATCH net-next V4 1/6] net/mlx5: Clear FW reset-in-progress bit before reload
From: Mark Bloch @ 2026-06-29 18:20 UTC (permalink / raw)
  To: Jiri Pirko, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Andrew Lunn,
	Jonathan Corbet, Shuah Khan, netdev, linux-rdma, linux-doc,
	Mark Bloch, Shay Drori, Moshe Shemesh
In-Reply-To: <20260629182102.245150-1-mbloch@nvidia.com>

mlx5 sets MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS when acknowledging a sync
reset request. This bit blocks devlink reload and other devlink operations
while the firmware reset is running, but it was kept set until after the
driver reload finished.

Clear the reset-in-progress bit once the reset unload flow is done and PCI
access is back, before reloading the device. For a reset initiated through
devlink, clear it before completing the reload waiter. For a reset reported
through an asynchronous firmware event, keep the unload flow outside
devl_lock, then take devl_lock before clearing the bit and reloading
through the devl-locked load helper.

Reviewed-by: Shay Drori <shayd@nvidia.com>
Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
---
 .../ethernet/mellanox/mlx5/core/fw_reset.c    | 28 +++++++++++--------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
index 07440c58713a..7283e5b49eed 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/fw_reset.c
@@ -238,24 +238,30 @@ static void mlx5_fw_reset_complete_reload(struct mlx5_core_dev *dev)
 {
 	struct mlx5_fw_reset *fw_reset = dev->priv.fw_reset;
 	struct devlink *devlink = priv_to_devlink(dev);
+	int err;
 
 	/* if this is the driver that initiated the fw reset, devlink completed the reload */
 	if (test_bit(MLX5_FW_RESET_FLAGS_PENDING_COMP, &fw_reset->reset_flags)) {
+		clear_bit(MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS,
+			  &fw_reset->reset_flags);
 		complete(&fw_reset->done);
-	} else {
-		mlx5_sync_reset_unload_flow(dev, false);
-		if (mlx5_health_wait_pci_up(dev))
-			mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
-		else
-			mlx5_load_one(dev, true);
-		devl_lock(devlink);
-		devlink_remote_reload_actions_performed(devlink, 0,
-							BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
-							BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
-		devl_unlock(devlink);
+		return;
 	}
 
+	mlx5_sync_reset_unload_flow(dev, false);
+	err = mlx5_health_wait_pci_up(dev);
+
+	devl_lock(devlink);
 	clear_bit(MLX5_FW_RESET_FLAGS_RESET_IN_PROGRESS, &fw_reset->reset_flags);
+	if (err)
+		mlx5_core_err(dev, "reset reload flow aborted, PCI reads still not working\n");
+	else
+		mlx5_load_one_devl_locked(dev, true);
+
+	devlink_remote_reload_actions_performed(devlink, 0,
+						BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT) |
+						BIT(DEVLINK_RELOAD_ACTION_FW_ACTIVATE));
+	devl_unlock(devlink);
 }
 
 static void mlx5_stop_sync_reset_poll(struct mlx5_core_dev *dev)
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next V4 0/6] devlink: Add boot-time eswitch mode defaults
From: Mark Bloch @ 2026-06-29 18:20 UTC (permalink / raw)
  To: Jiri Pirko, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Andrew Lunn,
	Jonathan Corbet, Shuah Khan, netdev, linux-rdma, linux-doc,
	Mark Bloch

This series adds a devlink_eswitch_mode= kernel command line parameter
for setting a default devlink eswitch mode during boot.

Following the discussion with Jakub[1] and the feedback on the RFC
postings, this version keeps the scope limited to a boot-time devlink
eswitch mode default only.

The option selects either all devlink handles or an explicit
comma-separated handle list:

devlink_eswitch_mode=*=switchdev
devlink_eswitch_mode=pci/0000:08:00.0,pci/0000:09:00.1=switchdev_inactive

The supported modes are legacy, switchdev and switchdev_inactive. The
selected mode is applied through the existing eswitch_mode_set() devlink
operation, the same operation used by the devlink eswitch mode command.

Registration may happen before a driver is ready to change eswitch mode,
so devlink core queues an asynchronous apply request from devl_register().
The worker takes the devlink instance lock before calling into the driver.

After a successful reload that performed DRIVER_REINIT, devlink core
already holds the devlink instance lock and the driver completed
reload_up(), so the default is applied directly from the reload path.

Drivers that know exactly when the device is ready can call
devl_apply_default_esw_mode() directly. mlx5 uses this after initial
probe, when the device is initialized and the devlink lock is already
held.

Patch 1 clears the mlx5 FW reset-in-progress bit before reload.

Patch 2 factors the common eswitch mode set validation into a helper.

Patch 3 adds the devlink_eswitch_mode= parser and documentation.

Patch 4 applies parsed defaults from devlink core.

Patch 5 adds devl_apply_default_esw_mode() for drivers.

Patch 6 wires mlx5 to apply the default after initial probe.

Changelog:

v3 -> v4:

- Rework registration time apply to use per devlink delayed work instead
  of calling eswitch_mode_set() directly from devl_register().

- Apply the default directly after successful DRIVER_REINIT devlink reload,
  where the devlink lock is already held and reload_up() has completed.

- Add devl_apply_default_esw_mode() for drivers that know their exact ready
  point.

- Drop the driver registration-ordering preparation patches that are no
  longer needed with the async registration apply path.

v2 -> v3:

- Change the devlink_eswitch_mode= API syntax to use <selector>=<mode>
  instead of [<selector>]:<mode>, following a comment from Randy Dunlap.

v1 -> v2:

- Move default eswitch mode application into devlink core. The default is
  now applied during devlink registration and after a successful devlink
  reload that performed DRIVER_REINIT.

- Remove the exported devl_apply_default_esw_mode() driver API and the mlx5
  driver-side call to it.

- Skip devlink health recovery notifications while the devlink instance is
  not registered, so drivers can move registration later without early
  health work hitting registration assertions.

- Move mlx5 devlink registration after device initialization, including the
  lightweight init path, so the core can apply the default through the
  normal registration flow.

- Move the matching netdevsim and mlx5 unregister paths before object
  teardown, so unregister notifications come from devl_unregister() and the
  later object teardown paths run while the devlink instance is no longer
  registered.

- Add registration-ordering preparation patches for netdevsim and octeontx2
  AF/PF, so their eswitch state is ready before registration-time defaults
  may call eswitch_mode_set().

[1] lore.kernel.org/r/20260502184153.4fd8d06f@kernel.org/
RFC v1: lore.kernel.org/r/20260506123739.1959770-1-mbloch@nvidia.com/
RFC v2: lore.kernel.org/r/20260510185424.2041415-1-mbloch@nvidia.com/
v1: lore.kernel.org/r/20260521072434.362624-1-tariqt@nvidia.com/
v2: lore.kernel.org/all/20260603193259.3412464-1-mbloch@nvidia.com/
v3: lore.kernel.org/all/20260605181030.3486619-1-mbloch@nvidia.com/

Mark Bloch (6):
  net/mlx5: Clear FW reset-in-progress bit before reload
  devlink: Factor out eswitch mode setting
  devlink: Parse eswitch mode boot defaults
  devlink: Apply eswitch mode boot defaults
  devlink: Add API to apply eswitch mode boot default
  net/mlx5: Apply devlink eswitch mode boot default on probe

 .../admin-guide/kernel-parameters.txt         |  25 ++
 .../networking/devlink/devlink-defaults.rst   |  78 ++++
 Documentation/networking/devlink/index.rst    |   1 +
 .../ethernet/mellanox/mlx5/core/fw_reset.c    |  28 +-
 .../net/ethernet/mellanox/mlx5/core/main.c    |  13 +
 include/net/devlink.h                         |   1 +
 net/devlink/core.c                            | 393 ++++++++++++++++++
 net/devlink/dev.c                             |  33 +-
 net/devlink/devl_internal.h                   |   8 +
 9 files changed, 562 insertions(+), 18 deletions(-)
 create mode 100644 Documentation/networking/devlink/devlink-defaults.rst


base-commit: 805185b7c7a1069e407b6f7b3bc98e44d415f484
-- 
2.43.0


^ permalink raw reply

* [PATCH net-next V4 2/6] devlink: Factor out eswitch mode setting
From: Mark Bloch @ 2026-06-29 18:20 UTC (permalink / raw)
  To: Jiri Pirko, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Andrew Lunn,
	Jonathan Corbet, Shuah Khan, netdev, linux-rdma, linux-doc,
	Mark Bloch
In-Reply-To: <20260629182102.245150-1-mbloch@nvidia.com>

Move the common eswitch mode set checks into a small helper and use it
from the netlink eswitch set command. This makes the same validation
available to the devlink core path that applies eswitch mode defaults.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
---
 net/devlink/dev.c           | 27 ++++++++++++++++++++-------
 net/devlink/devl_internal.h |  3 +++
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/net/devlink/dev.c b/net/devlink/dev.c
index 57b2b8f03543..4fb02bb993c1 100644
--- a/net/devlink/dev.c
+++ b/net/devlink/dev.c
@@ -702,6 +702,25 @@ int devlink_nl_eswitch_get_doit(struct sk_buff *skb, struct genl_info *info)
 	return genlmsg_reply(msg, info);
 }
 
+int devlink_eswitch_mode_set(struct devlink *devlink,
+			     enum devlink_eswitch_mode mode,
+			     struct netlink_ext_ack *extack)
+{
+	const struct devlink_ops *ops = devlink->ops;
+	int err;
+
+	devl_assert_locked(devlink);
+
+	if (!ops->eswitch_mode_set)
+		return -EOPNOTSUPP;
+
+	err = devlink_rates_check(devlink, devlink_rate_is_node, extack);
+	if (err)
+		return err;
+
+	return ops->eswitch_mode_set(devlink, mode, extack);
+}
+
 int devlink_nl_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info)
 {
 	struct devlink *devlink = info->user_ptr[0];
@@ -712,14 +731,8 @@ int devlink_nl_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info)
 	u16 mode;
 
 	if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
-		if (!ops->eswitch_mode_set)
-			return -EOPNOTSUPP;
-		err = devlink_rates_check(devlink, devlink_rate_is_node,
-					  info->extack);
-		if (err)
-			return err;
 		mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
-		err = ops->eswitch_mode_set(devlink, mode, info->extack);
+		err = devlink_eswitch_mode_set(devlink, mode, info->extack);
 		if (err)
 			return err;
 	}
diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h
index e4e48ee2da5a..97be77d3ed42 100644
--- a/net/devlink/devl_internal.h
+++ b/net/devlink/devl_internal.h
@@ -328,6 +328,9 @@ bool devlink_rate_is_node(const struct devlink_rate *devlink_rate);
 int devlink_rates_check(struct devlink *devlink,
 			bool (*rate_filter)(const struct devlink_rate *),
 			struct netlink_ext_ack *extack);
+int devlink_eswitch_mode_set(struct devlink *devlink,
+			     enum devlink_eswitch_mode mode,
+			     struct netlink_ext_ack *extack);
 
 /* Linecards */
 unsigned int devlink_linecard_index(struct devlink_linecard *linecard);
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next V4 3/6] devlink: Parse eswitch mode boot defaults
From: Mark Bloch @ 2026-06-29 18:20 UTC (permalink / raw)
  To: Jiri Pirko, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Andrew Lunn,
	Jonathan Corbet, Shuah Khan, netdev, linux-rdma, linux-doc,
	Mark Bloch
In-Reply-To: <20260629182102.245150-1-mbloch@nvidia.com>

Add devlink_eswitch_mode= kernel command line parsing for a default
eswitch mode.

The supported syntax selects either all devlink handles or one explicit
comma-separated handle list:

  devlink_eswitch_mode=*=<mode>

  devlink_eswitch_mode=<handle>[,<handle>...]=<mode>

where <mode> is one of legacy, switchdev or switchdev_inactive. All
selected handles receive the same mode. Assigning different modes to
different handle lists in the same parameter value is not supported.

Store the parsed selector and mode in devlink core so the default can be
applied by a later patch in this series.

Document the devlink_eswitch_mode= syntax and duplicate handle handling.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
---
 .../admin-guide/kernel-parameters.txt         |  25 ++
 .../networking/devlink/devlink-defaults.rst   |  78 ++++++
 Documentation/networking/devlink/index.rst    |   1 +
 net/devlink/core.c                            | 227 ++++++++++++++++++
 4 files changed, 331 insertions(+)
 create mode 100644 Documentation/networking/devlink/devlink-defaults.rst

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b5493a7f8f22..117300dd589c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1249,6 +1249,31 @@ Kernel parameters
 	dell_smm_hwmon.fan_max=
 			[HW] Maximum configurable fan speed.
 
+	devlink_eswitch_mode=
+			[NET]
+			Format:
+			<selector>=<mode>
+
+			<selector>:
+			* | <handle>[,<handle>...]
+
+			<handle>:
+			<bus-name>/<dev-name>
+
+			Configure default devlink eswitch mode for matching
+			devlink instances during device initialization.
+
+			<mode>:
+			legacy | switchdev | switchdev_inactive
+
+			Examples:
+			devlink_eswitch_mode=*=switchdev
+			devlink_eswitch_mode=pci/0000:08:00.0=switchdev
+			devlink_eswitch_mode=pci/0000:08:00.0,pci/0000:09:00.1=switchdev_inactive
+
+			See Documentation/networking/devlink/devlink-defaults.rst
+			for the full syntax.
+
 	dfltcc=		[HW,S390]
 			Format: { on | off | def_only | inf_only | always }
 			on:       s390 zlib hardware support for compression on
diff --git a/Documentation/networking/devlink/devlink-defaults.rst b/Documentation/networking/devlink/devlink-defaults.rst
new file mode 100644
index 000000000000..380c9e99210e
--- /dev/null
+++ b/Documentation/networking/devlink/devlink-defaults.rst
@@ -0,0 +1,78 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+==============================
+Devlink Eswitch Mode Defaults
+==============================
+
+Devlink eswitch mode defaults allow the eswitch mode to be provided on the
+kernel command line and applied to matching devlink instances during device
+initialization.
+
+The devlink device is selected by its devlink handle. For PCI devices this is
+the same handle shown by ``devlink dev show``, for example
+``pci/0000:08:00.0``.
+
+Kernel command line syntax
+==========================
+
+Defaults are specified with the ``devlink_eswitch_mode=`` kernel command line
+parameter.
+
+The general syntax is::
+
+  devlink_eswitch_mode=<selector>=<mode>
+
+``<selector>`` is either ``*`` or one or more devlink handles::
+
+  * | <bus-name>/<dev-name>[,<bus-name>/<dev-name>...]
+
+``*`` applies the mode to every devlink instance. All handles in the same
+selector receive the same eswitch mode.
+
+``<mode>`` is one of ``legacy``, ``switchdev`` or ``switchdev_inactive``.
+
+Syntax rules
+------------
+
+The following syntax rules apply:
+
+* Specify the default in one ``devlink_eswitch_mode=`` parameter. Repeated
+  ``devlink_eswitch_mode=`` parameters are not accumulated.
+* The ``devlink_eswitch_mode=`` value is limited by the kernel command line
+  size.
+* Whitespace is not allowed within the parameter value.
+* ``<selector>`` must be either ``*`` or a handle list. ``*`` cannot be
+  combined with explicit handles.
+* ``<bus-name>`` and ``<dev-name>`` must not be empty.
+* ``<dev-name>`` may contain ``:``. This allows PCI names such as
+  ``0000:08:00.0``.
+* Handles must not contain whitespace, ``*``, ``=`` or more than one ``/``.
+* A comma separates handles.
+* Comma-separated default assignments are not supported.
+* Duplicate handles are rejected and the devlink eswitch mode default is
+  ignored.
+
+The eswitch mode default corresponds to the userspace command::
+
+  devlink dev eswitch set <handle> mode <value>
+
+
+Examples
+========
+
+Set all devlink instances to switchdev mode::
+
+  devlink_eswitch_mode=*=switchdev
+
+Set one PCI devlink instance to switchdev mode::
+
+  devlink_eswitch_mode=pci/0000:08:00.0=switchdev
+
+Set two PCI devlink instances to switchdev inactive mode::
+
+  devlink_eswitch_mode=pci/0000:08:00.0,pci/0000:09:00.1=switchdev_inactive
+
+The following is invalid because comma-separated default assignments are not
+supported::
+
+  devlink_eswitch_mode=pci/0000:08:00.0=switchdev,pci/0000:09:00.0=switchdev_inactive
diff --git a/Documentation/networking/devlink/index.rst b/Documentation/networking/devlink/index.rst
index 32f70879ddd0..93f09cb18c44 100644
--- a/Documentation/networking/devlink/index.rst
+++ b/Documentation/networking/devlink/index.rst
@@ -56,6 +56,7 @@ general.
    :maxdepth: 1
 
    devlink-dpipe
+   devlink-defaults
    devlink-eswitch-attr
    devlink-flash
    devlink-health
diff --git a/net/devlink/core.c b/net/devlink/core.c
index fe9f6a0a67d5..5126509a9c4e 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -4,6 +4,10 @@
  * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
  */
 
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/string.h>
 #include <net/genetlink.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/devlink.h>
@@ -16,6 +20,193 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report);
 
 DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC);
 
+static char *devlink_default_esw_mode_param;
+static bool devlink_default_esw_mode_match_all;
+static enum devlink_eswitch_mode devlink_default_esw_mode;
+static LIST_HEAD(devlink_default_esw_mode_nodes);
+
+struct devlink_default_esw_mode_node {
+	struct list_head list;
+	char *bus_name;
+	char *dev_name;
+};
+
+static int __init
+devlink_default_esw_mode_to_value(const char *str,
+				  enum devlink_eswitch_mode *mode)
+{
+	if (!strcmp(str, "legacy")) {
+		*mode = DEVLINK_ESWITCH_MODE_LEGACY;
+		return 0;
+	}
+	if (!strcmp(str, "switchdev")) {
+		*mode = DEVLINK_ESWITCH_MODE_SWITCHDEV;
+		return 0;
+	}
+	if (!strcmp(str, "switchdev_inactive")) {
+		*mode = DEVLINK_ESWITCH_MODE_SWITCHDEV_INACTIVE;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+static int __init
+devlink_default_esw_mode_handle_parse(char *handle, char **bus_name,
+				      char **dev_name)
+{
+	char *slash;
+	char *p;
+
+	if (!*handle)
+		return -EINVAL;
+
+	for (p = handle; *p; p++) {
+		if (*p == '*' || *p == '=')
+			return -EINVAL;
+	}
+
+	slash = strchr(handle, '/');
+	if (!slash || slash == handle || !slash[1])
+		return -EINVAL;
+	if (strchr(slash + 1, '/'))
+		return -EINVAL;
+
+	*slash = '\0';
+
+	*bus_name = handle;
+	*dev_name = slash + 1;
+	return 0;
+}
+
+static struct devlink_default_esw_mode_node *
+devlink_default_esw_mode_node_find(const char *bus_name, const char *dev_name)
+{
+	struct devlink_default_esw_mode_node *node;
+
+	list_for_each_entry(node, &devlink_default_esw_mode_nodes, list) {
+		if (!strcmp(node->bus_name, bus_name) &&
+		    !strcmp(node->dev_name, dev_name))
+			return node;
+	}
+
+	return NULL;
+}
+
+static int __init
+devlink_default_esw_mode_node_add(const char *bus_name, const char *dev_name)
+{
+	struct devlink_default_esw_mode_node *node;
+
+	if (devlink_default_esw_mode_node_find(bus_name, dev_name))
+		return -EEXIST;
+
+	node = kzalloc_obj(*node);
+	if (!node)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&node->list);
+	node->bus_name = kstrdup(bus_name, GFP_KERNEL);
+	node->dev_name = kstrdup(dev_name, GFP_KERNEL);
+	if (!node->bus_name || !node->dev_name) {
+		kfree(node->bus_name);
+		kfree(node->dev_name);
+		kfree(node);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&node->list, &devlink_default_esw_mode_nodes);
+	return 0;
+}
+
+static int __init devlink_default_esw_mode_handles_parse(char *handles)
+{
+	char *handle;
+	int err;
+
+	if (!strcmp(handles, "*")) {
+		devlink_default_esw_mode_match_all = true;
+		return 0;
+	}
+
+	while ((handle = strsep(&handles, ",")) != NULL) {
+		char *bus_name;
+		char *dev_name;
+
+		err = devlink_default_esw_mode_handle_parse(handle, &bus_name,
+							    &dev_name);
+		if (err)
+			return err;
+
+		err = devlink_default_esw_mode_node_add(bus_name, dev_name);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static void __init
+devlink_default_esw_mode_node_free(struct devlink_default_esw_mode_node *node)
+{
+	kfree(node->bus_name);
+	kfree(node->dev_name);
+	kfree(node);
+}
+
+static void __init devlink_default_esw_mode_nodes_clear(void)
+{
+	struct devlink_default_esw_mode_node *node;
+	struct devlink_default_esw_mode_node *node_tmp;
+
+	list_for_each_entry_safe(node, node_tmp,
+				 &devlink_default_esw_mode_nodes, list) {
+		list_del(&node->list);
+		devlink_default_esw_mode_node_free(node);
+	}
+
+	devlink_default_esw_mode_match_all = false;
+}
+
+static int __init devlink_default_esw_mode_parse(char *str)
+{
+	char *handles;
+	char *separator;
+	char *mode;
+	enum devlink_eswitch_mode esw_mode;
+	int err;
+
+	if (!*str)
+		return -EINVAL;
+
+	separator = strrchr(str, '=');
+	if (!separator || separator == str || !separator[1])
+		return -EINVAL;
+
+	*separator = '\0';
+	handles = str;
+	mode = separator + 1;
+
+	err = devlink_default_esw_mode_to_value(mode, &esw_mode);
+	if (err)
+		return err;
+
+	err = devlink_default_esw_mode_handles_parse(handles);
+	if (err)
+		devlink_default_esw_mode_nodes_clear();
+	else
+		devlink_default_esw_mode = esw_mode;
+
+	return err;
+}
+
+static int __init devlink_default_esw_mode_setup(char *str)
+{
+	devlink_default_esw_mode_param = str;
+	return 1;
+}
+__setup("devlink_eswitch_mode=", devlink_default_esw_mode_setup);
+
 static struct devlink *devlinks_xa_get(unsigned long index)
 {
 	struct devlink *devlink;
@@ -382,6 +573,14 @@ struct devlink *devlinks_xa_lookup_get(struct net *net, unsigned long index)
 /**
  * devl_register - Register devlink instance
  * @devlink: devlink
+ *
+ * Make @devlink visible to userspace. Drivers must call this only after the
+ * instance is fully initialized and its devlink operations can be called.
+ *
+ * Context: Caller must hold the devlink instance lock. Use devlink_register()
+ * when the lock is not already held.
+ *
+ * Return: 0 on success.
  */
 int devl_register(struct devlink *devlink)
 {
@@ -580,6 +779,31 @@ static int __init devlink_init(void)
 {
 	int err;
 
+	if (devlink_default_esw_mode_param) {
+		char *def;
+
+		def = kstrdup(devlink_default_esw_mode_param, GFP_KERNEL);
+		if (!def) {
+			devlink_default_esw_mode_param = NULL;
+			pr_warn("devlink: devlink_eswitch_mode parameter ignored, failed to allocate memory\n");
+		} else {
+			err = devlink_default_esw_mode_parse(def);
+			kfree(def);
+			if (err == -EEXIST) {
+				devlink_default_esw_mode_param = NULL;
+				pr_warn("devlink: duplicate eswitch mode handles ignored\n");
+			} else if (err == -EINVAL) {
+				devlink_default_esw_mode_param = NULL;
+				pr_warn("devlink: invalid devlink_eswitch_mode parameter ignored\n");
+			} else if (err == -ENOMEM) {
+				devlink_default_esw_mode_param = NULL;
+				pr_warn("devlink: devlink_eswitch_mode parameter ignored, failed to allocate memory\n");
+			} else if (err) {
+				goto out;
+			}
+		}
+	}
+
 	err = register_pernet_subsys(&devlink_pernet_ops);
 	if (err)
 		goto out;
@@ -595,7 +819,10 @@ static int __init devlink_init(void)
 out_unreg_pernet_subsys:
 	unregister_pernet_subsys(&devlink_pernet_ops);
 out:
+	if (err)
+		devlink_default_esw_mode_nodes_clear();
 	WARN_ON(err);
+
 	return err;
 }
 
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next V4 5/6] devlink: Add API to apply eswitch mode boot default
From: Mark Bloch @ 2026-06-29 18:21 UTC (permalink / raw)
  To: Jiri Pirko, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Andrew Lunn,
	Jonathan Corbet, Shuah Khan, netdev, linux-rdma, linux-doc,
	Mark Bloch
In-Reply-To: <20260629182102.245150-1-mbloch@nvidia.com>

Add devl_apply_default_esw_mode() for drivers that can apply the
devlink_eswitch_mode= boot default once their device is ready instead of
waiting for the asynchronous registration work.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
---
 include/net/devlink.h |  1 +
 net/devlink/core.c    | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/include/net/devlink.h b/include/net/devlink.h
index dd546dbd57cf..b71d282c6d52 100644
--- a/include/net/devlink.h
+++ b/include/net/devlink.h
@@ -1652,6 +1652,7 @@ static inline struct devlink *devlink_alloc(const struct devlink_ops *ops,
 
 int devl_register(struct devlink *devlink);
 void devl_unregister(struct devlink *devlink);
+void devl_apply_default_esw_mode(struct devlink *devlink);
 void devlink_register(struct devlink *devlink);
 void devlink_unregister(struct devlink *devlink);
 void devlink_free(struct devlink *devlink);
diff --git a/net/devlink/core.c b/net/devlink/core.c
index 998e4ffd5dce..d8f273e1732c 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -299,6 +299,28 @@ void devlink_default_esw_mode_apply_disable(struct devlink *devlink)
 	devlink->default_esw_mode_apply_pending = false;
 }
 
+/**
+ * devl_apply_default_esw_mode - Apply devlink eswitch mode boot default
+ * @devlink: devlink
+ *
+ * Apply a matching devlink_eswitch_mode= boot default immediately. Drivers may
+ * use this helper when the device is ready for an eswitch mode change and the
+ * caller already holds the devlink instance lock.
+ *
+ * Any pending asynchronous default apply is cleared before applying the
+ * default, so work queued by devl_register() will not apply it again.
+ *
+ * Context: Caller must hold the devlink instance lock.
+ */
+void devl_apply_default_esw_mode(struct devlink *devlink)
+{
+	devl_assert_locked(devlink);
+
+	devlink->default_esw_mode_apply_pending = false;
+	devlink_default_esw_mode_apply(devlink);
+}
+EXPORT_SYMBOL_GPL(devl_apply_default_esw_mode);
+
 static void devlink_default_esw_mode_apply_cancel(struct devlink *devlink)
 {
 	if (cancel_delayed_work_sync(&devlink->default_esw_mode_apply_dw))
-- 
2.43.0


^ permalink raw reply related

* [PATCH net-next V4 4/6] devlink: Apply eswitch mode boot defaults
From: Mark Bloch @ 2026-06-29 18:20 UTC (permalink / raw)
  To: Jiri Pirko, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Andrew Lunn,
	Jonathan Corbet, Shuah Khan, netdev, linux-rdma, linux-doc,
	Mark Bloch
In-Reply-To: <20260629182102.245150-1-mbloch@nvidia.com>

Apply parsed devlink_eswitch_mode= defaults after devlink registration
and after successful reload.

devl_register() may still be called before the device is ready for an
eswitch mode change, so keep a per-devlink delayed work item and pending
flag for the registration path. Registration queues the work, and the
worker tries to take the devlink instance lock.

If the lock is busy, the worker requeues itself with a delay.

For successful reloads that performed DRIVER_REINIT, devlink_reload()
already holds the devlink instance lock and the driver has completed
reload_up(). Clear pending work and apply the default directly from the
reload path instead of queueing work.

If a user sets eswitch mode through netlink before the pending
registration work runs, clear the pending flag so the queued default does
not override that user request. Cancel pending default apply work when
freeing the devlink instance.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
---
 net/devlink/core.c          | 198 +++++++++++++++++++++++++++++++-----
 net/devlink/dev.c           |   6 ++
 net/devlink/devl_internal.h |   5 +
 3 files changed, 182 insertions(+), 27 deletions(-)

diff --git a/net/devlink/core.c b/net/devlink/core.c
index 5126509a9c4e..998e4ffd5dce 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -5,6 +5,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/jiffies.h>
 #include <linux/list.h>
 #include <linux/slab.h>
 #include <linux/string.h>
@@ -22,8 +23,12 @@ DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC);
 
 static char *devlink_default_esw_mode_param;
 static bool devlink_default_esw_mode_match_all;
+static bool devlink_default_esw_mode_enabled;
 static enum devlink_eswitch_mode devlink_default_esw_mode;
 static LIST_HEAD(devlink_default_esw_mode_nodes);
+static struct workqueue_struct *devlink_default_esw_mode_wq;
+
+#define DEVLINK_DEFAULT_ESW_MODE_APPLY_DELAY msecs_to_jiffies(100)
 
 struct devlink_default_esw_mode_node {
 	struct list_head list;
@@ -166,6 +171,7 @@ static void __init devlink_default_esw_mode_nodes_clear(void)
 	}
 
 	devlink_default_esw_mode_match_all = false;
+	devlink_default_esw_mode_enabled = false;
 }
 
 static int __init devlink_default_esw_mode_parse(char *str)
@@ -192,14 +198,113 @@ static int __init devlink_default_esw_mode_parse(char *str)
 		return err;
 
 	err = devlink_default_esw_mode_handles_parse(handles);
-	if (err)
+	if (err) {
 		devlink_default_esw_mode_nodes_clear();
-	else
+	} else {
 		devlink_default_esw_mode = esw_mode;
+		devlink_default_esw_mode_enabled = true;
+	}
 
 	return err;
 }
 
+static bool devlink_default_esw_mode_match(struct devlink *devlink)
+{
+	const char *bus_name = devlink_bus_name(devlink);
+	const char *dev_name = devlink_dev_name(devlink);
+	struct devlink_default_esw_mode_node *node;
+
+	if (devlink_default_esw_mode_match_all)
+		return true;
+
+	node = devlink_default_esw_mode_node_find(bus_name, dev_name);
+	return !!node;
+}
+
+void devlink_default_esw_mode_apply(struct devlink *devlink)
+{
+	const struct devlink_ops *ops = devlink->ops;
+	int err;
+
+	devl_assert_locked(devlink);
+
+	if (!devlink_default_esw_mode_match(devlink))
+		return;
+
+	if (!ops->eswitch_mode_set) {
+		if (!devlink_default_esw_mode_match_all)
+			devl_warn(devlink,
+				  "devlink_eswitch_mode= selected this device but eswitch mode setting is not supported\n");
+		return;
+	}
+
+	err = devlink_eswitch_mode_set(devlink, devlink_default_esw_mode, NULL);
+	if (err)
+		devl_warn(devlink,
+			  "Couldn't apply default eswitch mode, err %d\n",
+			  err);
+}
+
+static void
+devlink_default_esw_mode_apply_queue(struct devlink *devlink,
+				     unsigned long delay)
+{
+	if (!devlink_default_esw_mode_enabled || !devlink_default_esw_mode_wq)
+		return;
+	if (!devlink_try_get(devlink))
+		return;
+	if (!queue_delayed_work(devlink_default_esw_mode_wq,
+				&devlink->default_esw_mode_apply_dw,
+				delay))
+		devlink_put(devlink);
+}
+
+static void devlink_default_esw_mode_apply_work(struct work_struct *work)
+{
+	unsigned long delay = DEVLINK_DEFAULT_ESW_MODE_APPLY_DELAY;
+	struct delayed_work *dwork = to_delayed_work(work);
+	struct devlink *devlink;
+
+	devlink = container_of(dwork, struct devlink,
+			       default_esw_mode_apply_dw);
+	if (!devl_trylock(devlink)) {
+		if (__devl_is_registered(devlink))
+			devlink_default_esw_mode_apply_queue(devlink, delay);
+		devlink_put(devlink);
+		return;
+	}
+
+	if (devl_is_registered(devlink) &&
+	    devlink->default_esw_mode_apply_pending) {
+		devlink_default_esw_mode_apply(devlink);
+		devlink->default_esw_mode_apply_pending = false;
+	}
+
+	devl_unlock(devlink);
+	devlink_put(devlink);
+}
+
+void devlink_default_esw_mode_apply_schedule(struct devlink *devlink)
+{
+	devl_assert_locked(devlink);
+
+	devlink->default_esw_mode_apply_pending = true;
+	devlink_default_esw_mode_apply_queue(devlink, 0);
+}
+
+void devlink_default_esw_mode_apply_disable(struct devlink *devlink)
+{
+	devl_assert_locked(devlink);
+
+	devlink->default_esw_mode_apply_pending = false;
+}
+
+static void devlink_default_esw_mode_apply_cancel(struct devlink *devlink)
+{
+	if (cancel_delayed_work_sync(&devlink->default_esw_mode_apply_dw))
+		devlink_put(devlink);
+}
+
 static int __init devlink_default_esw_mode_setup(char *str)
 {
 	devlink_default_esw_mode_param = str;
@@ -577,6 +682,12 @@ struct devlink *devlinks_xa_lookup_get(struct net *net, unsigned long index)
  * Make @devlink visible to userspace. Drivers must call this only after the
  * instance is fully initialized and its devlink operations can be called.
  *
+ * If a matching devlink_eswitch_mode= default was provided on the kernel
+ * command line, devlink core schedules async work to apply it after
+ * registration. Drivers implementing eswitch_mode_set() must therefore be
+ * ready to perform the same work as a userspace eswitch mode set request from
+ * this point, including creation of representors and other eswitch state.
+ *
  * Context: Caller must hold the devlink instance lock. Use devlink_register()
  * when the lock is not already held.
  *
@@ -590,6 +701,7 @@ int devl_register(struct devlink *devlink)
 	xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
 	devlink_notify_register(devlink);
 	devlink_rel_nested_in_notify(devlink);
+	devlink_default_esw_mode_apply_schedule(devlink);
 
 	return 0;
 }
@@ -612,6 +724,7 @@ void devl_unregister(struct devlink *devlink)
 	ASSERT_DEVLINK_REGISTERED(devlink);
 	devl_assert_locked(devlink);
 
+	devlink_default_esw_mode_apply_disable(devlink);
 	devlink_notify_unregister(devlink);
 	xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
 	devlink_rel_put(devlink);
@@ -673,6 +786,9 @@ struct devlink *__devlink_alloc(const struct devlink_ops *ops, size_t priv_size,
 	INIT_LIST_HEAD(&devlink->trap_group_list);
 	INIT_LIST_HEAD(&devlink->trap_policer_list);
 	INIT_RCU_WORK(&devlink->rwork, devlink_release);
+	INIT_DELAYED_WORK(&devlink->default_esw_mode_apply_dw,
+			  devlink_default_esw_mode_apply_work);
+	devlink->default_esw_mode_apply_pending = true;
 	lockdep_register_key(&devlink->lock_key);
 	mutex_init(&devlink->lock);
 	lockdep_set_class(&devlink->lock, &devlink->lock_key);
@@ -716,6 +832,7 @@ EXPORT_SYMBOL_GPL(devlink_alloc_ns);
 void devlink_free(struct devlink *devlink)
 {
 	ASSERT_DEVLINK_NOT_REGISTERED(devlink);
+	devlink_default_esw_mode_apply_cancel(devlink);
 
 	devlink_rel_put(devlink);
 
@@ -775,35 +892,59 @@ static struct notifier_block devlink_port_netdevice_nb = {
 	.notifier_call = devlink_port_netdevice_event,
 };
 
-static int __init devlink_init(void)
+static int __init devlink_default_esw_mode_init(void)
 {
+	char *def;
 	int err;
 
-	if (devlink_default_esw_mode_param) {
-		char *def;
-
-		def = kstrdup(devlink_default_esw_mode_param, GFP_KERNEL);
-		if (!def) {
-			devlink_default_esw_mode_param = NULL;
-			pr_warn("devlink: devlink_eswitch_mode parameter ignored, failed to allocate memory\n");
-		} else {
-			err = devlink_default_esw_mode_parse(def);
-			kfree(def);
-			if (err == -EEXIST) {
-				devlink_default_esw_mode_param = NULL;
-				pr_warn("devlink: duplicate eswitch mode handles ignored\n");
-			} else if (err == -EINVAL) {
-				devlink_default_esw_mode_param = NULL;
-				pr_warn("devlink: invalid devlink_eswitch_mode parameter ignored\n");
-			} else if (err == -ENOMEM) {
-				devlink_default_esw_mode_param = NULL;
-				pr_warn("devlink: devlink_eswitch_mode parameter ignored, failed to allocate memory\n");
-			} else if (err) {
-				goto out;
-			}
-		}
+	if (!devlink_default_esw_mode_param)
+		return 0;
+
+	def = kstrdup(devlink_default_esw_mode_param, GFP_KERNEL);
+	if (!def) {
+		devlink_default_esw_mode_param = NULL;
+		pr_warn("devlink: devlink_eswitch_mode parameter ignored, failed to allocate memory\n");
+		return 0;
+	}
+
+	err = devlink_default_esw_mode_parse(def);
+	kfree(def);
+	if (err == -EEXIST) {
+		devlink_default_esw_mode_param = NULL;
+		pr_warn("devlink: duplicate eswitch mode handles ignored\n");
+		return 0;
+	} else if (err == -EINVAL) {
+		devlink_default_esw_mode_param = NULL;
+		pr_warn("devlink: invalid devlink_eswitch_mode parameter ignored\n");
+		return 0;
+	} else if (err == -ENOMEM) {
+		devlink_default_esw_mode_param = NULL;
+		pr_warn("devlink: devlink_eswitch_mode parameter ignored, failed to allocate memory\n");
+		return 0;
+	} else if (err) {
+		return err;
 	}
 
+	devlink_default_esw_mode_wq = alloc_workqueue("devlink_default_esw_mode",
+						      WQ_UNBOUND | WQ_MEM_RECLAIM,
+						      0);
+	if (!devlink_default_esw_mode_wq) {
+		devlink_default_esw_mode_param = NULL;
+		devlink_default_esw_mode_nodes_clear();
+		pr_warn("devlink: devlink_eswitch_mode parameter ignored, failed to allocate workqueue\n");
+	}
+
+	return 0;
+}
+
+static int __init devlink_init(void)
+{
+	int err;
+
+	err = devlink_default_esw_mode_init();
+	if (err)
+		goto out;
+
 	err = register_pernet_subsys(&devlink_pernet_ops);
 	if (err)
 		goto out;
@@ -819,8 +960,11 @@ static int __init devlink_init(void)
 out_unreg_pernet_subsys:
 	unregister_pernet_subsys(&devlink_pernet_ops);
 out:
-	if (err)
+	if (err) {
+		if (devlink_default_esw_mode_wq)
+			destroy_workqueue(devlink_default_esw_mode_wq);
 		devlink_default_esw_mode_nodes_clear();
+	}
 	WARN_ON(err);
 
 	return err;
diff --git a/net/devlink/dev.c b/net/devlink/dev.c
index 4fb02bb993c1..7f6ed52a5f73 100644
--- a/net/devlink/dev.c
+++ b/net/devlink/dev.c
@@ -478,6 +478,11 @@ int devlink_reload(struct devlink *devlink, struct net *dest_net,
 		return err;
 
 	WARN_ON(!(*actions_performed & BIT(action)));
+	if (*actions_performed & BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT)) {
+		devlink_default_esw_mode_apply_disable(devlink);
+		devlink_default_esw_mode_apply(devlink);
+	}
+
 	/* Catch driver on updating the remote action within devlink reload */
 	WARN_ON(memcmp(remote_reload_stats, devlink->stats.remote_reload_stats,
 		       sizeof(remote_reload_stats)));
@@ -731,6 +736,7 @@ int devlink_nl_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info)
 	u16 mode;
 
 	if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
+		devlink_default_esw_mode_apply_disable(devlink);
 		mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
 		err = devlink_eswitch_mode_set(devlink, mode, info->extack);
 		if (err)
diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h
index 97be77d3ed42..d6ff233da974 100644
--- a/net/devlink/devl_internal.h
+++ b/net/devlink/devl_internal.h
@@ -58,8 +58,10 @@ struct devlink {
 	struct mutex lock;
 	struct lock_class_key lock_key;
 	u8 reload_failed:1;
+	u8 default_esw_mode_apply_pending:1;
 	refcount_t refcount;
 	struct rcu_work rwork;
+	struct delayed_work default_esw_mode_apply_dw;
 	struct devlink_rel *rel;
 	struct xarray nested_rels;
 	char priv[] __aligned(NETDEV_ALIGN);
@@ -71,6 +73,9 @@ extern struct genl_family devlink_nl_family;
 struct devlink *__devlink_alloc(const struct devlink_ops *ops, size_t priv_size,
 				struct net *net, struct device *dev,
 				const struct device_driver *dev_driver);
+void devlink_default_esw_mode_apply(struct devlink *devlink);
+void devlink_default_esw_mode_apply_schedule(struct devlink *devlink);
+void devlink_default_esw_mode_apply_disable(struct devlink *devlink);
 
 #define devl_warn(devlink, format, args...)				\
 	do {								\
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net-next v1] tcp/dccp: avoid parity split for socket-local bind range
From: Kuniyuki Iwashima @ 2026-06-29 18:21 UTC (permalink / raw)
  To: luoxuanqiang
  Cc: Eric Dumazet, Neal Cardwell, netdev, David S . Miller,
	Jakub Kicinski, Paolo Abeni, Simon Horman, luoxuanqiang
In-Reply-To: <F6AFD54C-1618-4501-80C2-88F796BA95C0@linux.dev>

On Fri, Jun 26, 2026 at 7:00 PM luoxuanqiang <xuanqiang.luo@linux.dev> wrote:
> > 2026年6月27日 07:40,Kuniyuki Iwashima <kuniyu@google.com> 写道:
> >
> > On Fri, Jun 26, 2026 at 2:40 AM <xuanqiang.luo@linux.dev> wrote:
> >>
> >> From: luoxuanqiang <luoxuanqiang@kylinos.cn>
> >>
> >> IP_LOCAL_PORT_RANGE lets applications override the netns ephemeral port
> >> range on a per-socket basis.  __inet_hash_connect() already treats such a
> >> range as an explicit application partition and scans it with step 1 [1].
> >>
> >> Do the same in inet_csk_find_open_port():
> >
> > What's the use case of IP_LOCAL_PORT_RANGE + bind(, 0)
> > without IP_BIND_ADDRESS_NO_PORT ?
> Hi Kuniyuki,
>
> Thanks for the question!
>
> The use case is when an application wants to restrict ephemeral port
> allocation to a socket-local IP_LOCAL_PORT_RANGE, but still needs
> bind(..., 0) to allocate and reserve a local port immediately.

IP_LOCAL_PORT_RANGE was introduced for connect().

Unlike connect(), bind() occupies the port without SO_REUSEADDR/PORT,
so I don't think a step of 1 or 2 makes any difference.


>
> IP_BIND_ADDRESS_NO_PORT is useful when the application can defer port
> allocation until connect(), but it changes this behavior: bind(..., 0)
> does not reserve a port in that case. So it is not a replacement for
> applications that need the local port before connect(), for example to
> publish it to another component or set up local policy.
>
> This patch is also intended to keep the bind(..., 0) path consistent with
> Eric's earlier change in __inet_hash_connect().
>
> Thanks,
> Xuanqiang

^ permalink raw reply

* [PATCH net-next V4 6/6] net/mlx5: Apply devlink eswitch mode boot default on probe
From: Mark Bloch @ 2026-06-29 18:21 UTC (permalink / raw)
  To: Jiri Pirko, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman
  Cc: Saeed Mahameed, Leon Romanovsky, Tariq Toukan, Andrew Lunn,
	Jonathan Corbet, Shuah Khan, netdev, linux-rdma, linux-doc,
	Mark Bloch
In-Reply-To: <20260629182102.245150-1-mbloch@nvidia.com>

Apply devlink_eswitch_mode= boot defaults for mlx5 after the initial
probe finishes device initialization while holding the devlink instance
lock.

At this point the devlink instance is registered and mlx5 can perform an
eswitch mode change. Calling devl_apply_default_esw_mode() also clears
any pending default apply work queued by devl_register(), so the queued
work will not apply the same default again.

Keep this call in mlx5_init_one() rather than the lower-level
devl-locked init helper. That helper is also used by devlink reload, and
devlink core already applies the boot default after a successful
DRIVER_REINIT reload.

Signed-off-by: Mark Bloch <mbloch@nvidia.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/main.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c
index 643b4aac2033..0712efea74cc 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
@@ -1392,6 +1392,17 @@ static void mlx5_unload(struct mlx5_core_dev *dev)
 	mlx5_free_bfreg(dev, &dev->priv.bfreg);
 }
 
+static void mlx5_devl_apply_default_esw_mode(struct mlx5_core_dev *dev)
+{
+	struct devlink *devlink = priv_to_devlink(dev);
+
+	if (!MLX5_ESWITCH_MANAGER(dev))
+		return;
+
+	devl_assert_locked(devlink);
+	devl_apply_default_esw_mode(devlink);
+}
+
 int mlx5_init_one_devl_locked(struct mlx5_core_dev *dev)
 {
 	bool light_probe = mlx5_dev_is_lightweight(dev);
@@ -1471,6 +1482,8 @@ int mlx5_init_one(struct mlx5_core_dev *dev)
 	err = mlx5_init_one_devl_locked(dev);
 	if (err)
 		devl_unregister(devlink);
+	else
+		mlx5_devl_apply_default_esw_mode(dev);
 unlock:
 	devl_unlock(devlink);
 	return err;
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH] net: neighbour: add neigh_parms_lookup_dev() helper
From: Kuniyuki Iwashima @ 2026-06-29 18:23 UTC (permalink / raw)
  To: Paritosh Potukuchi
  Cc: netdev, linux-kernel, Paritosh Potukuchi, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	Ido Schimmel, Petr Machata
In-Reply-To: <20260629155748.715754-1-paritosh.potukuchi@amd.com>

On Mon, Jun 29, 2026 at 8:58 AM Paritosh Potukuchi
<paritoshpotukuchi@gmail.com> wrote:
>
> Provide a helper to lookup neigh_parms associated
> with a given (neigh_table, net_device) pair.
>
> The existing lookup_neigh_parms() helper is internal to the
> neighbour subsystem and cannot be used by other subsystems.
> Some stacked/virtual devices like bond require access to the
> underlying device's neigh_parms.
>
> neigh_parms_lookup_dev() is designed to be a wrapper around
> lookup_neigh_parms(). The function provides controlled access
> to per device neigh_parms.

Please post a series of patches with the neigh_parms_lookup_dev()
users.


>
> The caller is expected to hold rcu_read_lock().
>
> This does not break any existing functionality.
>
> Signed-off-by: Paritosh Potukuchi <paritosh.potukuchi@amd.com>
> ---
>  include/net/neighbour.h | 2 ++
>  net/core/neighbour.c    | 8 ++++++++
>  2 files changed, 10 insertions(+)
>
> diff --git a/include/net/neighbour.h b/include/net/neighbour.h
> index 8860cc2175fc..1b3b06eda886 100644
> --- a/include/net/neighbour.h
> +++ b/include/net/neighbour.h
> @@ -438,6 +438,8 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
>                           proc_handler *proc_handler);
>  void neigh_sysctl_unregister(struct neigh_parms *p);
>
> +struct neigh_parms *neigh_parms_lookup_dev(struct neigh_table *tbl, struct net_device *dev);
> +
>  static inline void __neigh_parms_put(struct neigh_parms *parms)
>  {
>         refcount_dec(&parms->refcnt);
> diff --git a/net/core/neighbour.c b/net/core/neighbour.c
> index 1349c0eedb64..6d32c2668af3 100644
> --- a/net/core/neighbour.c
> +++ b/net/core/neighbour.c
> @@ -1757,6 +1757,14 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
>         return NULL;
>  }
>
> +/* Caller must hold rcu_read_lock()*/
> +
> +struct neigh_parms *neigh_parms_lookup_dev(struct neigh_table *tbl, struct net_device *dev)
> +{
> +       return lookup_neigh_parms(tbl, dev_net(dev), dev->ifindex);
> +}
> +EXPORT_SYMBOL(neigh_parms_lookup_dev);
> +
>  struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
>                                       struct neigh_table *tbl)
>  {
> --
> 2.43.0
>

^ permalink raw reply

* Re: [PATCH bpf-next v3 1/2] bpf, sockmap: disallow update and delete from tc, xdp and flow_dissector
From: Alexei Starovoitov @ 2026-06-29 18:24 UTC (permalink / raw)
  To: Sechang Lim, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	John Fastabend, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	David S . Miller, Jakub Kicinski, Jesper Dangaard Brouer
  Cc: Martin KaFai Lau, Song Liu, Yonghong Song, Jiri Olsa,
	Stanislav Fomichev, Emil Tsalapatis, Lorenz Bauer, Jakub Sitnicki,
	Jiayuan Chen, Shuah Khan, bpf, netdev, linux-kselftest,
	linux-kernel
In-Reply-To: <20260629172704.1302218-2-rhkrqnwk98@gmail.com>

On Mon Jun 29, 2026 at 10:27 AM PDT, Sechang Lim wrote:
> sock_map_update_common() and __sock_map_delete() hold stab->lock and call
> sock_map_unref() -> sock_map_del_link(), which takes sk_callback_lock for
> write. That gives the order stab->lock -> sk_callback_lock.
>
> The reverse order comes from the SK_SKB stream parser.
> sk_psock_strp_data_ready() holds sk_callback_lock for read, and after the
> verdict tcp_bpf_strp_read_sock() acks the consumed data inline via
> __tcp_cleanup_rbuf(). The ACK goes out egress, where a sched_cls program
> deletes from the sockmap and takes stab->lock:
>
>   WARNING: possible circular locking dependency detected
>   ------------------------------------------------------
>   syz.9.8824 is trying to acquire lock:
>   (&stab->lock){+.-.}-{3:3}, at: __sock_map_delete net/core/sock_map.c:421
>   but task is already holding lock:
>   (clock-AF_INET){++.-}-{3:3}, at: sk_psock_strp_data_ready net/core/skmsg.c:1173
>
>   -> #1 (clock-AF_INET){++.-}-{3:3}:
>          _raw_write_lock_bh
>          sock_map_del_link net/core/sock_map.c:167
>          sock_map_unref net/core/sock_map.c:184
>          sock_map_update_common net/core/sock_map.c:509
>          sock_map_update_elem_sys net/core/sock_map.c:588
>          map_update_elem kernel/bpf/syscall.c:1805
>
>   -> #0 (&stab->lock){+.-.}-{3:3}:
>          _raw_spin_lock_bh
>          __sock_map_delete net/core/sock_map.c:421
>          sock_map_delete_elem net/core/sock_map.c:452
>          bpf_prog_06044d24140080b6
>          tcx_run net/core/dev.c:4451
>          sch_handle_egress net/core/dev.c:4541
>          __dev_queue_xmit net/core/dev.c:4808
>          ...
>          tcp_bpf_strp_read_sock net/ipv4/tcp_bpf.c:701
>          strp_data_ready net/strparser/strparser.c:402
>          sk_psock_strp_data_ready net/core/skmsg.c:1174
>          tcp_data_queue net/ipv4/tcp_input.c:5661
>
>   Possible unsafe locking scenario:
>
>          CPU0                    CPU1
>          ----                    ----
>     rlock(clock-AF_INET);
>                                  lock(&stab->lock);
>                                  lock(clock-AF_INET);
>     lock(&stab->lock);
>
>    *** DEADLOCK ***
>
> A tc, xdp or flow_dissector program has no reason to update or delete a
> sockmap, and redirect does not go through here. Drop them from
> may_update_sockmap() so the verifier rejects it. It also closes the
> matching sockhash inversion.
>
> Suggested-by: John Fastabend <john.fastabend@gmail.com>
> Signed-off-by: Sechang Lim <rhkrqnwk98@gmail.com>

John,

please ack.


^ permalink raw reply

* [PATCH net] sctp: fix addr_wq_timer race in sctp_free_addr_wq()
From: Xin Long @ 2026-06-29 18:31 UTC (permalink / raw)
  To: network dev, linux-sctp
  Cc: davem, kuba, Eric Dumazet, Paolo Abeni, Simon Horman,
	Marcelo Ricardo Leitner, Eric W . Biederman

sctp_free_addr_wq() previously removed addr_wq_timer using timer_delete()
while holding addr_wq_lock. However, timer_delete() does not guarantee that
a currently running timer handler has completed.

This allows a race with sctp_addr_wq_timeout_handler(), where the handler
may still run after addr_waitq has been freed, acquire addr_wq_lock, and
access freed memory, leading to a use-after-free.

Fix this by calling timer_shutdown_sync() before taking addr_wq_lock.  This
guarantees that any in-flight timer handler has finished and prevents the
timer from being re-armed during teardown, making subsequent cleanup safe.

Fixes: 4db67e808640 ("sctp: Make the address lists per network namespace")
Reported-by: Sashiko <sashiko-bot@kernel.org>
Signed-off-by: Xin Long <lucien.xin@gmail.com>
---
 net/sctp/protocol.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 587b0017a67d..cf335494bffe 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -663,8 +663,9 @@ static void sctp_free_addr_wq(struct net *net)
 	struct sctp_sockaddr_entry *addrw;
 	struct sctp_sockaddr_entry *temp;
 
+	timer_shutdown_sync(&net->sctp.addr_wq_timer);
+
 	spin_lock_bh(&net->sctp.addr_wq_lock);
-	timer_delete(&net->sctp.addr_wq_timer);
 	list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) {
 		list_del(&addrw->list);
 		kfree(addrw);
-- 
2.47.1


^ permalink raw reply related

* [PATCH net-next v2 0/4] net: dsa: motorcomm: Add LED support
From: David Yang @ 2026-06-29 18:31 UTC (permalink / raw)
  To: netdev
  Cc: David Yang, Andrew Lunn, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, linux-kernel

v1: https://lore.kernel.org/r/20260618202716.2166450-1-mmyangfl@gmail.com
  - set up polarity correctly
  - do not set up .brightness_get() to prevent deadlock

David Yang (4):
  net: dsa: motorcomm: Move to subdirectory
  net: dsa: motorcomm: Split SMI module
  net: dsa: motorcomm: Dynamically allocate port structures
  net: dsa: motorcomm: Add LED support

 MAINTAINERS                                   |   2 +-
 drivers/net/dsa/Kconfig                       |  10 +-
 drivers/net/dsa/Makefile                      |   2 +-
 drivers/net/dsa/motorcomm/Kconfig             |  17 +
 drivers/net/dsa/motorcomm/Makefile            |   5 +
 .../net/dsa/{yt921x.c => motorcomm/chip.c}    | 336 ++++-------
 .../net/dsa/{yt921x.h => motorcomm/chip.h}    |  24 +-
 drivers/net/dsa/motorcomm/leds.c              | 530 ++++++++++++++++++
 drivers/net/dsa/motorcomm/leds.h              | 108 ++++
 drivers/net/dsa/motorcomm/smi.c               | 157 ++++++
 drivers/net/dsa/motorcomm/smi.h               |  88 +++
 11 files changed, 1029 insertions(+), 250 deletions(-)
 create mode 100644 drivers/net/dsa/motorcomm/Kconfig
 create mode 100644 drivers/net/dsa/motorcomm/Makefile
 rename drivers/net/dsa/{yt921x.c => motorcomm/chip.c} (95%)
 rename drivers/net/dsa/{yt921x.h => motorcomm/chip.h} (98%)
 create mode 100644 drivers/net/dsa/motorcomm/leds.c
 create mode 100644 drivers/net/dsa/motorcomm/leds.h
 create mode 100644 drivers/net/dsa/motorcomm/smi.c
 create mode 100644 drivers/net/dsa/motorcomm/smi.h

-- 
2.53.0


^ permalink raw reply

* [PATCH net-next v2 1/4] net: dsa: motorcomm: Move to subdirectory
From: David Yang @ 2026-06-29 18:31 UTC (permalink / raw)
  To: netdev
  Cc: David Yang, Andrew Lunn, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, linux-kernel
In-Reply-To: <20260629183137.541341-1-mmyangfl@gmail.com>

yt921x is already the longest single-file DSA driver, so it's time to
split it into parts.

Signed-off-by: David Yang <mmyangfl@gmail.com>
---
 MAINTAINERS                                    |  2 +-
 drivers/net/dsa/Kconfig                        | 10 ++--------
 drivers/net/dsa/Makefile                       |  2 +-
 drivers/net/dsa/motorcomm/Kconfig              |  8 ++++++++
 drivers/net/dsa/motorcomm/Makefile             |  3 +++
 drivers/net/dsa/{yt921x.c => motorcomm/chip.c} |  2 +-
 drivers/net/dsa/{yt921x.h => motorcomm/chip.h} |  0
 7 files changed, 16 insertions(+), 11 deletions(-)
 create mode 100644 drivers/net/dsa/motorcomm/Kconfig
 create mode 100644 drivers/net/dsa/motorcomm/Makefile
 rename drivers/net/dsa/{yt921x.c => motorcomm/chip.c} (99%)
 rename drivers/net/dsa/{yt921x.h => motorcomm/chip.h} (100%)

diff --git a/MAINTAINERS b/MAINTAINERS
index 06df1171f4cf..b007f20b2763 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18039,7 +18039,7 @@ M:	David Yang <mmyangfl@gmail.com>
 L:	netdev@vger.kernel.org
 S:	Maintained
 F:	Documentation/devicetree/bindings/net/dsa/motorcomm,yt921x.yaml
-F:	drivers/net/dsa/yt921x.*
+F:	drivers/net/dsa/motorcomm/
 F:	net/dsa/tag_yt921x.c
 
 MOXA SMARTIO/INDUSTIO/INTELLIO SERIAL CARD
diff --git a/drivers/net/dsa/Kconfig b/drivers/net/dsa/Kconfig
index 4ab567c5bbaf..98e9bbe47de7 100644
--- a/drivers/net/dsa/Kconfig
+++ b/drivers/net/dsa/Kconfig
@@ -72,6 +72,8 @@ config NET_DSA_MV88E6060
 
 source "drivers/net/dsa/microchip/Kconfig"
 
+source "drivers/net/dsa/motorcomm/Kconfig"
+
 source "drivers/net/dsa/mv88e6xxx/Kconfig"
 
 source "drivers/net/dsa/mxl862xx/Kconfig"
@@ -158,12 +160,4 @@ config NET_DSA_VITESSE_VSC73XX_PLATFORM
 	  This enables support for the Vitesse VSC7385, VSC7388, VSC7395
 	  and VSC7398 SparX integrated ethernet switches, connected over
 	  a CPU-attached address bus and work in memory-mapped I/O mode.
-
-config NET_DSA_YT921X
-	tristate "Motorcomm YT9215 ethernet switch chip support"
-	select NET_DSA_TAG_YT921X
-	select NET_IEEE8021Q_HELPERS if DCB
-	help
-	  This enables support for the Motorcomm YT9215 ethernet switch
-	  chip.
 endmenu
diff --git a/drivers/net/dsa/Makefile b/drivers/net/dsa/Makefile
index d2975badffc0..138225baa4d5 100644
--- a/drivers/net/dsa/Makefile
+++ b/drivers/net/dsa/Makefile
@@ -14,11 +14,11 @@ obj-$(CONFIG_NET_DSA_SMSC_LAN9303_MDIO) += lan9303_mdio.o
 obj-$(CONFIG_NET_DSA_VITESSE_VSC73XX) += vitesse-vsc73xx-core.o
 obj-$(CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM) += vitesse-vsc73xx-platform.o
 obj-$(CONFIG_NET_DSA_VITESSE_VSC73XX_SPI) += vitesse-vsc73xx-spi.o
-obj-$(CONFIG_NET_DSA_YT921X) += yt921x.o
 obj-y				+= b53/
 obj-y				+= hirschmann/
 obj-y				+= lantiq/
 obj-y				+= microchip/
+obj-y				+= motorcomm/
 obj-y				+= mv88e6xxx/
 obj-y				+= mxl862xx/
 obj-y				+= netc/
diff --git a/drivers/net/dsa/motorcomm/Kconfig b/drivers/net/dsa/motorcomm/Kconfig
new file mode 100644
index 000000000000..1fddd386f866
--- /dev/null
+++ b/drivers/net/dsa/motorcomm/Kconfig
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: ISC
+config NET_DSA_YT921X
+	tristate "Motorcomm YT9215 ethernet switch chip support"
+	select NET_DSA_TAG_YT921X
+	select NET_IEEE8021Q_HELPERS if DCB
+	help
+	  This enables support for the Motorcomm YT9215 ethernet switch
+	  chip.
diff --git a/drivers/net/dsa/motorcomm/Makefile b/drivers/net/dsa/motorcomm/Makefile
new file mode 100644
index 000000000000..afd03be9fa35
--- /dev/null
+++ b/drivers/net/dsa/motorcomm/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: ISC
+obj-$(CONFIG_NET_DSA_YT921X) += yt921x.o
+yt921x-objs := chip.o
diff --git a/drivers/net/dsa/yt921x.c b/drivers/net/dsa/motorcomm/chip.c
similarity index 99%
rename from drivers/net/dsa/yt921x.c
rename to drivers/net/dsa/motorcomm/chip.c
index 159b16606f6c..f070732845eb 100644
--- a/drivers/net/dsa/yt921x.c
+++ b/drivers/net/dsa/motorcomm/chip.c
@@ -26,7 +26,7 @@
 #include <net/ieee8021q.h>
 #include <net/pkt_cls.h>
 
-#include "yt921x.h"
+#include "chip.h"
 
 struct yt921x_mib_desc {
 	unsigned int size;
diff --git a/drivers/net/dsa/yt921x.h b/drivers/net/dsa/motorcomm/chip.h
similarity index 100%
rename from drivers/net/dsa/yt921x.h
rename to drivers/net/dsa/motorcomm/chip.h
-- 
2.53.0


^ permalink raw reply related

* [PATCH net-next v2 2/4] net: dsa: motorcomm: Split SMI module
From: David Yang @ 2026-06-29 18:31 UTC (permalink / raw)
  To: netdev
  Cc: David Yang, Andrew Lunn, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, linux-kernel
In-Reply-To: <20260629183137.541341-1-mmyangfl@gmail.com>

SMI operations are going to be used across different modules.

Signed-off-by: David Yang <mmyangfl@gmail.com>
---
 drivers/net/dsa/motorcomm/Makefile |   1 +
 drivers/net/dsa/motorcomm/chip.c   | 207 +----------------------------
 drivers/net/dsa/motorcomm/smi.c    | 157 ++++++++++++++++++++++
 drivers/net/dsa/motorcomm/smi.h    |  88 ++++++++++++
 4 files changed, 247 insertions(+), 206 deletions(-)
 create mode 100644 drivers/net/dsa/motorcomm/smi.c
 create mode 100644 drivers/net/dsa/motorcomm/smi.h

diff --git a/drivers/net/dsa/motorcomm/Makefile b/drivers/net/dsa/motorcomm/Makefile
index afd03be9fa35..6cea5313a444 100644
--- a/drivers/net/dsa/motorcomm/Makefile
+++ b/drivers/net/dsa/motorcomm/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: ISC
 obj-$(CONFIG_NET_DSA_YT921X) += yt921x.o
 yt921x-objs := chip.o
+yt921x-objs += smi.o
diff --git a/drivers/net/dsa/motorcomm/chip.c b/drivers/net/dsa/motorcomm/chip.c
index f070732845eb..6dee25b6754a 100644
--- a/drivers/net/dsa/motorcomm/chip.c
+++ b/drivers/net/dsa/motorcomm/chip.c
@@ -13,7 +13,6 @@
 #include <linux/if_bridge.h>
 #include <linux/if_hsr.h>
 #include <linux/if_vlan.h>
-#include <linux/iopoll.h>
 #include <linux/mdio.h>
 #include <linux/module.h>
 #include <linux/of.h>
@@ -27,6 +26,7 @@
 #include <net/pkt_cls.h>
 
 #include "chip.h"
+#include "smi.h"
 
 struct yt921x_mib_desc {
 	unsigned int size;
@@ -155,9 +155,6 @@ static const struct yt921x_info yt921x_infos[] = {
 
 #define YT921X_VID_UNWARE	4095
 
-#define YT921X_POLL_SLEEP_US	10000
-#define YT921X_POLL_TIMEOUT_US	100000
-
 /* The interval should be small enough to avoid overflow of 32bit MIBs.
  *
  * Until we can read MIBs from stats64 call directly (i.e. sleep
@@ -196,208 +193,6 @@ static u32 ethaddr_lo2_to_u32(const unsigned char *addr)
 	return (addr[4] << 8) | addr[5];
 }
 
-static int yt921x_reg_read(struct yt921x_priv *priv, u32 reg, u32 *valp)
-{
-	WARN_ON(!mutex_is_locked(&priv->reg_lock));
-
-	return priv->reg_ops->read(priv->reg_ctx, reg, valp);
-}
-
-static int yt921x_reg_write(struct yt921x_priv *priv, u32 reg, u32 val)
-{
-	WARN_ON(!mutex_is_locked(&priv->reg_lock));
-
-	return priv->reg_ops->write(priv->reg_ctx, reg, val);
-}
-
-static int
-yt921x_reg_wait(struct yt921x_priv *priv, u32 reg, u32 mask, u32 *valp)
-{
-	u32 val;
-	int res;
-	int ret;
-
-	ret = read_poll_timeout(yt921x_reg_read, res,
-				res || (val & mask) == *valp,
-				YT921X_POLL_SLEEP_US, YT921X_POLL_TIMEOUT_US,
-				false, priv, reg, &val);
-	if (ret)
-		return ret;
-	if (res)
-		return res;
-
-	*valp = val;
-	return 0;
-}
-
-static int
-yt921x_reg_update_bits(struct yt921x_priv *priv, u32 reg, u32 mask, u32 val)
-{
-	int res;
-	u32 v;
-	u32 u;
-
-	res = yt921x_reg_read(priv, reg, &v);
-	if (res)
-		return res;
-
-	u = v;
-	u &= ~mask;
-	u |= val;
-	if (u == v)
-		return 0;
-
-	return yt921x_reg_write(priv, reg, u);
-}
-
-static int yt921x_reg_set_bits(struct yt921x_priv *priv, u32 reg, u32 mask)
-{
-	return yt921x_reg_update_bits(priv, reg, 0, mask);
-}
-
-static int yt921x_reg_clear_bits(struct yt921x_priv *priv, u32 reg, u32 mask)
-{
-	return yt921x_reg_update_bits(priv, reg, mask, 0);
-}
-
-static int
-yt921x_reg_toggle_bits(struct yt921x_priv *priv, u32 reg, u32 mask, bool set)
-{
-	return yt921x_reg_update_bits(priv, reg, mask, !set ? 0 : mask);
-}
-
-/* Some multi-word registers, like VLANn_CTRL, should be treated as a single
- * long register. More specifically, writes to parts of its words won't become
- * visible, until the last word is written.
- *
- * Here we require full read and write operations over these registers to
- * eliminate potential issues, although partial reads/writes are also possible.
- */
-
-static void update_ctrls_unaligned(u32 *lo, u32 *hi, u64 mask, u64 val)
-{
-	*lo &= ~lower_32_bits(mask);
-	*hi &= ~upper_32_bits(mask);
-	*lo |= lower_32_bits(val);
-	*hi |= upper_32_bits(val);
-}
-
-static int
-yt921x_regs_read(struct yt921x_priv *priv, u32 reg, u32 *vals,
-		 unsigned int num_regs)
-{
-	int res;
-
-	for (unsigned int i = 0; i < num_regs; i++) {
-		res = yt921x_reg_read(priv, reg + 4 * i, &vals[i]);
-		if (res)
-			return res;
-	}
-
-	return 0;
-}
-
-static int
-yt921x_regs_write(struct yt921x_priv *priv, u32 reg, const u32 *vals,
-		  unsigned int num_regs)
-{
-	int res;
-
-	for (unsigned int i = 0; i < num_regs; i++) {
-		res = yt921x_reg_write(priv, reg + 4 * i, vals[i]);
-		if (res)
-			return res;
-	}
-
-	return 0;
-}
-
-static int
-yt921x_regs_update_bits(struct yt921x_priv *priv, u32 reg, const u32 *masks,
-			const u32 *vals, unsigned int num_regs)
-{
-	bool changed = false;
-	u32 vs[4];
-	int res;
-
-	BUILD_BUG_ON(num_regs > ARRAY_SIZE(vs));
-
-	res = yt921x_regs_read(priv, reg, vs, num_regs);
-	if (res)
-		return res;
-
-	for (unsigned int i = 0; i < num_regs; i++) {
-		u32 u = vs[i];
-
-		u &= ~masks[i];
-		u |= vals[i];
-		if (u != vs[i])
-			changed = true;
-
-		vs[i] = u;
-	}
-
-	if (!changed)
-		return 0;
-
-	return yt921x_regs_write(priv, reg, vs, num_regs);
-}
-
-static int
-yt921x_regs_clear_bits(struct yt921x_priv *priv, u32 reg, const u32 *masks,
-		       unsigned int num_regs)
-{
-	bool changed = false;
-	u32 vs[4];
-	int res;
-
-	BUILD_BUG_ON(num_regs > ARRAY_SIZE(vs));
-
-	res = yt921x_regs_read(priv, reg, vs, num_regs);
-	if (res)
-		return res;
-
-	for (unsigned int i = 0; i < num_regs; i++) {
-		u32 u = vs[i];
-
-		u &= ~masks[i];
-		if (u != vs[i])
-			changed = true;
-
-		vs[i] = u;
-	}
-
-	if (!changed)
-		return 0;
-
-	return yt921x_regs_write(priv, reg, vs, num_regs);
-}
-
-static int
-yt921x_reg64_write(struct yt921x_priv *priv, u32 reg, const u32 *vals)
-{
-	return yt921x_regs_write(priv, reg, vals, 2);
-}
-
-static int
-yt921x_reg64_update_bits(struct yt921x_priv *priv, u32 reg, const u32 *masks,
-			 const u32 *vals)
-{
-	return yt921x_regs_update_bits(priv, reg, masks, vals, 2);
-}
-
-static int
-yt921x_reg64_clear_bits(struct yt921x_priv *priv, u32 reg, const u32 *masks)
-{
-	return yt921x_regs_clear_bits(priv, reg, masks, 2);
-}
-
-static int
-yt921x_reg96_write(struct yt921x_priv *priv, u32 reg, const u32 *vals)
-{
-	return yt921x_regs_write(priv, reg, vals, 3);
-}
-
 static int yt921x_reg_mdio_read(void *context, u32 reg, u32 *valp)
 {
 	struct yt921x_reg_mdio *mdio = context;
diff --git a/drivers/net/dsa/motorcomm/smi.c b/drivers/net/dsa/motorcomm/smi.c
new file mode 100644
index 000000000000..9054896e4cd1
--- /dev/null
+++ b/drivers/net/dsa/motorcomm/smi.c
@@ -0,0 +1,157 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2026 David Yang
+ */
+
+#include <linux/iopoll.h>
+
+#include "chip.h"
+#include "smi.h"
+
+#define YT921X_POLL_SLEEP_US	10000
+#define YT921X_POLL_TIMEOUT_US	100000
+
+int yt921x_reg_read(struct yt921x_priv *priv, u32 reg, u32 *valp)
+{
+	lockdep_assert_held_once(&priv->reg_lock);
+
+	return priv->reg_ops->read(priv->reg_ctx, reg, valp);
+}
+
+int yt921x_reg_write(struct yt921x_priv *priv, u32 reg, u32 val)
+{
+	lockdep_assert_held_once(&priv->reg_lock);
+
+	return priv->reg_ops->write(priv->reg_ctx, reg, val);
+}
+
+int yt921x_reg_wait(struct yt921x_priv *priv, u32 reg, u32 mask, u32 *valp)
+{
+	u32 val;
+	int res;
+	int ret;
+
+	ret = read_poll_timeout(yt921x_reg_read, res,
+				res || (val & mask) == *valp,
+				YT921X_POLL_SLEEP_US, YT921X_POLL_TIMEOUT_US,
+				false, priv, reg, &val);
+	if (ret)
+		return ret;
+	if (res)
+		return res;
+
+	*valp = val;
+	return 0;
+}
+
+int yt921x_reg_update_bits(struct yt921x_priv *priv, u32 reg, u32 mask, u32 val)
+{
+	int res;
+	u32 v;
+	u32 u;
+
+	res = yt921x_reg_read(priv, reg, &v);
+	if (res)
+		return res;
+
+	u = v;
+	u &= ~mask;
+	u |= val;
+	if (u == v)
+		return 0;
+
+	return yt921x_reg_write(priv, reg, u);
+}
+
+int
+yt921x_regs_read(struct yt921x_priv *priv, u32 reg, u32 *vals,
+		 unsigned int num_regs)
+{
+	int res;
+
+	for (unsigned int i = 0; i < num_regs; i++) {
+		res = yt921x_reg_read(priv, reg + 4 * i, &vals[i]);
+		if (res)
+			return res;
+	}
+
+	return 0;
+}
+
+int
+yt921x_regs_write(struct yt921x_priv *priv, u32 reg, const u32 *vals,
+		  unsigned int num_regs)
+{
+	int res;
+
+	for (unsigned int i = 0; i < num_regs; i++) {
+		res = yt921x_reg_write(priv, reg + 4 * i, vals[i]);
+		if (res)
+			return res;
+	}
+
+	return 0;
+}
+
+int
+yt921x_regs_update_bits(struct yt921x_priv *priv, u32 reg, const u32 *masks,
+			const u32 *vals, unsigned int num_regs)
+{
+	bool changed = false;
+	u32 vs[4];
+	int res;
+
+	if (WARN_ON_ONCE(num_regs > ARRAY_SIZE(vs)))
+		return -EINVAL;
+
+	res = yt921x_regs_read(priv, reg, vs, num_regs);
+	if (res)
+		return res;
+
+	for (unsigned int i = 0; i < num_regs; i++) {
+		u32 u = vs[i];
+
+		u &= ~masks[i];
+		u |= vals[i];
+		if (u != vs[i])
+			changed = true;
+
+		vs[i] = u;
+	}
+
+	if (!changed)
+		return 0;
+
+	return yt921x_regs_write(priv, reg, vs, num_regs);
+}
+
+int
+yt921x_regs_clear_bits(struct yt921x_priv *priv, u32 reg, const u32 *masks,
+		       unsigned int num_regs)
+{
+	bool changed = false;
+	u32 vs[4];
+	int res;
+
+	if (WARN_ON_ONCE(num_regs > ARRAY_SIZE(vs)))
+		return -EINVAL;
+
+	res = yt921x_regs_read(priv, reg, vs, num_regs);
+	if (res)
+		return res;
+
+	for (unsigned int i = 0; i < num_regs; i++) {
+		u32 u = vs[i];
+
+		u &= ~masks[i];
+		if (u != vs[i])
+			changed = true;
+
+		vs[i] = u;
+	}
+
+	if (!changed)
+		return 0;
+
+	return yt921x_regs_write(priv, reg, vs, num_regs);
+}
diff --git a/drivers/net/dsa/motorcomm/smi.h b/drivers/net/dsa/motorcomm/smi.h
new file mode 100644
index 000000000000..2e956065eb90
--- /dev/null
+++ b/drivers/net/dsa/motorcomm/smi.h
@@ -0,0 +1,88 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2026 David Yang
+ */
+
+#ifndef _YT_SMI_H
+#define _YT_SMI_H
+
+#include <linux/types.h>
+#include <linux/wordpart.h>
+
+struct yt921x_priv;
+
+int yt921x_reg_read(struct yt921x_priv *priv, u32 reg, u32 *valp);
+int yt921x_reg_write(struct yt921x_priv *priv, u32 reg, u32 val);
+int yt921x_reg_wait(struct yt921x_priv *priv, u32 reg, u32 mask, u32 *valp);
+int yt921x_reg_update_bits(struct yt921x_priv *priv, u32 reg, u32 mask,
+			   u32 val);
+
+static inline int
+yt921x_reg_set_bits(struct yt921x_priv *priv, u32 reg, u32 mask)
+{
+	return yt921x_reg_update_bits(priv, reg, 0, mask);
+}
+
+static inline int
+yt921x_reg_clear_bits(struct yt921x_priv *priv, u32 reg, u32 mask)
+{
+	return yt921x_reg_update_bits(priv, reg, mask, 0);
+}
+
+static inline int
+yt921x_reg_toggle_bits(struct yt921x_priv *priv, u32 reg, u32 mask, bool set)
+{
+	return yt921x_reg_update_bits(priv, reg, mask, !set ? 0 : mask);
+}
+
+/* Some multi-word registers, like VLANn_CTRL, should be treated as a single
+ * long register. More specifically, writes to parts of its words won't become
+ * visible, until the last word is written.
+ *
+ * Here we require full read and write operations over these registers to
+ * eliminate potential issues, although partial reads/writes are also possible.
+ */
+
+static inline void update_ctrls_unaligned(u32 *lo, u32 *hi, u64 mask, u64 val)
+{
+	*lo &= ~lower_32_bits(mask);
+	*hi &= ~upper_32_bits(mask);
+	*lo |= lower_32_bits(val);
+	*hi |= upper_32_bits(val);
+}
+
+int yt921x_regs_read(struct yt921x_priv *priv, u32 reg, u32 *vals,
+		     unsigned int num_regs);
+int yt921x_regs_write(struct yt921x_priv *priv, u32 reg, const u32 *vals,
+		      unsigned int num_regs);
+int yt921x_regs_update_bits(struct yt921x_priv *priv, u32 reg, const u32 *masks,
+			    const u32 *vals, unsigned int num_regs);
+int yt921x_regs_clear_bits(struct yt921x_priv *priv, u32 reg, const u32 *masks,
+			   unsigned int num_regs);
+
+static inline int
+yt921x_reg64_write(struct yt921x_priv *priv, u32 reg, const u32 *vals)
+{
+	return yt921x_regs_write(priv, reg, vals, 2);
+}
+
+static inline int
+yt921x_reg64_update_bits(struct yt921x_priv *priv, u32 reg, const u32 *masks,
+			 const u32 *vals)
+{
+	return yt921x_regs_update_bits(priv, reg, masks, vals, 2);
+}
+
+static inline int
+yt921x_reg64_clear_bits(struct yt921x_priv *priv, u32 reg, const u32 *masks)
+{
+	return yt921x_regs_clear_bits(priv, reg, masks, 2);
+}
+
+static inline int
+yt921x_reg96_write(struct yt921x_priv *priv, u32 reg, const u32 *vals)
+{
+	return yt921x_regs_write(priv, reg, vals, 3);
+}
+
+#endif
-- 
2.53.0


^ permalink raw reply related

* [PATCH net-next v2 3/4] net: dsa: motorcomm: Dynamically allocate port structures
From: David Yang @ 2026-06-29 18:31 UTC (permalink / raw)
  To: netdev
  Cc: David Yang, Andrew Lunn, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, linux-kernel
In-Reply-To: <20260629183137.541341-1-mmyangfl@gmail.com>

With LED support introduced later, struct yt921x_priv will be 17k,
which is not very good for a single kmalloc(). Convert the ports array
to an array of pointers to stop bloating the priv struct.

Signed-off-by: David Yang <mmyangfl@gmail.com>
---
 drivers/net/dsa/motorcomm/chip.c | 119 +++++++++++++++++++++++--------
 drivers/net/dsa/motorcomm/chip.h |   6 +-
 2 files changed, 94 insertions(+), 31 deletions(-)

diff --git a/drivers/net/dsa/motorcomm/chip.c b/drivers/net/dsa/motorcomm/chip.c
index 6dee25b6754a..99d3c33e197b 100644
--- a/drivers/net/dsa/motorcomm/chip.c
+++ b/drivers/net/dsa/motorcomm/chip.c
@@ -548,11 +548,15 @@ yt921x_mbus_ext_init(struct yt921x_priv *priv, struct device_node *mnp)
 /* Read and handle overflow of 32bit MIBs. MIB buffer must be zeroed before. */
 static int yt921x_read_mib(struct yt921x_priv *priv, int port)
 {
-	struct yt921x_port *pp = &priv->ports[port];
+	struct yt921x_port *pp = priv->ports[port];
 	struct device *dev = to_device(priv);
-	struct yt921x_mib *mib = &pp->mib;
+	struct yt921x_mib *mib;
 	int res = 0;
 
+	if (!pp)
+		return -ENODEV;
+	mib = &pp->mib;
+
 	/* Reading of yt921x_port::mib is not protected by a lock and it's vain
 	 * to keep its consistency, since we have to read registers one by one
 	 * and there is no way to make a snapshot of MIB stats.
@@ -609,9 +613,8 @@ static void yt921x_poll_mib(struct work_struct *work)
 {
 	struct yt921x_port *pp = container_of_const(work, struct yt921x_port,
 						    mib_read.work);
-	struct yt921x_priv *priv = (void *)(pp - pp->index) -
-				   offsetof(struct yt921x_priv, ports);
 	unsigned long delay = YT921X_STATS_INTERVAL_JIFFIES;
+	struct yt921x_priv *priv = pp->priv;
 	int port = pp->index;
 	int res;
 
@@ -643,10 +646,14 @@ static void
 yt921x_dsa_get_ethtool_stats(struct dsa_switch *ds, int port, uint64_t *data)
 {
 	struct yt921x_priv *priv = to_yt921x_priv(ds);
-	struct yt921x_port *pp = &priv->ports[port];
-	struct yt921x_mib *mib = &pp->mib;
+	struct yt921x_port *pp = priv->ports[port];
+	struct yt921x_mib *mib;
 	size_t j;
 
+	if (!pp)
+		return;
+	mib = &pp->mib;
+
 	mutex_lock(&priv->reg_lock);
 	yt921x_read_mib(priv, port);
 	mutex_unlock(&priv->reg_lock);
@@ -685,8 +692,12 @@ yt921x_dsa_get_eth_mac_stats(struct dsa_switch *ds, int port,
 			     struct ethtool_eth_mac_stats *mac_stats)
 {
 	struct yt921x_priv *priv = to_yt921x_priv(ds);
-	struct yt921x_port *pp = &priv->ports[port];
-	struct yt921x_mib *mib = &pp->mib;
+	struct yt921x_port *pp = priv->ports[port];
+	struct yt921x_mib *mib;
+
+	if (!pp)
+		return;
+	mib = &pp->mib;
 
 	mutex_lock(&priv->reg_lock);
 	yt921x_read_mib(priv, port);
@@ -721,8 +732,12 @@ yt921x_dsa_get_eth_ctrl_stats(struct dsa_switch *ds, int port,
 			      struct ethtool_eth_ctrl_stats *ctrl_stats)
 {
 	struct yt921x_priv *priv = to_yt921x_priv(ds);
-	struct yt921x_port *pp = &priv->ports[port];
-	struct yt921x_mib *mib = &pp->mib;
+	struct yt921x_port *pp = priv->ports[port];
+	struct yt921x_mib *mib;
+
+	if (!pp)
+		return;
+	mib = &pp->mib;
 
 	mutex_lock(&priv->reg_lock);
 	yt921x_read_mib(priv, port);
@@ -750,8 +765,12 @@ yt921x_dsa_get_rmon_stats(struct dsa_switch *ds, int port,
 			  const struct ethtool_rmon_hist_range **ranges)
 {
 	struct yt921x_priv *priv = to_yt921x_priv(ds);
-	struct yt921x_port *pp = &priv->ports[port];
-	struct yt921x_mib *mib = &pp->mib;
+	struct yt921x_port *pp = priv->ports[port];
+	struct yt921x_mib *mib;
+
+	if (!pp)
+		return;
+	mib = &pp->mib;
 
 	mutex_lock(&priv->reg_lock);
 	yt921x_read_mib(priv, port);
@@ -786,8 +805,12 @@ yt921x_dsa_get_stats64(struct dsa_switch *ds, int port,
 		       struct rtnl_link_stats64 *stats)
 {
 	struct yt921x_priv *priv = to_yt921x_priv(ds);
-	struct yt921x_port *pp = &priv->ports[port];
-	struct yt921x_mib *mib = &pp->mib;
+	struct yt921x_port *pp = priv->ports[port];
+	struct yt921x_mib *mib;
+
+	if (!pp)
+		return;
+	mib = &pp->mib;
 
 	stats->rx_length_errors = mib->rx_undersize_errors +
 				  mib->rx_fragment_errors;
@@ -822,8 +845,12 @@ yt921x_dsa_get_pause_stats(struct dsa_switch *ds, int port,
 			   struct ethtool_pause_stats *pause_stats)
 {
 	struct yt921x_priv *priv = to_yt921x_priv(ds);
-	struct yt921x_port *pp = &priv->ports[port];
-	struct yt921x_mib *mib = &pp->mib;
+	struct yt921x_port *pp = priv->ports[port];
+	struct yt921x_mib *mib;
+
+	if (!pp)
+		return;
+	mib = &pp->mib;
 
 	mutex_lock(&priv->reg_lock);
 	yt921x_read_mib(priv, port);
@@ -3332,15 +3359,20 @@ static int yt921x_bridge(struct yt921x_priv *priv, u16 ports_mask)
 
 	isolated_mask = 0;
 	for_each_set_bit(port, &targets_mask, YT921X_PORT_NUM) {
-		struct yt921x_port *pp = &priv->ports[port];
+		struct yt921x_port *pp = priv->ports[port];
 
+		if (!pp)
+			continue;
 		if (pp->isolated)
 			isolated_mask |= BIT(port);
 	}
 
 	/* Block from non-cpu bridge ports ... */
 	for_each_set_bit(port, &targets_mask, YT921X_PORT_NUM) {
-		struct yt921x_port *pp = &priv->ports[port];
+		struct yt921x_port *pp = priv->ports[port];
+
+		if (!pp)
+			continue;
 
 		/* to non-bridge ports */
 		ctrl = ~ports_mask;
@@ -3397,11 +3429,14 @@ static int
 yt921x_bridge_flags(struct yt921x_priv *priv, int port,
 		    struct switchdev_brport_flags flags)
 {
-	struct yt921x_port *pp = &priv->ports[port];
+	struct yt921x_port *pp = priv->ports[port];
 	bool do_flush;
 	u32 mask;
 	int res;
 
+	if (!pp)
+		return -ENODEV;
+
 	if (flags.mask & BR_LEARNING) {
 		bool learning = flags.val & BR_LEARNING;
 
@@ -3954,11 +3989,16 @@ yt921x_phylink_mac_link_down(struct phylink_config *config, unsigned int mode,
 {
 	struct dsa_port *dp = dsa_phylink_to_port(config);
 	struct yt921x_priv *priv = to_yt921x_priv(dp->ds);
+	struct yt921x_port *pp;
 	int port = dp->index;
 	int res;
 
+	pp = priv->ports[port];
+	if (!pp)
+		return;
+
 	/* No need to sync; port control block is hold until device remove */
-	cancel_delayed_work(&priv->ports[port].mib_read);
+	cancel_delayed_work(&pp->mib_read);
 
 	mutex_lock(&priv->reg_lock);
 	res = yt921x_port_down(priv, port);
@@ -3977,9 +4017,14 @@ yt921x_phylink_mac_link_up(struct phylink_config *config,
 {
 	struct dsa_port *dp = dsa_phylink_to_port(config);
 	struct yt921x_priv *priv = to_yt921x_priv(dp->ds);
+	struct yt921x_port *pp;
 	int port = dp->index;
 	int res;
 
+	pp = priv->ports[port];
+	if (!pp)
+		return;
+
 	mutex_lock(&priv->reg_lock);
 	res = yt921x_port_up(priv, port, mode, interface, speed, duplex,
 			     tx_pause, rx_pause);
@@ -3989,7 +4034,7 @@ yt921x_phylink_mac_link_up(struct phylink_config *config,
 		dev_err(dp->ds->dev, "Failed to %s port %d: %i\n", "bring up",
 			port, res);
 
-	schedule_delayed_work(&priv->ports[port].mib_read, 0);
+	schedule_delayed_work(&pp->mib_read, 0);
 }
 
 static void
@@ -4574,6 +4619,26 @@ static int yt921x_dsa_setup(struct dsa_switch *ds)
 		return -ENODEV;
 	}
 
+	for (int port = 0; port < YT921X_PORT_NUM; port++) {
+		struct yt921x_port *pp = priv->ports[port];
+
+		if (pp)
+			continue;
+		if (port != YT921X_PORT_MCU &&
+		    !(BIT(port) & (priv->info->internal_mask |
+				   priv->info->external_mask)))
+			continue;
+
+		pp = devm_kzalloc(dev, sizeof(*pp), GFP_KERNEL);
+		if (!pp)
+			return -ENOMEM;
+		priv->ports[port] = pp;
+
+		pp->priv = priv;
+		pp->index = port;
+		INIT_DELAYED_WORK(&pp->mib_read, yt921x_poll_mib);
+	}
+
 	mutex_lock(&priv->reg_lock);
 	res = yt921x_chip_setup(priv);
 	mutex_unlock(&priv->reg_lock);
@@ -4682,7 +4747,10 @@ static void yt921x_mdio_remove(struct mdio_device *mdiodev)
 		return;
 
 	for (size_t i = ARRAY_SIZE(priv->ports); i-- > 0; ) {
-		struct yt921x_port *pp = &priv->ports[i];
+		struct yt921x_port *pp = priv->ports[i];
+
+		if (!pp)
+			continue;
 
 		disable_delayed_work_sync(&pp->mib_read);
 	}
@@ -4730,13 +4798,6 @@ static int yt921x_mdio_probe(struct mdio_device *mdiodev)
 	priv->reg_ops = &yt921x_reg_ops_mdio;
 	priv->reg_ctx = mdio;
 
-	for (size_t i = 0; i < ARRAY_SIZE(priv->ports); i++) {
-		struct yt921x_port *pp = &priv->ports[i];
-
-		pp->index = i;
-		INIT_DELAYED_WORK(&pp->mib_read, yt921x_poll_mib);
-	}
-
 	ds = &priv->ds;
 	ds->dev = dev;
 	ds->assisted_learning_on_cpu_port = true;
diff --git a/drivers/net/dsa/motorcomm/chip.h b/drivers/net/dsa/motorcomm/chip.h
index 555046526669..9aecfe7343b8 100644
--- a/drivers/net/dsa/motorcomm/chip.h
+++ b/drivers/net/dsa/motorcomm/chip.h
@@ -851,7 +851,8 @@ enum yt921x_fdb_entry_status {
 #define YT921X_UDF_NUM		8
 
 /* 8 internal + 2 external + 1 mcu */
-#define YT921X_PORT_NUM			11
+#define YT921X_PORT_NUM		11
+#define YT921X_PORT_MCU		10
 
 #define yt921x_port_is_internal(port) ((port) < 8)
 #define yt921x_port_is_external(port) (8 <= (port) && (port) < 9)
@@ -929,6 +930,7 @@ struct yt921x_acl_blk {
 };
 
 struct yt921x_port {
+	struct yt921x_priv *priv;
 	unsigned char index;
 
 	bool hairpin;
@@ -964,7 +966,7 @@ struct yt921x_priv {
 	struct mii_bus *mbus_int;
 	struct mii_bus *mbus_ext;
 
-	struct yt921x_port ports[YT921X_PORT_NUM];
+	struct yt921x_port *ports[YT921X_PORT_NUM];
 
 	u16 eee_ports_mask;
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH net-next v2 4/4] net: dsa: motorcomm: Add LED support
From: David Yang @ 2026-06-29 18:31 UTC (permalink / raw)
  To: netdev
  Cc: David Yang, Andrew Lunn, Vladimir Oltean, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, linux-kernel
In-Reply-To: <20260629183137.541341-1-mmyangfl@gmail.com>

LEDs can be described in the device tree using the same format as qca8k.
Each port can configure up to 3 LEDs.

Signed-off-by: David Yang <mmyangfl@gmail.com>
---
 drivers/net/dsa/motorcomm/Kconfig  |   9 +
 drivers/net/dsa/motorcomm/Makefile |   1 +
 drivers/net/dsa/motorcomm/chip.c   |   8 +-
 drivers/net/dsa/motorcomm/chip.h   |  18 +
 drivers/net/dsa/motorcomm/leds.c   | 530 +++++++++++++++++++++++++++++
 drivers/net/dsa/motorcomm/leds.h   | 108 ++++++
 6 files changed, 672 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/dsa/motorcomm/leds.c
 create mode 100644 drivers/net/dsa/motorcomm/leds.h

diff --git a/drivers/net/dsa/motorcomm/Kconfig b/drivers/net/dsa/motorcomm/Kconfig
index 1fddd386f866..22af3fc91095 100644
--- a/drivers/net/dsa/motorcomm/Kconfig
+++ b/drivers/net/dsa/motorcomm/Kconfig
@@ -6,3 +6,12 @@ config NET_DSA_YT921X
 	help
 	  This enables support for the Motorcomm YT9215 ethernet switch
 	  chip.
+
+config NET_DSA_YT921X_LEDS
+	bool "LED support for Motorcomm YT9215"
+	default y
+	depends on NET_DSA_YT921X
+	depends on LEDS_CLASS=y || LEDS_CLASS=NET_DSA_YT921X
+	help
+	  This enables support for controlling the LEDs attached to the
+	  Motorcomm YT9215 switch chips.
diff --git a/drivers/net/dsa/motorcomm/Makefile b/drivers/net/dsa/motorcomm/Makefile
index 6cea5313a444..5a63db0029ff 100644
--- a/drivers/net/dsa/motorcomm/Makefile
+++ b/drivers/net/dsa/motorcomm/Makefile
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: ISC
 obj-$(CONFIG_NET_DSA_YT921X) += yt921x.o
 yt921x-objs := chip.o
+yt921x-$(CONFIG_NET_DSA_YT921X_LEDS) += leds.o
 yt921x-objs += smi.o
diff --git a/drivers/net/dsa/motorcomm/chip.c b/drivers/net/dsa/motorcomm/chip.c
index 99d3c33e197b..6458207b9457 100644
--- a/drivers/net/dsa/motorcomm/chip.c
+++ b/drivers/net/dsa/motorcomm/chip.c
@@ -26,6 +26,7 @@
 #include <net/pkt_cls.h>
 
 #include "chip.h"
+#include "leds.h"
 #include "smi.h"
 
 struct yt921x_mib_desc {
@@ -151,8 +152,6 @@ static const struct yt921x_info yt921x_infos[] = {
 	{}
 };
 
-#define YT921X_NAME	"yt921x"
-
 #define YT921X_VID_UNWARE	4095
 
 /* The interval should be small enough to avoid overflow of 32bit MIBs.
@@ -4540,6 +4539,7 @@ static int __maybe_unused yt921x_chip_setup_qos(struct yt921x_priv *priv)
 
 static int yt921x_chip_setup(struct yt921x_priv *priv)
 {
+	struct device *dev = to_device(priv);
 	u32 ctrl;
 	int res;
 
@@ -4577,6 +4577,10 @@ static int yt921x_chip_setup(struct yt921x_priv *priv)
 	if (res)
 		return res;
 
+	res = yt921x_leds_setup(priv);
+	if (res)
+		dev_warn(dev, "Failed to setup LEDs: %d\n", res);
+
 	return 0;
 }
 
diff --git a/drivers/net/dsa/motorcomm/chip.h b/drivers/net/dsa/motorcomm/chip.h
index 9aecfe7343b8..3c86ab855e1d 100644
--- a/drivers/net/dsa/motorcomm/chip.h
+++ b/drivers/net/dsa/motorcomm/chip.h
@@ -850,10 +850,14 @@ enum yt921x_fdb_entry_status {
 #define YT921X_ACL_NUM		(YT921X_ACL_BLK_NUM * YT921X_ACL_ENT_PER_BLK)
 #define YT921X_UDF_NUM		8
 
+#define YT921X_LED_GROUP_NUM	3
+
 /* 8 internal + 2 external + 1 mcu */
 #define YT921X_PORT_NUM		11
 #define YT921X_PORT_MCU		10
 
+#define YT921X_NAME	"yt921x"
+
 #define yt921x_port_is_internal(port) ((port) < 8)
 #define yt921x_port_is_external(port) (8 <= (port) && (port) < 9)
 
@@ -929,6 +933,14 @@ struct yt921x_acl_blk {
 	struct yt921x_acl_rule *rules[YT921X_ACL_ENT_PER_BLK];
 };
 
+struct yt921x_led {
+	struct led_classdev cdev;
+	unsigned char group;
+
+	bool use_cycle;
+	bool use_duty;
+};
+
 struct yt921x_port {
 	struct yt921x_priv *priv;
 	unsigned char index;
@@ -940,6 +952,12 @@ struct yt921x_port {
 	struct yt921x_mib mib;
 	u64 rx_frames;
 	u64 tx_frames;
+
+#if IS_ENABLED(CONFIG_NET_DSA_YT921X_LEDS)
+	struct yt921x_led leds[YT921X_LED_GROUP_NUM];
+	unsigned short led_cycle;
+	unsigned short led_duty;
+#endif
 };
 
 struct yt921x_reg_ops {
diff --git a/drivers/net/dsa/motorcomm/leds.c b/drivers/net/dsa/motorcomm/leds.c
new file mode 100644
index 000000000000..73d738c113ef
--- /dev/null
+++ b/drivers/net/dsa/motorcomm/leds.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2026 David Yang
+ */
+
+#include <linux/uleds.h>
+
+#include "chip.h"
+#include "leds.h"
+#include "smi.h"
+
+#define to_yt921x_led(led_cdev) \
+	container_of_const((led_cdev), struct yt921x_led, cdev)
+#define to_yt921x_port(led) \
+	((void *)((led) - (led)->group) - offsetof(struct yt921x_port, leds))
+#define to_yt921x_priv(pp) ((pp)->priv)
+#define to_device(priv) ((priv)->ds.dev)
+
+static u32 yt921x_led_regaddr(struct yt921x_priv *priv, int port, int group)
+{
+	switch (group) {
+	case 0:
+	default:
+		return YT921X_LED0_PORTn(port);
+	case 1:
+		return YT921X_LED1_PORTn(port);
+	case 2:
+		return YT921X_LED2_PORTn(port);
+	}
+}
+
+static int
+yt921x_led_force_get(struct yt921x_priv *priv, int port, int group, bool *onp)
+{
+	u32 val;
+	int res;
+
+	res = yt921x_reg_read(priv, YT921X_LED2_PORTn(port), &val);
+	if (res)
+		return res;
+
+	*onp = (val & YT921X_LED2_PORT_FORCEn_M(group)) ==
+	       YT921X_LED2_PORT_FORCEn_ON(group);
+	return 0;
+}
+
+static int
+yt921x_led_force_set(struct yt921x_priv *priv, int port, int group, bool on)
+{
+	struct yt921x_port *pp = priv->ports[port];
+	struct yt921x_led *led;
+	u32 ctrl;
+	u32 mask;
+
+	if (!pp)
+		return -ENODEV;
+	led = &pp->leds[group];
+
+	led->use_cycle = false;
+	led->use_duty = false;
+
+	mask = YT921X_LED2_PORT_FORCEn_M(group);
+	ctrl = on ? YT921X_LED2_PORT_FORCEn_ON(group) :
+	       YT921X_LED2_PORT_FORCEn_OFF(group);
+	return yt921x_reg_update_bits(priv, YT921X_LED2_PORTn(port), mask,
+				      ctrl);
+}
+
+static int
+yt921x_led_blink_select(const struct yt921x_priv *priv, unsigned long on,
+			unsigned long off, unsigned short *cyclep,
+			unsigned short *dutyp)
+{
+	static const unsigned short dutys[] = {
+		YT921X_LED_DUTY(1, 6),
+		YT921X_LED_DUTY(1, 4),
+		YT921X_LED_DUTY(1, 3),
+		YT921X_LED_DUTY(1, 2),
+	};
+	unsigned int cycle_upper;
+	unsigned int cycle_req;
+	unsigned int duty_req;
+	unsigned int cycle;
+	unsigned int duty;
+
+	if (!on && !off) {
+		*cyclep = YT921X_LED_BLINK_DEF;
+		*dutyp = YT921X_LED_DUTY(1, 2);
+		return 0;
+	}
+
+	cycle = YT921X_LED_BLINK_MAX;
+	cycle_upper = cycle * 11585 / 8192 + 1;  /* M_SQRT2 * cycle */
+	if (check_add_overflow(on, off, &cycle_req) || cycle_req >= cycle_upper)
+		return -EOPNOTSUPP;
+
+	for (; cycle > YT921X_LED_BLINK_MIN; cycle_upper >>= 1, cycle >>= 1)
+		if (cycle_req >= cycle_upper >> 1)
+			break;
+
+	duty_req = YT921X_LED_DUTY(on > off ? off : on, cycle_req);
+	for (unsigned int i = ARRAY_SIZE(dutys) - 1;; i--)
+		if (i <= 0 || duty_req >= (dutys[i - 1] + dutys[i]) / 2) {
+			duty = dutys[i];
+			break;
+		}
+	if (on > off)
+		duty = YT921X_LED_DUTY_DENOM - duty;
+
+	*cyclep = cycle;
+	*dutyp = duty;
+	return 0;
+}
+
+static int
+yt921x_led_blink_set(struct yt921x_priv *priv, int port, int group,
+		     unsigned long *onp, unsigned long *offp)
+{
+	struct yt921x_port *pp = priv->ports[port];
+	struct yt921x_led *led;
+	unsigned short cycle;
+	unsigned short duty;
+	bool change_cycle;
+	bool change_duty;
+	bool use_cycle;
+	u32 ctrl;
+	u32 mask;
+	u32 val;
+	int res;
+
+	if (!pp)
+		return -ENODEV;
+	led = &pp->leds[group];
+
+	res = yt921x_led_blink_select(priv, *onp, *offp, &cycle, &duty);
+	if (res)
+		return res;
+
+	use_cycle = cycle < YT921X_LED_BLINK_DEF;
+	change_cycle = use_cycle && cycle != pp->led_cycle;
+	change_duty = duty != pp->led_duty;
+	if (change_cycle || change_duty)
+		for (unsigned int i = 0; i < YT921X_LED_GROUP_NUM; i++) {
+			if (i == group)
+				continue;
+			if ((change_cycle && pp->leds[i].use_cycle) ||
+			    (change_duty && pp->leds[i].use_duty))
+				return -EOPNOTSUPP;
+		}
+
+	/* The chip seems to jam a while if changing duty directly */
+	res = yt921x_reg_read(priv, YT921X_LED2_PORTn(port), &val);
+	if (res)
+		return res;
+
+	ctrl = val & ~YT921X_LED2_PORT_FORCEn_M(group);
+	ctrl |= YT921X_LED2_PORT_FORCEn_DONTCARE(group);
+	if (val != ctrl) {
+		res = yt921x_reg_write(priv, YT921X_LED2_PORTn(port), ctrl);
+		if (res)
+			return res;
+	}
+
+	mask = YT921X_LED1_PORT_BLINK_DUTY_M | YT921X_LED1_PORT_BLINK_DUTY_COMP;
+	switch (duty >= YT921X_LED_DUTY(1, 2) ? duty :
+		YT921X_LED_DUTY_DENOM - duty) {
+	default:
+		duty = YT921X_LED_DUTY(1, 2);
+		fallthrough;
+	case YT921X_LED_DUTY(1, 2):
+		ctrl = YT921X_LED1_PORT_BLINK_DUTY_1_2;
+		break;
+	case YT921X_LED_DUTY(2, 3):
+		ctrl = YT921X_LED1_PORT_BLINK_DUTY_2_3;
+		break;
+	case YT921X_LED_DUTY(3, 4):
+		ctrl = YT921X_LED1_PORT_BLINK_DUTY_3_4;
+		break;
+	case YT921X_LED_DUTY(5, 6):
+		ctrl = YT921X_LED1_PORT_BLINK_DUTY_5_6;
+		break;
+	}
+	if (duty < YT921X_LED_DUTY(1, 2))
+		ctrl |= YT921X_LED1_PORT_BLINK_DUTY_COMP;
+	if (use_cycle) {
+		mask |= YT921X_LED1_PORT_OTHER_BLINK_M;
+		ctrl |= YT921X_LED1_PORT_OTHER_BLINK(9 - __fls(cycle));
+	}
+	res = yt921x_reg_update_bits(priv, YT921X_LED1_PORTn(port), mask, ctrl);
+	if (res)
+		return res;
+
+	ctrl = val & ~(YT921X_LED2_PORT_FORCEn_M(group) |
+		       YT921X_LED2_PORT_FORCE_BLINKn_M(group));
+	ctrl |= YT921X_LED2_PORT_FORCEn_BLINK(group);
+	if (use_cycle)
+		ctrl |= YT921X_LED2_PORT_FORCE_BLINKn_OTHER(group);
+	else
+		ctrl |= YT921X_LED2_PORT_FORCE_BLINKn(group, __fls(cycle) - 9);
+	res = yt921x_reg_write(priv, YT921X_LED2_PORTn(port), ctrl);
+	if (res)
+		return res;
+
+	led->use_cycle = use_cycle;
+	if (use_cycle)
+		pp->led_cycle = cycle;
+	led->use_duty = true;
+	pp->led_duty = duty;
+
+	*onp = (duty * cycle + YT921X_LED_DUTY_DENOM / 2) /
+	       YT921X_LED_DUTY_DENOM;
+	*offp = cycle - *onp;
+	return 0;
+}
+
+static const u32 yt921x_led_trigger_maps[__TRIGGER_NETDEV_MAX] = {
+	[TRIGGER_NETDEV_LINK]		= YT921X_LEDx_PORT_ACT_DUPLEX_HALF |
+					  YT921X_LEDx_PORT_ACT_DUPLEX_FULL,
+	[TRIGGER_NETDEV_LINK_10]	= YT921X_LEDx_PORT_ACT_10M,
+	[TRIGGER_NETDEV_LINK_100]	= YT921X_LEDx_PORT_ACT_100M,
+	[TRIGGER_NETDEV_LINK_1000]	= YT921X_LEDx_PORT_ACT_1000M,
+	[TRIGGER_NETDEV_HALF_DUPLEX]	= YT921X_LEDx_PORT_ACT_DUPLEX_HALF,
+	[TRIGGER_NETDEV_FULL_DUPLEX]	= YT921X_LEDx_PORT_ACT_DUPLEX_FULL,
+	[TRIGGER_NETDEV_TX]		= YT921X_LEDx_PORT_ACT_TX,
+	[TRIGGER_NETDEV_RX]		= YT921X_LEDx_PORT_ACT_RX,
+};
+
+static bool yt921x_led_trigger_is_supported(int group, unsigned long flags)
+{
+	unsigned int i;
+
+	for_each_set_bit(i, &flags, __TRIGGER_NETDEV_MAX)
+		if (!yt921x_led_trigger_maps[i])
+			return false;
+
+	return true;
+}
+
+static int
+yt921x_led_trigger_get(struct yt921x_priv *priv, int port, int group,
+		       unsigned long *flagsp)
+{
+	u32 addr;
+	u32 val;
+	int res;
+
+	addr = yt921x_led_regaddr(priv, port, group);
+	res = yt921x_reg_read(priv, addr, &val);
+	if (res)
+		return res;
+
+	*flagsp = 0;
+	for (unsigned int i = 0; i < __TRIGGER_NETDEV_MAX; i++)
+		if (val & yt921x_led_trigger_maps[i])
+			*flagsp |= BIT(i);
+
+	return 0;
+}
+
+static int
+yt921x_led_trigger_set(struct yt921x_priv *priv, int port, int group,
+		       unsigned long flags)
+{
+	struct yt921x_port *pp = priv->ports[port];
+	struct yt921x_led *led;
+	unsigned int i;
+	u32 addr;
+	u32 ctrl;
+	u32 mask;
+	int res;
+
+	if (!pp)
+		return -ENODEV;
+	led = &pp->leds[group];
+
+	ctrl = 0;
+	for_each_set_bit(i, &flags, __TRIGGER_NETDEV_MAX) {
+		if (!yt921x_led_trigger_maps[i])
+			return -EOPNOTSUPP;
+
+		ctrl |= yt921x_led_trigger_maps[i];
+	}
+
+	led->use_cycle = false;
+	led->use_duty = false;
+
+	mask = !group ? YT921X_LED0_PORT_ACT_M : YT921X_LEDx_PORT_ACT_M;
+	if (group == 2) {
+		mask |= YT921X_LED2_PORT_FORCEn_M(group);
+		ctrl |= YT921X_LED2_PORT_FORCEn_DONTCARE(group);
+	}
+	addr = yt921x_led_regaddr(priv, port, group);
+	res = yt921x_reg_update_bits(priv, addr, mask, ctrl);
+	if (res)
+		return res;
+
+	if (group != 2) {
+		mask = YT921X_LED2_PORT_FORCEn_M(group);
+		ctrl = YT921X_LED2_PORT_FORCEn_DONTCARE(group);
+		res = yt921x_reg_update_bits(priv, YT921X_LED2_PORTn(port),
+					     mask, ctrl);
+		if (res)
+			return res;
+	}
+
+	return 0;
+}
+
+static int
+yt921x_cled_brightness_set_blocking(struct led_classdev *led_cdev,
+				    enum led_brightness brightness)
+{
+	struct yt921x_led *led = to_yt921x_led(led_cdev);
+	struct yt921x_port *pp = to_yt921x_port(led);
+	struct yt921x_priv *priv = to_yt921x_priv(pp);
+	int res;
+
+	mutex_lock(&priv->reg_lock);
+	res = yt921x_led_force_set(priv, pp->index, led->group, brightness);
+	mutex_unlock(&priv->reg_lock);
+
+	return res;
+}
+
+static int
+yt921x_cled_blink_set(struct led_classdev *led_cdev, unsigned long *delay_on,
+		      unsigned long *delay_off)
+{
+	struct yt921x_led *led = to_yt921x_led(led_cdev);
+	struct yt921x_port *pp = to_yt921x_port(led);
+	struct yt921x_priv *priv = to_yt921x_priv(pp);
+	int res;
+
+	mutex_lock(&priv->reg_lock);
+	res = yt921x_led_blink_set(priv, pp->index, led->group, delay_on,
+				   delay_off);
+	mutex_unlock(&priv->reg_lock);
+
+	return res;
+}
+
+static struct device * __maybe_unused
+yt921x_cled_hw_control_get_device(struct led_classdev *led_cdev)
+{
+	struct yt921x_led *led = to_yt921x_led(led_cdev);
+	struct yt921x_port *pp = to_yt921x_port(led);
+	struct yt921x_priv *priv = to_yt921x_priv(pp);
+	struct dsa_port *dp;
+
+	dp = dsa_to_port(&priv->ds, pp->index);
+	if (!dp || !dp->user)
+		return NULL;
+	return &dp->user->dev;
+}
+
+static int __maybe_unused
+yt921x_cled_hw_control_is_supported(struct led_classdev *led_cdev,
+				    unsigned long flags)
+{
+	struct yt921x_led *led = to_yt921x_led(led_cdev);
+
+	return yt921x_led_trigger_is_supported(led->group, flags) ? 0 :
+	       -EOPNOTSUPP;
+}
+
+static int __maybe_unused
+yt921x_cled_hw_control_get(struct led_classdev *led_cdev, unsigned long *flagsp)
+{
+	struct yt921x_led *led = to_yt921x_led(led_cdev);
+	struct yt921x_port *pp = to_yt921x_port(led);
+	struct yt921x_priv *priv = to_yt921x_priv(pp);
+	int res;
+
+	mutex_lock(&priv->reg_lock);
+	res = yt921x_led_trigger_get(priv, pp->index, led->group, flagsp);
+	mutex_unlock(&priv->reg_lock);
+
+	return res;
+}
+
+static int __maybe_unused
+yt921x_cled_hw_control_set(struct led_classdev *led_cdev, unsigned long flags)
+{
+	struct yt921x_led *led = to_yt921x_led(led_cdev);
+	struct yt921x_port *pp = to_yt921x_port(led);
+	struct yt921x_priv *priv = to_yt921x_priv(pp);
+	int res;
+
+	mutex_lock(&priv->reg_lock);
+	res = yt921x_led_trigger_set(priv, pp->index, led->group, flags);
+	mutex_unlock(&priv->reg_lock);
+
+	return res;
+}
+
+static int
+yt921x_led_setup(struct yt921x_priv *priv, int port,
+		 struct fwnode_handle *fwnode, u32 *invp)
+{
+	struct yt921x_port *pp = priv->ports[port];
+	struct device *dev = to_device(priv);
+	struct led_init_data init_data;
+	struct led_classdev *led_cdev;
+	char name[LED_MAX_NAME_SIZE];
+	enum led_default_state state;
+	struct yt921x_led *led;
+	u32 group;
+	bool on;
+	int res;
+
+	if (!pp)
+		return -ENODEV;
+	if (port == YT921X_PORT_MCU) {
+		dev_err(dev, "No LEDs for port %d\n", port);
+		return -ENODEV;
+	}
+
+	res = fwnode_property_read_u32(fwnode, "reg", &group);
+	if (res)
+		return res;
+	if (group >= YT921X_LED_GROUP_NUM) {
+		dev_err(dev, "Invalid LED reg %u for port %d\n", group, port);
+		return -EINVAL;
+	}
+
+	led = &pp->leds[group];
+	led_cdev = &led->cdev;
+	state = led_init_default_state_get(fwnode);
+	switch (state) {
+	case LEDS_DEFSTATE_OFF:
+	case LEDS_DEFSTATE_ON:
+		on = state != LEDS_DEFSTATE_OFF;
+		res = yt921x_led_force_set(priv, port, group, on);
+		break;
+	case LEDS_DEFSTATE_KEEP:
+		res = yt921x_led_force_get(priv, port, group, &on);
+		break;
+	}
+	if (res)
+		return res;
+	led_cdev->brightness = on;
+	led_cdev->max_brightness = 1;
+	led_cdev->flags = LED_RETAIN_AT_SHUTDOWN;
+	led_cdev->brightness_set_blocking = yt921x_cled_brightness_set_blocking;
+	led_cdev->blink_set = yt921x_cled_blink_set;
+#ifdef CONFIG_LEDS_TRIGGERS
+	led_cdev->hw_control_trigger = "netdev";
+	led_cdev->hw_control_get_device = yt921x_cled_hw_control_get_device;
+	led_cdev->hw_control_is_supported = yt921x_cled_hw_control_is_supported;
+	led_cdev->hw_control_get = yt921x_cled_hw_control_get;
+	led_cdev->hw_control_set = yt921x_cled_hw_control_set;
+#endif
+
+	snprintf(name, sizeof(name), YT921X_NAME "-%u:%02d:%02u",
+		 priv->ds.index, port, group);
+	init_data = (typeof(init_data)){
+		.fwnode = fwnode,
+		.devicename = name,
+		.devname_mandatory = true,
+	};
+	res = devm_led_classdev_register_ext(dev, led_cdev, &init_data);
+	if (res)
+		return res;
+
+	if (fwnode_property_read_bool(fwnode, "active-high"))
+		*invp |= YT921X_LED_PAR_INV_INVnm(group, port);
+
+	return 0;
+}
+
+int yt921x_leds_setup(struct yt921x_priv *priv)
+{
+	struct dsa_switch *ds = &priv->ds;
+	struct dsa_port *dp;
+	u32 ctrl;
+	u32 mask;
+	u32 inv;
+	int res;
+
+	mask = YT921X_LED_CTRL_MODE_M | YT921X_LED_CTRL_PORT_NUM_M |
+	       YT921X_LED_CTRL_EN;
+	ctrl = YT921X_LED_CTRL_MODE_PARALLEL |
+	       YT921X_LED_CTRL_PORT_NUM(YT921X_PORT_NUM - 1) |
+	       YT921X_LED_CTRL_EN;
+	res = yt921x_reg_update_bits(priv, YT921X_LED_CTRL, mask, ctrl);
+	if (res)
+		return res;
+
+	for (int port = 0; port < YT921X_PORT_NUM; port++) {
+		struct yt921x_port *pp = priv->ports[port];
+
+		if (!pp)
+			continue;
+
+		for (int group = 0; group < YT921X_LED_GROUP_NUM; group++)
+			pp->leds[group].group = group;
+	}
+
+	inv = 0;
+	dsa_switch_for_each_port(dp, ds) {
+		struct device_node *leds_np;
+
+		if (!dp->dn)
+			continue;
+
+		leds_np = of_get_child_by_name(dp->dn, "leds");
+		if (!leds_np)
+			continue;
+
+		for_each_child_of_node_scoped(leds_np, led_np) {
+			res = yt921x_led_setup(priv, dp->index,
+					       of_fwnode_handle(led_np), &inv);
+			if (res)
+				break;
+		}
+
+		of_node_put(leds_np);
+		if (res)
+			return res;
+	}
+
+	/* Inversion is internal - FORCEn_HIGH will give low logic.
+	 * In the rest of the file, treat LEDs as if active-low.
+	 */
+	res = yt921x_reg_write(priv, YT921X_LED_PAR_INV, inv);
+	if (res)
+		return res;
+
+	return 0;
+}
diff --git a/drivers/net/dsa/motorcomm/leds.h b/drivers/net/dsa/motorcomm/leds.h
new file mode 100644
index 000000000000..4e5caa0b2938
--- /dev/null
+++ b/drivers/net/dsa/motorcomm/leds.h
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2026 David Yang
+ */
+
+#ifndef _YT_LEDS_H
+#define _YT_LEDS_H
+
+#include <linux/bitfield.h>
+#include <linux/bits.h>
+#include <linux/kconfig.h>
+
+#define YT921X_LED_CTRL			0xd0000
+#define  YT921X_LED_CTRL_EN			BIT(21)
+#define  YT921X_LED_CTRL_LOOPDETECT_BLINK_M	GENMASK(20, 19)	/* cycle = 512 * x ms */
+#define   YT921X_LED_CTRL_LOOPDETECT_BLINK(x)		FIELD_PREP(YT921X_LED_CTRL_LOOPDETECT_BLINK_M, (x))
+#define  YT921X_LED_CTRL_PORT_NUM_M		GENMASK(16, 13)
+#define   YT921X_LED_CTRL_PORT_NUM(x)			FIELD_PREP(YT921X_LED_CTRL_PORT_NUM_M, (x))
+#define  YT921X_LED_CTRL_MODE_M			GENMASK(1, 0)
+#define   YT921X_LED_CTRL_MODE(x)			FIELD_PREP(YT921X_LED_CTRL_MODE_M, (x))
+#define   YT921X_LED_CTRL_MODE_PARALLEL			YT921X_LED_CTRL_MODE(0)
+#define   YT921X_LED_CTRL_MODE_SERIAL			YT921X_LED_CTRL_MODE(2)
+#define YT921X_LED0_PORTn(port)		(0xd0004 + 4 * (port))
+#define  YT921X_LED0_PORT_ACT_M			GENMASK(17, 0)
+#define  YT921X_LED0_PORT_ACT_LINK_TRY_DIS	BIT(17)
+#define  YT921X_LED0_PORT_ACT_COLLISION_BLINK_INDI	BIT(16)
+#define YT921X_LED1_PORTn(port)		(0xd0040 + 4 * (port))
+#define  YT921X_LED1_PORT_OTHER_BLINK_M		GENMASK(31, 30)	/* cycle = 512 >> x ms */
+#define   YT921X_LED1_PORT_OTHER_BLINK(x)		FIELD_PREP(YT921X_LED1_PORT_OTHER_BLINK_M, (x))
+#define  YT921X_LED1_PORT_EEE_BLINK_M		GENMASK(29, 28)	/* cycle = 512 >> x ms */
+#define   YT921X_LED1_PORT_EEE_BLINK(x)			FIELD_PREP(YT921X_LED1_PORT_EEE_BLINK_M, (x))
+#define  YT921X_LED1_PORT_BLINK_DUTY_COMP	BIT(27)
+#define  YT921X_LED1_PORT_BLINK_DUTY_M		GENMASK(26, 25)
+#define   YT921X_LED1_PORT_BLINK_DUTY(x)		FIELD_PREP(YT921X_LED1_PORT_BLINK_DUTY_M, (x))
+#define   YT921X_LED1_PORT_BLINK_DUTY_1_2		YT921X_LED1_PORT_BLINK_DUTY(0)
+#define   YT921X_LED1_PORT_BLINK_DUTY_2_3		YT921X_LED1_PORT_BLINK_DUTY(1)
+#define   YT921X_LED1_PORT_BLINK_DUTY_3_4		YT921X_LED1_PORT_BLINK_DUTY(2)
+#define   YT921X_LED1_PORT_BLINK_DUTY_5_6		YT921X_LED1_PORT_BLINK_DUTY(3)
+#define YT921X_LED2_PORTn(port)		(0xd0080 + 4 * (port))
+#define  YT921X_LED2_PORT_FORCEn_M(grp)		GENMASK(4 * (grp) + 19, 4 * (grp) + 18)
+#define   YT921X_LED2_PORT_FORCEn(grp, x)		((x) << (4 * (grp) + 18))
+#define   YT921X_LED2_PORT_FORCEn_DONTCARE(grp)		YT921X_LED2_PORT_FORCEn(grp, 0)
+#define   YT921X_LED2_PORT_FORCEn_BLINK(grp)		YT921X_LED2_PORT_FORCEn(grp, 1)
+#define   YT921X_LED2_PORT_FORCEn_ON(grp)		YT921X_LED2_PORT_FORCEn(grp, 2)
+#define   YT921X_LED2_PORT_FORCEn_OFF(grp)		YT921X_LED2_PORT_FORCEn(grp, 3)
+#define  YT921X_LED2_PORT_FORCE_BLINKn_M(grp)	GENMASK(4 * (grp) + 17, 4 * (grp) + 16)	/* cycle = 512 << x ms */
+#define   YT921X_LED2_PORT_FORCE_BLINKn(grp, x)		((x) << (4 * (grp) + 16))
+#define   YT921X_LED2_PORT_FORCE_BLINKn_OTHER(grp)	YT921X_LED2_PORT_FORCE_BLINKn(grp, 3)
+#define  YT921X_LEDx_PORT_ACT_M			GENMASK(16, 0)
+#define  YT921X_LEDx_PORT_ACT_EEE_BLINK		BIT(15)
+#define  YT921X_LEDx_PORT_ACT_LOOPDETECT_BLINK	BIT(14)
+#define  YT921X_LEDx_PORT_ACT_ACTIVE_BLINK	BIT(13)
+#define  YT921X_LEDx_PORT_ACT_DUPLEX_FULL	BIT(12)
+#define  YT921X_LEDx_PORT_ACT_DUPLEX_HALF	BIT(11)
+#define  YT921X_LEDx_PORT_ACT_TX_BLINK		BIT(10)
+#define  YT921X_LEDx_PORT_ACT_RX_BLINK		BIT(9)
+#define  YT921X_LEDx_PORT_ACT_TX		BIT(8)
+#define  YT921X_LEDx_PORT_ACT_RX		BIT(7)
+#define  YT921X_LEDx_PORT_ACT_1000M		BIT(6)
+#define  YT921X_LEDx_PORT_ACT_100M		BIT(5)
+#define  YT921X_LEDx_PORT_ACT_10M		BIT(4)
+#define  YT921X_LEDx_PORT_ACT_COLLISION_BLINK	BIT(3)
+#define  YT921X_LEDx_PORT_ACT_1000M_BLINK	BIT(2)
+#define  YT921X_LEDx_PORT_ACT_100M_BLINK	BIT(1)
+#define  YT921X_LEDx_PORT_ACT_10M_BLINK		BIT(0)
+#define YT921X_LED_SER_CTRL		0xd0100
+#define  YT921X_LED_SER_CTRL_EN			GENMASK(25, 24)
+#define  YT921X_LED_SER_CTRL_ACTIVE_LOW		BIT(4)
+#define  YT921X_LED_SER_CTRL_LED_NUM_M		GENMASK(1, 0)	/* #led - 1 */
+#define   YT921X_LED_SER_CTRL_LED_NUM(x)		FIELD_PREP(YT921X_LED_SER_CTRL_LED_NUM_M, (x))
+#define YT921X_LED_SER_MAPnm(grp, port)	(0xd0104 + 8 * (2 - (grp)) + 4 * ((port) / 5))
+#define  YT921X_LED_SER_MAP_DSTn_PORT_M(port)	GENMASK(6 * ((port) % 5) + 5, 6 * ((port) % 5) + 2)
+#define   YT921X_LED_SER_MAP_DSTn_PORT(port, x)		((x) << (6 * ((port) % 5) + 2))
+#define  YT921X_LED_SER_MAP_DSTn_LED_M(port)	GENMASK(6 * ((port) % 5) + 1, 6 * ((port) % 5))
+#define   YT921X_LED_SER_MAP_DSTn_LED(port, x)		((x) << (6 * ((port) % 5)))
+#define YT921X_LED_PAR_PORTS		0xd01c4
+#define YT921X_LED_PAR_INV		0xd01c8
+#define  YT921X_LED_PAR_INV_INVnm(grp, port)	BIT(10 * (grp) + (port))
+#define YT921X_LED_PAR_MAPn(port)	(0xd01d0 + 4 * (port))
+#define  YT921X_LED_PAR_MAP_DSTn_PORT_M(grp)	GENMASK(6 * (grp) + 5, 6 * (grp) + 2)
+#define   YT921X_LED_PAR_MAP_DSTn_PORT(grp, x)		((x) << (6 * (grp) + 2))
+#define  YT921X_LED_PAR_MAP_DSTn_LED_M(grp)	GENMASK(6 * (grp) + 1, 6 * (grp))
+#define   YT921X_LED_PAR_MAP_DSTn_LED(grp, x)		((x) << (6 * (grp)))
+
+#define YT921X_LED_BLINK_MIN	64
+#define YT921X_LED_BLINK_DEF	512
+#define YT921X_LED_BLINK_MAX	2048
+
+/* 2 * lcm(2, 3, 4, 6) */
+#define YT921X_LED_DUTY_DENOM		24
+#define YT921X_LED_DUTY(nom, denom)	(YT921X_LED_DUTY_DENOM * (nom) / (denom))
+
+struct yt921x_priv;
+
+#if IS_ENABLED(CONFIG_NET_DSA_YT921X_LEDS)
+
+int yt921x_leds_setup(struct yt921x_priv *priv);
+
+#else
+
+static inline int yt921x_leds_setup(struct yt921x_priv *priv)
+{
+	return 0;
+}
+
+#endif
+
+#endif
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH net v2] net/sched: cake: reject overhead values that underflow length
From: Samuel Moelius @ 2026-06-29 18:44 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Toke Høiland-Jørgensen, Jamal Hadi Salim, Jiri Pirko,
	David S. Miller, Eric Dumazet, Paolo Abeni, Simon Horman,
	moderated list:CAKE QDISC, open list:TC subsystem, open list
In-Reply-To: <CAE+C+DZpYpJDJOaCK48KBpayXSYorFRSvTE903RBqyPyYkWcCg@mail.gmail.com>

On Mon, Jun 29, 2026 at 1:24 PM Samuel Moelius
<sam.moelius@trailofbits.com> wrote:
>
> On Sat, Jun 13, 2026 at 5:26 PM Jakub Kicinski <kuba@kernel.org> wrote:
> >
> > On Tue,  9 Jun 2026 23:29:36 +0000 Samuel Moelius wrote:
> > > +static const struct netlink_range_validation_signed cake_overhead_range = {
> > > +     .min = -64,
> > > +     .max = 256,
> >
> > Both Sashiko's complain - these values are neither safe nor sufficient.
> >
> > How was the -64 chosen? It looks suspiciously close the min ethernet
> > frame length.
>
> That's how it was chosen. But as you've shown, it was not a good choice.
>
> I would like to submit a revised patch that does the check on the
> datapath, as you suggested. Should I send that patch? Or would you
> prefer to wait for a response from Toke?

Apologies. I just noticed Toke did reply. Should I send a patch
consistent with what he suggested?

^ permalink raw reply

* [PATCH net-next v4 00/15] bnxt_en: Add kTLS TX offload support
From: Michael Chan @ 2026-06-29 18:49 UTC (permalink / raw)
  To: davem
  Cc: netdev, edumazet, kuba, pabeni, andrew+netdev, pavan.chebbi,
	andrew.gospodarek

This patchset adds kTLS offload support for the TX direction.  A number
of new files are added:

bnxt_mpc.[ch] handle midpath channels (MPCs) used to offload kTLS
connections to the chip's crypto blocks without going through FW.

bnxt_crypto.[ch] handle the crypto interface and resources.

bnxt_ktls.[ch] handle kTLS offload.

A new CONFIG_BNXT_TLS is added to enable all of the above.  The first 6
patches add the MPC logic including resource accounting and reservations.
The next 5 patches add the crypto logic to handle the crypto resources
and to send/receive control data using the MPCs.  The last 4 patches
add kTLS offload for the TX direction.

There will be a follow-on patchset to make the TX offload more complete
and to add the RX direction offload.

v4:
Fix kerneldoc prototype warning and uninitialized variable warnings reported
by Jakub.

Fix most valid Sashiko reported issues.

v3:
https://lore.kernel.org/netdev/20260614072407.2761092-1-michael.chan@broadcom.com/

Fix most AI reported issues from Jakub.

v2:
https://lore.kernel.org/netdev/20260512212105.3488258-1-michael.chan@broadcom.com/

Fix unused variable compile warnings in patches 10 and 12 by reorganizing
the patches (reported by Jakub)

Fix some error recovery issues in patch 12

v1:
https://lore.kernel.org/netdev/20260504235836.3019499-1-michael.chan@broadcom.com/

Michael Chan (15):
  bnxt_en: Add Midpath channel information
  bnxt_en: Account for the MPC TX and CP rings
  bnxt_en: Set default MPC ring count
  bnxt_en: Rename xdp_tx_lock to tx_lock
  bnxt_en: Allocate and free MPC software structures
  bnxt_en: Allocate and free MPC channels from firmware
  bnxt_en: Allocate crypto structure and backing store
  bnxt_en: Reserve crypto RX and TX key contexts on a PF
  bnxt_en: Add infrastructure for crypto key context IDs
  bnxt_en: Add MPC transmit and completion functions
  bnxt_en: Add crypto MPC transmit/completion infrastructure
  bnxt_en: Support kTLS TX offload by implementing .tls_dev_add/del()
  bnxt_en: Implement kTLS TX normal path
  bnxt_en: Add support for inline transmit BDs
  bnxt_en: Add kTLS retransmission support

 drivers/net/ethernet/broadcom/Kconfig         |   9 +
 drivers/net/ethernet/broadcom/bnxt/Makefile   |   1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     | 255 +++++-
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     |  91 ++-
 .../net/ethernet/broadcom/bnxt/bnxt_crypto.c  | 613 +++++++++++++++
 .../net/ethernet/broadcom/bnxt/bnxt_crypto.h  | 229 ++++++
 .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c |  53 ++
 drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c |   2 +-
 .../net/ethernet/broadcom/bnxt/bnxt_ktls.c    | 572 ++++++++++++++
 .../net/ethernet/broadcom/bnxt/bnxt_ktls.h    | 175 +++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.c | 732 ++++++++++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.h | 210 +++++
 .../net/ethernet/broadcom/bnxt/bnxt_sriov.c   |   6 +-
 drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c |   4 +-
 include/linux/bnxt/hsi.h                      |  37 +
 15 files changed, 2935 insertions(+), 54 deletions(-)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_crypto.c
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_crypto.h
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ktls.c
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_ktls.h
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.c
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.h

-- 
2.51.0


^ permalink raw reply

* [PATCH net-next v4 01/15] bnxt_en: Add Midpath channel information
From: Michael Chan @ 2026-06-29 18:49 UTC (permalink / raw)
  To: davem
  Cc: netdev, edumazet, kuba, pabeni, andrew+netdev, pavan.chebbi,
	andrew.gospodarek, Ajit Khaparde, Kalesh AP
In-Reply-To: <20260629184921.3496727-1-michael.chan@broadcom.com>

Midpath channels (MPCs) are rings for hardware control paths.  These
control paths are used to offload kTLS directly to the hardware
without going through firmware.  This patch adds the basic information
structures for these MPCs.

An MPC is basically a TX and completion ring pair with a HW TLS block
as the destination.  Two MPC channel types are used to offload
connections to the TX crypto engine (TCE) and the RX crypto
engine (RCE) respectively.  In the driver, we re-use the
bnxt_tx_ring_info and bnxt_cp_ring_info control structs for the MPCs.

This patch also adds the CONFIG_BNXT_TLS Kconfig option to conditionally
include the MPC logic.  The first few patches in the series add the MPC
support.  kTLS support will be added later in the series.

Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
Reviewed-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
---
 drivers/net/ethernet/broadcom/Kconfig         |  9 ++++
 drivers/net/ethernet/broadcom/bnxt/Makefile   |  1 +
 drivers/net/ethernet/broadcom/bnxt/bnxt.c     |  8 ++++
 drivers/net/ethernet/broadcom/bnxt/bnxt.h     |  2 +
 drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.c | 26 ++++++++++
 drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.h | 47 +++++++++++++++++++
 6 files changed, 93 insertions(+)
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.c
 create mode 100644 drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.h

diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
index f0bac0dd1439..b33b66f038b8 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -255,6 +255,15 @@ config BNXT_HWMON
 	  Say Y if you want to expose the thermal sensor data on NetXtreme-C/E
 	  devices, via the hwmon sysfs interface.
 
+config BNXT_TLS
+	bool "Broadcom NetXtreme-C/E TLS offload support"
+	default y
+	depends on BNXT && TLS_DEVICE
+	depends on TLS=y || BNXT=m
+	help
+	  Say Y if you want to enable Transport Layer Security (TLS) hardware
+	  encryption and decryption offload on supported NetXtreme-C/E devices.
+
 config BNGE
 	tristate "Broadcom ThorUltra Ethernet device support"
 	depends on PCI
diff --git a/drivers/net/ethernet/broadcom/bnxt/Makefile b/drivers/net/ethernet/broadcom/bnxt/Makefile
index debef78c8b6d..0506574c007a 100644
--- a/drivers/net/ethernet/broadcom/bnxt/Makefile
+++ b/drivers/net/ethernet/broadcom/bnxt/Makefile
@@ -5,3 +5,4 @@ bnxt_en-y := bnxt.o bnxt_hwrm.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.
 bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o
 bnxt_en-$(CONFIG_DEBUG_FS) += bnxt_debugfs.o
 bnxt_en-$(CONFIG_BNXT_HWMON) += bnxt_hwmon.o
+bnxt_en-$(CONFIG_BNXT_TLS) += bnxt_mpc.o
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 7513618793da..8faab85d66d1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -76,6 +76,7 @@
 #include "bnxt_hwmon.h"
 #include "bnxt_gso.h"
 #include <net/tso.h>
+#include "bnxt_mpc.h"
 
 #define BNXT_TX_TIMEOUT		(5 * HZ)
 #define BNXT_DEF_MSG_ENABLE	(NETIF_MSG_DRV | NETIF_MSG_HW | \
@@ -9943,6 +9944,11 @@ static int __bnxt_hwrm_func_qcaps(struct bnxt *bp)
 	}
 	bp->tso_max_segs = le16_to_cpu(resp->max_tso_segs);
 
+	if (resp->mpc_chnls_cap)
+		bnxt_alloc_mpc_info(bp, resp->mpc_chnls_cap);
+	else
+		bnxt_free_mpc_info(bp);
+
 hwrm_func_qcaps_exit:
 	hwrm_req_drop(bp, req);
 	return rc;
@@ -16547,6 +16553,7 @@ static void bnxt_remove_one(struct pci_dev *pdev)
 	bp->ptp_cfg = NULL;
 	kfree(bp->fw_health);
 	bp->fw_health = NULL;
+	bnxt_free_mpc_info(bp);
 	bnxt_cleanup_pci(bp);
 	bnxt_free_ctx_mem(bp, true);
 	bnxt_free_crash_dump_mem(bp);
@@ -17218,6 +17225,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	bnxt_ethtool_free(bp);
 	kfree(bp->fw_health);
 	bp->fw_health = NULL;
+	bnxt_free_mpc_info(bp);
 	bnxt_cleanup_pci(bp);
 	bnxt_free_ctx_mem(bp, true);
 	bnxt_free_crash_dump_mem(bp);
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 6335dfc14c98..77b1748d12d1 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -2452,6 +2452,8 @@ struct bnxt {
 
 	u8			tph_mode;
 
+	struct bnxt_mpc_info	*mpc_info;
+
 	unsigned int		current_interval;
 #define BNXT_TIMER_INTERVAL	HZ
 
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.c
new file mode 100644
index 000000000000..86087e538550
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.c
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Broadcom Inc. */
+
+#include <linux/stddef.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bnxt/hsi.h>
+
+#include "bnxt.h"
+#include "bnxt_mpc.h"
+
+void bnxt_alloc_mpc_info(struct bnxt *bp, u8 mpc_chnls_cap)
+{
+	if (!bp->mpc_info)
+		bp->mpc_info = kzalloc_obj(*bp->mpc_info);
+	if (bp->mpc_info)
+		bp->mpc_info->mpc_chnls_cap = mpc_chnls_cap;
+	else
+		netdev_warn(bp->dev, "Unable to allocate MPC info\n");
+}
+
+void bnxt_free_mpc_info(struct bnxt *bp)
+{
+	kfree(bp->mpc_info);
+	bp->mpc_info = NULL;
+}
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.h
new file mode 100644
index 000000000000..cd3f268a3a29
--- /dev/null
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_mpc.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/* Copyright (c) 2026 Broadcom Inc. */
+
+#ifndef BNXT_MPC_H
+#define BNXT_MPC_H
+
+/* Mid path channel (MPC) definitions.  An MPC is a special TX/completion
+ * ring pair to send/receive control plane data to the TCE and RCE
+ * (Transmit/Receive Crypto Engine) HW blocks.
+ */
+
+enum bnxt_mpc_type {
+	BNXT_MPC_TCE_TYPE = RING_ALLOC_REQ_MPC_CHNLS_TYPE_TCE,
+	BNXT_MPC_RCE_TYPE = RING_ALLOC_REQ_MPC_CHNLS_TYPE_RCE,
+	BNXT_MPC_TYPE_MAX,
+};
+
+#define BNXT_MAX_MPC		8
+
+struct bnxt_mpc_info {
+	u8			mpc_chnls_cap;
+	u8			mpc_cp_rings;
+	u8			mpc_ring_count[BNXT_MPC_TYPE_MAX];
+	struct bnxt_tx_ring_info *mpc_rings[BNXT_MPC_TYPE_MAX];
+};
+
+#define BNXT_MPC_CRYPTO_CAP    \
+	(FUNC_QCAPS_RESP_MPC_CHNLS_CAP_TCE | FUNC_QCAPS_RESP_MPC_CHNLS_CAP_RCE)
+
+#define BNXT_MPC_CRYPTO_CAPABLE(bp)					\
+	((bp)->mpc_info ?						\
+	 ((bp)->mpc_info->mpc_chnls_cap & BNXT_MPC_CRYPTO_CAP) ==	\
+	  BNXT_MPC_CRYPTO_CAP : false)
+
+#ifdef CONFIG_BNXT_TLS
+void bnxt_alloc_mpc_info(struct bnxt *bp, u8 mpc_chnls_cap);
+void bnxt_free_mpc_info(struct bnxt *bp);
+#else
+static inline void bnxt_alloc_mpc_info(struct bnxt *bp, u8 mpc_chnls_cap)
+{
+}
+
+static inline void bnxt_free_mpc_info(struct bnxt *bp)
+{
+}
+#endif	/* CONFIG_BNXT_TLS */
+#endif	/* BNXT_MPC_H */
-- 
2.51.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox