* [PATCH net-next v2 7/8] net: dsa: mt7530: implement port_fast_age
From: Daniel Golle @ 2026-06-13 1:11 UTC (permalink / raw)
To: Chester A. Unal, Daniel Golle, Andrew Lunn, Vladimir Oltean,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Matthias Brugger, AngeloGioacchino Del Regno, Russell King,
netdev, linux-kernel, linux-arm-kernel, linux-mediatek
In-Reply-To: <cover.1781312667.git.daniel@makrotopia.org>
Implement the .port_fast_age DSA operation by flushing all non-static
(dynamically learned) MAC address entries from the address table.
The switch does not offer a combined "non-static AND per-port" match
mode, so flush all dynamic entries globally. This is consistent with
what other DSA drivers do (b53, realtek) and relearning is fast.
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
---
v2: no changes
drivers/net/dsa/mt7530.c | 16 ++++++++++++++++
drivers/net/dsa/mt7530.h | 1 +
2 files changed, 17 insertions(+)
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index dcf72ab0cd66..c96420c291d5 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -193,6 +193,21 @@ mt7530_fdb_cmd(struct mt7530_priv *priv, enum mt7530_fdb_cmd cmd, u32 *rsp)
return 0;
}
+static void mt7530_port_fast_age(struct dsa_switch *ds, int port)
+{
+ struct mt7530_priv *priv = ds->priv;
+ struct mt7530_dummy_poll p;
+ u32 val;
+
+ /* Flush all non-static MAC address entries */
+ val = ATC_BUSY | ATC_MAT_NON_STATIC_MAC | MT7530_FDB_FLUSH;
+ regmap_write(priv->regmap, MT7530_ATC, val);
+
+ INIT_MT7530_DUMMY_POLL(&p, priv, MT7530_ATC);
+ readx_poll_timeout(mt7530_mii_poll, &p, val,
+ !(val & ATC_BUSY), 20, 20000);
+}
+
static void
mt7530_fdb_read(struct mt7530_priv *priv, struct mt7530_fdb *fdb)
{
@@ -3319,6 +3334,7 @@ static const struct dsa_switch_ops mt7530_switch_ops = {
.port_bridge_flags = mt7530_port_bridge_flags,
.port_bridge_join = mt7530_port_bridge_join,
.port_bridge_leave = mt7530_port_bridge_leave,
+ .port_fast_age = mt7530_port_fast_age,
.port_fdb_add = mt7530_port_fdb_add,
.port_fdb_del = mt7530_port_fdb_del,
.port_fdb_dump = mt7530_port_fdb_dump,
diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h
index abf19aa69520..decad7a93dbd 100644
--- a/drivers/net/dsa/mt7530.h
+++ b/drivers/net/dsa/mt7530.h
@@ -165,6 +165,7 @@ enum mt753x_to_cpu_fw {
#define ATC_MAT_MASK GENMASK(11, 8)
#define ATC_MAT(x) FIELD_PREP(ATC_MAT_MASK, x)
#define ATC_MAT_MACTAB ATC_MAT(0)
+#define ATC_MAT_NON_STATIC_MAC ATC_MAT(4)
enum mt7530_fdb_cmd {
MT7530_FDB_READ = 0,
--
2.54.0
^ permalink raw reply related
* [PATCH net-next v2 8/8] net: dsa: mt7530: implement port_change_conduit op
From: Daniel Golle @ 2026-06-13 1:11 UTC (permalink / raw)
To: Chester A. Unal, Daniel Golle, Andrew Lunn, Vladimir Oltean,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Matthias Brugger, AngeloGioacchino Del Regno, Russell King,
netdev, linux-kernel, linux-arm-kernel, linux-mediatek
In-Reply-To: <cover.1781312667.git.daniel@makrotopia.org>
Allow changing the CPU port affinity of user ports at runtime via the
IFLA_DSA_CONDUIT netlink attribute. This updates the port matrix to
forward to the new CPU port instead of the old one.
Limit the operation to MT7531. There, trapped link-local frames follow
the per-port affinity, as the MT7531_CPU_PMAP destination mask is
further restricted by the port matrix. A conduit change is hence fully
honoured by the hardware, for regular traffic as well as for trapped
frames.
The MT7530 switch, including the variant embedded in the MT7621 SoC,
instead traps frames to the single CPU port set in the CPU_PORT field
of the MFC register, regardless of the affinity of the inbound user
port. With user ports affine to different CPU ports there is no
correct value for that field, so per-port CPU affinity cannot be fully
implemented for trapped frames. Routing a WAN port via the second SoC
GMAC is conventionally covered by the PHY muxing feature on these
switches, which bypasses the switch fabric and does not involve a CPU
port at all.
The switches on the MT7988, EN7581 and AN7583 SoCs only have a
single CPU port, leaving no other conduit to change to.
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
---
v2: extend commit message
drivers/net/dsa/mt7530.c | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index c96420c291d5..2f3e734b9f53 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -3206,6 +3206,34 @@ static int mt753x_set_mac_eee(struct dsa_switch *ds, int port,
return 0;
}
+static int
+mt753x_port_change_conduit(struct dsa_switch *ds, int port,
+ struct net_device *conduit,
+ struct netlink_ext_ack *extack)
+{
+ struct dsa_port *new_cpu_dp = conduit->dsa_ptr;
+ struct dsa_port *dp = dsa_to_port(ds, port);
+ struct mt7530_priv *priv = ds->priv;
+
+ if (priv->id != ID_MT7531)
+ return -EOPNOTSUPP;
+
+ mutex_lock(&priv->reg_mutex);
+
+ /* dp->cpu_dp still points to the old CPU port */
+ priv->ports[port].pm &= ~PCR_MATRIX(BIT(dp->cpu_dp->index));
+ priv->ports[port].pm |= PCR_MATRIX(BIT(new_cpu_dp->index));
+ if (priv->ports[port].enable)
+ regmap_update_bits(priv->regmap, MT7530_PCR_P(port),
+ PCR_MATRIX_MASK, priv->ports[port].pm);
+
+ mutex_unlock(&priv->reg_mutex);
+
+ mt7530_port_fast_age(ds, port);
+
+ return 0;
+}
+
static void
mt753x_conduit_state_change(struct dsa_switch *ds,
const struct net_device *conduit,
@@ -3317,6 +3345,7 @@ static const struct dsa_switch_ops mt7530_switch_ops = {
.setup = mt753x_setup,
.teardown = mt753x_teardown,
.preferred_default_local_cpu_port = mt753x_preferred_default_local_cpu_port,
+ .port_change_conduit = mt753x_port_change_conduit,
.get_strings = mt7530_get_strings,
.get_ethtool_stats = mt7530_get_ethtool_stats,
.get_sset_count = mt7530_get_sset_count,
--
2.54.0
^ permalink raw reply related
* Re: [PATCH net] tcp: ipv6: clamp default adverting MSS to avoid GSO_BY_FRAGS (0xFFFF)
From: Kuniyuki Iwashima @ 2026-06-13 1:16 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
Ido Schimmel, David Ahern, Neal Cardwell, netdev, eric.dumazet,
syzbot+ebdb22d461c904fc3cb2, Marcelo Ricardo Leitner, Xin Long
In-Reply-To: <20260612162517.83394-1-edumazet@google.com>
On Fri, Jun 12, 2026 at 9:25 AM Eric Dumazet <edumazet@google.com> wrote:
>
> When MTU is large, ip6_default_advmss() can return IPV6_MAXPLEN (65535).
> This is interpreted by TCP as mss_clamp, allowing the MSS to reach 65535.
>
> However, 0xFFFF is also used as a magic value GSO_BY_FRAGS in the kernel.
> If a TCP packet with gso_size=0xFFFF is passed to skb_segment(), it will
> be mistakenly treated as GSO_BY_FRAGS, leading to a NULL pointer
> dereference because local TCP packets do not use frag_list.
>
> Fix this by returning min(IPV6_MAXPLEN, GSO_BY_FRAGS - 1) (65534) from
> ip6_default_advmss() when MTU is large.
>
> Also update the stale comment in ip6_default_advmss() which suggested
> that IPV6_MAXPLEN is returned to mean "any MSS".
>
> Fixes: 3953c46c3ac7 ("sk_buff: allow segmenting based on frag sizes")
> Reported-by: syzbot+ebdb22d461c904fc3cb2@syzkaller.appspotmail.com
> Closes: https://lore.kernel.org/netdev/6a2c3193.8812e0fc.3c3fa4.0001.GAE@google.com/T/#u
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Interesting !
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
^ permalink raw reply
* Re: [PATCH net-next v2 00/10] net: dsa: microchip: remove unnecessary dsa_switch_ops callbacks
From: patchwork-bot+netdevbpf @ 2026-06-13 1:20 UTC (permalink / raw)
To: Bastien Curutchet
Cc: woojung.huh, UNGLinuxDriver, andrew, olteanv, davem, edumazet,
kuba, pabeni, linux, pascal.eberhard, miquel.raynal,
thomas.petazzoni, tristram.ha, netdev, linux-kernel,
vladimir.oltean
In-Reply-To: <20260608-clean-ksz-3rd-v2-0-6e61b7be23c4@bootlin.com>
Hello:
This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Mon, 08 Jun 2026 16:10:03 +0200 you wrote:
> Hi all,
>
> This series continues the rework of the KSZ driver initiated by two previous
> series (see [1] & [2]).
>
> The KSZ driver handles more than 20 switches split in several families.
> This was previously handled through a common set of dsa_switch_ops
> operations that used device-specific ksz_dev_ops callbacks. The two
> previous series have split this common struct dsa_switch_ops into 5
> to connect the ksz_dev_ops's implementations directly to the new
> dsa_switch ops.
>
> [...]
Here is the summary with links:
- [net-next,v2,01/10] net: dsa: microchip: remove useless common cls_flower_{add/del} operations
https://git.kernel.org/netdev/net-next/c/e6759c4acc8c
- [net-next,v2,02/10] net: dsa: microchip: remove VLAN operations for ksz8463
https://git.kernel.org/netdev/net-next/c/b54a8087c43c
- [net-next,v2,03/10] net: dsa: microchip: implement get_phy_flags only if needed
https://git.kernel.org/netdev/net-next/c/4d574a5cfa48
- [net-next,v2,04/10] net: dsa: microchip: wrap the MAC configuration checks in a function
https://git.kernel.org/netdev/net-next/c/c90e80103ba5
- [net-next,v2,05/10] net: dsa: microchip: remove setup_rgmii_delay() KSZ operation
https://git.kernel.org/netdev/net-next/c/b97d51f4501c
- [net-next,v2,06/10] net: dsa: microchip: implement .support_eee() only if needed
https://git.kernel.org/netdev/net-next/c/d654b3241436
- [net-next,v2,07/10] net: dsa: microchip: implement .{get/set}_wol only if needed
https://git.kernel.org/netdev/net-next/c/75ad8c1bc79f
- [net-next,v2,08/10] net: dsa: microchip: implement port_hsr_join for KSZ9477 only
https://git.kernel.org/netdev/net-next/c/e33c16843686
- [net-next,v2,09/10] net: dsa: microchip: implement lan937x-specific MDIO registration
https://git.kernel.org/netdev/net-next/c/03d10c776802
- [net-next,v2,10/10] net: dsa: microchip: implement port_teardown only if needed
https://git.kernel.org/netdev/net-next/c/af472a40b276
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH bpf v5 0/2] bpf: Fix generic devmap egress skb sharing
From: patchwork-bot+netdevbpf @ 2026-06-13 1:30 UTC (permalink / raw)
To: Sun Jian
Cc: bpf, netdev, linux-kernel, linux-kselftest, ast, daniel, andrii,
martin.lau, davem, kuba, hawk, john.fastabend, sdf, shuah,
jiayuan.chen, toke, menglong.dong, emil
In-Reply-To: <20260612114032.244616-1-sun.jian.kdev@gmail.com>
Hello:
This series was applied to bpf/bpf-next.git (master)
by Alexei Starovoitov <ast@kernel.org>:
On Fri, 12 Jun 2026 19:40:30 +0800 you wrote:
> Generic XDP devmap multi redirect can leave cloned skbs sharing packet
> data. When a devmap egress program mutates packet data, another
> destination sharing the same data may observe that mutation.
>
> Fix this by making cloned skbs private before running the generic devmap
> egress program. The private copy is made in dev_map_generic_redirect()
> so dev_map_bpf_prog_run_skb() can keep returning the XDP action directly.
>
> [...]
Here is the summary with links:
- [bpf,v5,1/2] bpf: Run generic devmap egress prog on private skb
https://git.kernel.org/bpf/bpf-next/c/6001896f0098
- [bpf,v5,2/2] selftests/bpf: Cover generic devmap egress last-dst rewrite
https://git.kernel.org/bpf/bpf-next/c/f0eff94d07cd
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH v2 bpf-next/net 5/5] selftest: bpf: Add test for hwtstamp proxy.
From: Alexei Starovoitov @ 2026-06-13 1:31 UTC (permalink / raw)
To: Kuniyuki Iwashima, Alexei Starovoitov, Daniel Borkmann,
Martin KaFai Lau, Stanislav Fomichev, Andrii Nakryiko,
John Fastabend, Kumar Kartikeya Dwivedi, Eduard Zingerman
Cc: Song Liu, Yonghong Song, Jiri Olsa, Andrew Lunn, David S . Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
Willem de Bruijn, Kuniyuki Iwashima, bpf, netdev
In-Reply-To: <20260613010039.1362312-6-kuniyu@google.com>
On Fri Jun 12, 2026 at 6:00 PM PDT, Kuniyuki Iwashima wrote:
> diff --git a/tools/testing/selftests/bpf/bpf_kfuncs.h b/tools/testing/selftests/bpf/bpf_kfuncs.h
> index 7dad01439391..8d119b10ed0d 100644
> --- a/tools/testing/selftests/bpf/bpf_kfuncs.h
> +++ b/tools/testing/selftests/bpf/bpf_kfuncs.h
> @@ -92,4 +92,14 @@ extern int bpf_set_dentry_xattr(struct dentry *dentry, const char *name__str,
> const struct bpf_dynptr *value_p, int flags) __ksym __weak;
> extern int bpf_remove_dentry_xattr(struct dentry *dentry, const char *name__str) __ksym __weak;
>
> +extern int bpf_skb_scrub_tx_tstamp(struct __sk_buff *s) __ksym __weak;
> +
> +struct bpf_hwtstamp;
> +extern int bpf_skb_set_hwtstamp(struct __sk_buff *s,
> + struct bpf_hwtstamp *attrs, int attrs__sz) __ksym __weak;
> +
> +struct bpf_tx_tstamp_cmpl;
> +extern int bpf_skb_complete_tx_tstamp(struct __sk_buff *s,
> + struct bpf_tx_tstamp_cmpl *attrs,
> + int attrs__sz) __ksym __weak;
...
> diff --git a/tools/testing/selftests/bpf/progs/proxy_hwtstamp.c b/tools/testing/selftests/bpf/progs/proxy_hwtstamp.c
> new file mode 100644
> index 000000000000..e13963f2393e
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/proxy_hwtstamp.c
> @@ -0,0 +1,236 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright 2026 Google LLC */
> +
> +#include "vmlinux.h"
Please update your pahole. It must be old, since you had
the need to add kfuncs defs into bpf_kfuncs.h.
pahole was improved a couple of years ago and all kfuncs are now
in vmlinux.h automatically.
Overall, the set looks good to me, but I would like
to see an ack for networking bits from somebody.
bpf-next/net was the right subj to use, but this is very late
into release cycle, so I prefer to land it to bpf-next/master
and send together with main bpf-next PR.
Kuba,
is it ok with you? thoughts on patchset overall?
^ permalink raw reply
* Re: [PATCH bpf-next v3 3/7] bpf, sockmap: zero-initialize pages allocated in bpf_msg_push_data
From: Alexei Starovoitov @ 2026-06-13 1:36 UTC (permalink / raw)
To: Kuniyuki Iwashima, jiayuan.chen
Cc: andrii, ast, bestswngs, bpf, cong.wang, daniel, davem, eddyz87,
edumazet, emil, hawk, horms, ihor.solodrai, jakub, john.fastabend,
jolsa, kuba, linux-kernel, linux-kselftest, martin.lau, memxor,
mmmxny, netdev, pabeni, rhkrqnwk98, sdf, shuah, song, xmei5,
yonghong.song
In-Reply-To: <20260613002906.1336958-1-kuniyu@google.com>
On Fri Jun 12, 2026 at 5:28 PM PDT, Kuniyuki Iwashima wrote:
> From: Jiayuan Chen <jiayuan.chen@linux.dev>
> Date: Fri, 12 Jun 2026 21:07:47 +0800
>> From: Weiming Shi <bestswngs@gmail.com>
>>
>> bpf_msg_push_data() allocates pages via alloc_pages() without
>> __GFP_ZERO. In the non-copy path, the entire page of uninitialized
>> heap content is added directly to the sk_msg scatterlist, which is
>> then transmitted over TCP to userspace via tcp_bpf_push(). In the
>> copy path, a gap of len bytes between the front and back memcpy
>> regions is similarly left uninitialized.
>>
>> This leads to a kernel heap information leak: stale page content
>> including kernel pointers from the direct-map and vmemmap regions
>> is transmitted to userspace, which can be used to defeat KASLR.
>>
>> Add __GFP_ZERO to the alloc_pages() call to ensure the allocated
>> page is always zeroed before it enters the scatterlist.
>>
>> Link: https://lore.kernel.org/all/20260424155913.A19FDC19425@smtp.kernel.org
>> Fixes: 6fff607e2f14 ("bpf: sk_msg program helper bpf_msg_push_data")
>> Tested-by: Xiang Mei <xmei5@asu.edu>
>> Tested-by: Xinyu Ma <mmmxny@gmail.com>
>> Reviewed-by: Jiayuan Chen <jiayuan.chen@linux.dev>
>> Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
>> Signed-off-by: Weiming Shi <bestswngs@gmail.com>
>> Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
>> ---
>> net/core/filter.c | 2 +-
>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/net/core/filter.c b/net/core/filter.c
>> index 3e555f276ba80..6e345ca65ca14 100644
>> --- a/net/core/filter.c
>> +++ b/net/core/filter.c
>> @@ -2832,7 +2832,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
>> if (unlikely(copy + len < copy))
>> return -EINVAL;
>>
>> - page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
>> + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP | __GFP_ZERO,
>
> This is a red flag.
>
> We have a bunch of KMSAN reports due to raw/packet sockets,
> which requires CAP_NET_ADMIN, and leave them unfixed although
> some people attempted to "fix" them by adding __GFP_ZERO to
> __alloc_skb().
yep. It's a bpf prog responsibility to avoid garbage in the payload.
pw-bot: cr
^ permalink raw reply
* Re: [PATCH net] net/mlx5e: Fix oops from ERR_PTR in act-miss restore teardown
From: Jakub Kicinski @ 2026-06-13 1:42 UTC (permalink / raw)
To: tariqt
Cc: Jakub Kicinski, edumazet, pabeni, andrew+netdev, davem, saeedm,
mbloch, leon, vladbu, paulb, netdev, linux-rdma, linux-kernel,
gal, lkayal, cratiu
In-Reply-To: <20260611134836.534015-1-tariqt@nvidia.com>
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
---
net/mlx5e: Fix oops from ERR_PTR in act-miss restore teardown
This change clears attr->act_id_restore_rule to NULL after
esw_add_restore_rule() fails, so the teardown path no longer
dereferences an ERR_PTR cookie as if it were a real mlx5_flow_handle.
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
> index a9001d1c902f..4c135858f297 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
> @@ -5863,6 +5863,7 @@ int mlx5e_tc_action_miss_mapping_get(struct mlx5e_priv *priv, struct mlx5_flow_a
> attr->act_id_restore_rule = esw_add_restore_rule(esw, *act_miss_mapping);
> if (IS_ERR(attr->act_id_restore_rule)) {
> err = PTR_ERR(attr->act_id_restore_rule);
> + attr->act_id_restore_rule = NULL;
> goto err_rule;
> }
[High]
The commit message states that after this change "teardown only sees
NULL or a valid handle", framing NULL as benign. Is that invariant
actually enforced on the consumer side?
Looking at the teardown helper:
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c:mlx5e_tc_action_miss_mapping_put() {
struct mapping_ctx *ctx = mlx5e_get_priv_obj_mapping(priv);
if (is_mdev_switchdev_mode(priv->mdev))
mlx5_del_flow_rules(attr->act_id_restore_rule);
mapping_remove(ctx, act_miss_mapping);
}
There is no NULL check before mlx5_del_flow_rules(), and
mlx5_del_flow_rules() itself dereferences handle->rule[0] without a
NULL check:
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c:mlx5_del_flow_rules() {
struct fs_fte *fte;
int i;
...
fs_get_obj(fte, handle->rule[0]->node.parent);
down_write_ref_node(&fte->node, false);
...
}
So on any path where mlx5e_tc_action_miss_mapping_put() is reached with
attr->act_id_restore_rule == NULL in switchdev mode, this patch
converts the original ERR_PTR dereference into a NULL dereference at
the same site rather than avoiding the crash.
Today the put is gated by attr->ct_attr.offloaded in
mlx5_tc_ct_delete_flow(), and __mlx5_tc_ct_flow_offload() leaves
offloaded clear when mlx5e_tc_action_miss_mapping_get() fails, so the
specific oops in the call trace should no longer be reachable. But
should the put be made symmetric with the new "NULL or valid" producer
contract, e.g.:
if (is_mdev_switchdev_mode(priv->mdev) && attr->act_id_restore_rule)
mlx5_del_flow_rules(attr->act_id_restore_rule);
so that any future caller that relies on the documented invariant
(including mlx5e_clone_flow_attr_for_post_act() in en_tc.c, which
already inherits this field) does not reintroduce the same oops with a
slightly different fault address?
--
pw-bot: cr
^ permalink raw reply
* [PATCH net-next V2] selftests: drv-net: Test queue stall upon reconfig
From: Mohsin Bashir @ 2026-06-13 1:48 UTC (permalink / raw)
To: netdev; +Cc: andrew+netdev, davem, edumazet, kuba, pabeni, shuah,
linux-kselftest
From: Mohsin Bashir <hmohsin@meta.com>
Add a reconfig_tx_stall test that detects the possibility of a TX stall
after ring reconfiguration. The key observation is that drivers using
netif_tx_start_all_queues() are prone to experiencing a stall when
reconfiguration completes compared to drivers using
netif_tx_wake_all_queues(). start_all_queues only clears DRV_XOFF, while
wake_all_queues also calls __netif_schedule() to kick the qdisc. Without
the kick, qdisc backlog present at reconfig time can stay stuck until a
new trigger is issued.
The test caps the TX ring at 64 entries so it fills quickly, then
installs FQ on a target TX queue and sends UDP packets with SO_TXTIME
scheduled in the future. With napi_defer_hard_irqs slowing completions,
the small ring can fill when FQ releases the burst, leaving requeued
qdisc backlog with no FQ timer to rescue it. A subsequent ring reconfig
must wake the queues to drain the backlog. Simply starting the queues can
leave it stuck.
On host with problematic driver:
Sent 128 SO_TXTIME packets (+100ms)
Sent 128 SO_TXTIME packets (+200ms)
Backlog before reconfig: 52632 bytes
Check| At /root/ksft-net-drv/./drivers/net/ring_reconfig.py, ...
Check| ksft_eq(0, backlog,
Check failed 0 != 52632 qdisc backlog stuck on queue 1 after ring reconfig
not ok 3 ring_reconfig.reconfig_tx_stall
On host with fixed driver:
Sent 128 SO_TXTIME packets (+100ms)
Sent 128 SO_TXTIME packets (+200ms)
Backlog before reconfig: 76024 bytes
ok 3 ring_reconfig.reconfig_tx_stall
Signed-off-by: Mohsin Bashir <hmohsin@meta.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
---
Changelog:
V2:
- Reorder config file changes to handle check_selftest failure
- Chain exception with from exc in _create_sotxtime_socket
- Use getattr to get arch-specific value for SO_TXTIME, fallback to 61
V1: https://lore.kernel.org/netdev/20260612001754.2489868-1-mohsin.bashr@gmail.com
---
tools/testing/selftests/drivers/net/config | 4 +
.../selftests/drivers/net/ring_reconfig.py | 176 +++++++++++++++++-
2 files changed, 178 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/drivers/net/config b/tools/testing/selftests/drivers/net/config
index 617de8aaf551..1ef07fae74c1 100644
--- a/tools/testing/selftests/drivers/net/config
+++ b/tools/testing/selftests/drivers/net/config
@@ -4,6 +4,10 @@ CONFIG_DEBUG_INFO_BTF_MODULES=n
CONFIG_INET_PSP=y
CONFIG_IPV6=y
CONFIG_MACSEC=m
+CONFIG_NET_ACT_SKBEDIT=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_FLOWER=m
+CONFIG_NET_CLS_MATCHALL=m
CONFIG_NETCONSOLE=m
CONFIG_NETCONSOLE_DYNAMIC=y
CONFIG_NETCONSOLE_EXTENDED_LOG=y
diff --git a/tools/testing/selftests/drivers/net/ring_reconfig.py b/tools/testing/selftests/drivers/net/ring_reconfig.py
index f9530a8b0856..11491a0b7013 100755
--- a/tools/testing/selftests/drivers/net/ring_reconfig.py
+++ b/tools/testing/selftests/drivers/net/ring_reconfig.py
@@ -5,10 +5,18 @@
Test channel and ring size configuration via ethtool (-L / -G).
"""
+import socket
+import struct
+import time
+
from lib.py import ksft_run, ksft_exit, ksft_pr
from lib.py import ksft_eq
+from lib.py import KsftSkipEx
from lib.py import NetDrvEpEnv, EthtoolFamily, GenerateTraffic
-from lib.py import defer, NlError
+from lib.py import cmd, defer, rand_port, tc, NlError
+
+# Added in Python 3.13; fallback to 61 for x86/ARM/MIPS
+SO_TXTIME = getattr(socket, "SO_TXTIME", 61)
def channels(cfg) -> None:
@@ -151,6 +159,169 @@ def ringparam(cfg) -> None:
GenerateTraffic(cfg).wait_pkts_and_stop(10000)
+def _write_sysfs(cfg, path, val):
+ with open(path, "r", encoding="utf-8") as fp:
+ orig_val = fp.read().strip()
+ if str(val) == orig_val:
+ return
+ with open(path, "w", encoding="utf-8") as fp:
+ fp.write(str(val))
+ defer(lambda p=path, v=orig_val: open(p, "w").write(v))
+
+
+def _get_mq_handle(cfg):
+ qdiscs = tc(f"qdisc show dev {cfg.ifname}", json=True)
+ for q in qdiscs:
+ if q.get("kind") == "mq":
+ return q["handle"]
+ raise KsftSkipEx(f"no mq qdisc found on {cfg.ifname}")
+
+
+def _get_qdisc_backlog(cfg, queue, mq_handle):
+ qdiscs = tc(f"-s qdisc show dev {cfg.ifname}", json=True)
+ target_parent = f"{mq_handle}{queue + 1:x}"
+ for q in qdiscs:
+ if q.get("parent", "") == target_parent:
+ return q.get("backlog")
+ return None
+
+
+def _setup_fq_qdisc(cfg, mq_handle, port, target_queue, other_queue):
+ mq_child_parent = f"{mq_handle}{target_queue + 1:x}"
+
+ # Save the original child qdisc to restore after test
+ qdiscs = tc(f"qdisc show dev {cfg.ifname}", json=True)
+ default_qdisc = cmd("sysctl -n net.core.default_qdisc").stdout.strip()
+ orig_kind = default_qdisc
+ for q in qdiscs:
+ if q.get("parent", "") == mq_child_parent:
+ orig_kind = q.get("kind", default_qdisc)
+ break
+ try:
+ tc(f"qdisc replace dev {cfg.ifname} parent {mq_child_parent} fq")
+ except Exception as exc:
+ raise KsftSkipEx("fq not available (CONFIG_NET_SCH_FQ)") from exc
+ defer(tc,
+ f"qdisc replace dev {cfg.ifname} parent {mq_child_parent} {orig_kind}")
+
+ qdisc_j = tc(f"qdisc show dev {cfg.ifname}", json=True)
+ has_clsact = any(q['kind'] == 'clsact' for q in qdisc_j)
+ if not has_clsact:
+ tc(f"qdisc add dev {cfg.ifname} clsact")
+ defer(tc, f"qdisc del dev {cfg.ifname} clsact")
+
+ proto = "ipv6" if int(cfg.addr_ipver) == 6 else "ip"
+ try:
+ tc(f"filter add dev {cfg.ifname} egress protocol {proto} "
+ f"pref 1 flower ip_proto udp dst_port {port} "
+ f"action skbedit queue_mapping {target_queue}")
+ except Exception as exc:
+ raise KsftSkipEx("tc flower/act_skbedit not available") from exc
+ defer(tc, f"filter del dev {cfg.ifname} egress pref 1")
+
+ tc(f"filter add dev {cfg.ifname} egress pref 100 "
+ f"matchall action skbedit queue_mapping {other_queue}")
+ defer(tc, f"filter del dev {cfg.ifname} egress pref 100")
+
+
+def _create_sotxtime_socket(cfg):
+ sock = socket.socket(socket.AF_INET6 if cfg.addr_ipver == "6"
+ else socket.AF_INET, socket.SOCK_DGRAM)
+ try:
+ sock.setsockopt(socket.SOL_SOCKET, SO_TXTIME, struct.pack("Ii", 1, 0))
+ except OSError as exc:
+ sock.close()
+ raise KsftSkipEx("SO_TXTIME not supported") from exc
+ sock.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE,
+ cfg.ifname.encode())
+ return sock
+
+
+def _send_sotxtime_burst(sock, addr, port, count, delay_ns, ipver):
+ payload = b'\x00' * 1400
+ txtime_ns = time.clock_gettime_ns(time.CLOCK_MONOTONIC) + delay_ns
+
+ ancdata = [(socket.SOL_SOCKET, SO_TXTIME, struct.pack("Q", txtime_ns))]
+ if int(ipver) == 6:
+ dest = (addr, port, 0, 0)
+ else:
+ dest = (addr, port)
+ for _ in range(count):
+ sock.sendmsg([payload], ancdata, 0, dest)
+
+
+def reconfig_tx_stall(cfg) -> None:
+ target_queue = 1
+ other_queue = 0
+
+ ehdr = {'header': {'dev-index': cfg.ifindex}}
+ chans = cfg.eth.channels_get(ehdr)
+
+ if 'combined-max' not in chans:
+ raise KsftSkipEx("device does not support combined channels")
+ if chans['combined-count'] < 2:
+ raise KsftSkipEx("need at least 2 combined channels")
+
+ rings = cfg.eth.rings_get(ehdr)
+ if 'rx' not in rings or 'tx' not in rings:
+ raise KsftSkipEx("device does not expose rx/tx ring params")
+ tx_cur = rings['tx']
+ if tx_cur <= 64:
+ raise KsftSkipEx("tx ring size already at minimum")
+ defer(cfg.eth.rings_set, ehdr | {'tx': tx_cur})
+
+ tx_min = 64
+ cfg.eth.rings_set(ehdr | {'tx': tx_min})
+
+ # Slow completions so the ring stays full after FQ releases packets
+ napi_defer = f"/sys/class/net/{cfg.ifname}/napi_defer_hard_irqs"
+ gro_timeout = f"/sys/class/net/{cfg.ifname}/gro_flush_timeout"
+ _write_sysfs(cfg, napi_defer, 100)
+ _write_sysfs(cfg, gro_timeout, 1000000000)
+
+ mq_handle = _get_mq_handle(cfg)
+ port = rand_port()
+ _setup_fq_qdisc(cfg, mq_handle, port, target_queue, other_queue)
+
+ sock = _create_sotxtime_socket(cfg)
+ defer(sock.close)
+
+ pkt_count = tx_min * 2
+
+ for delay_ms in [100, 200, 500]:
+ delay_ns = delay_ms * 1_000_000
+ _send_sotxtime_burst(sock, cfg.remote_addr, port, pkt_count,
+ delay_ns, cfg.addr_ipver)
+ ksft_pr(f"Sent {pkt_count} SO_TXTIME packets (+{delay_ms}ms)")
+ time.sleep(delay_ms / 1000 + 0.3)
+
+ backlog = _get_qdisc_backlog(cfg, target_queue, mq_handle)
+ if backlog:
+ break
+ else:
+ raise KsftSkipEx("failed to build qdisc backlog")
+
+ ksft_pr(f"Backlog before reconfig: {backlog} bytes")
+
+ # Trigger ring reconfig — driver should call wake, not just start
+ cfg.eth.rings_set(ehdr | {'tx': tx_cur})
+
+ # Let completions proceed normally
+ _write_sysfs(cfg, napi_defer, 0)
+ _write_sysfs(cfg, gro_timeout, 0)
+
+ # Poll for backlog to drain
+ for _ in range(100):
+ backlog = _get_qdisc_backlog(cfg, target_queue, mq_handle)
+ if not backlog:
+ break
+ time.sleep(0.1)
+
+ ksft_eq(0, backlog,
+ comment=f"qdisc backlog stuck on queue {target_queue} "
+ f"after ring reconfig")
+
+
def main() -> None:
""" Ksft boiler plate main """
@@ -158,7 +329,8 @@ def main() -> None:
cfg.eth = EthtoolFamily()
ksft_run([channels,
- ringparam],
+ ringparam,
+ reconfig_tx_stall],
args=(cfg, ))
ksft_exit()
--
2.52.0
^ permalink raw reply related
* Re: [PATCH net] net/mlx5: Check max_macs devlink param value against max capability
From: patchwork-bot+netdevbpf @ 2026-06-13 1:50 UTC (permalink / raw)
To: Tariq Toukan
Cc: edumazet, kuba, pabeni, andrew+netdev, davem, saeedm, leon,
mbloch, moshe, shayd, parav, netdev, linux-rdma, linux-kernel,
gal, dtatulea, ychemla, cjubran
In-Reply-To: <20260611135230.534513-1-tariqt@nvidia.com>
Hello:
This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Thu, 11 Jun 2026 16:52:30 +0300 you wrote:
> From: Dragos Tatulea <dtatulea@nvidia.com>
>
> The max_macs devlink param is checked against the FW max value only at
> param register time (driver load) and inside the validate callback
> (devlink param set). The stored DRIVERINIT value persists across FW
> resets and devlink reloads without any further checks against the max.
>
> [...]
Here is the summary with links:
- [net] net/mlx5: Check max_macs devlink param value against max capability
https://git.kernel.org/netdev/net/c/d7b0413b3571
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH] net: qrtr: fix 32-bit integer overflow in qrtr_endpoint_post()
From: patchwork-bot+netdevbpf @ 2026-06-13 1:50 UTC (permalink / raw)
To: Michael Bommarito
Cc: mani, davem, edumazet, kuba, pabeni, horms, netdev, linux-arm-msm,
stable, linux-kernel
In-Reply-To: <20260611125455.2352279-1-michael.bommarito@gmail.com>
Hello:
This patch was applied to netdev/net.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Thu, 11 Jun 2026 08:54:55 -0400 you wrote:
> qrtr_endpoint_post() validates an incoming packet with
>
> if (!size || len != ALIGN(size, 4) + hdrlen)
> goto err;
>
> where size comes from the wire. On 32-bit, size_t is 32 bits and
> ALIGN(size, 4) wraps to 0 for size >= 0xfffffffd, so the check
> passes and skb_put_data(skb, data + hdrlen, size) writes past the
> hdrlen-sized skb and oopses the kernel. 64-bit is unaffected.
>
> [...]
Here is the summary with links:
- net: qrtr: fix 32-bit integer overflow in qrtr_endpoint_post()
https://git.kernel.org/netdev/net/c/20054869770c
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH v16 net-next 00/10] psp: Add support for dev-assoc/disassoc
From: patchwork-bot+netdevbpf @ 2026-06-13 1:50 UTC (permalink / raw)
To: Wei Wang
Cc: netdev, kuba, daniel.zahka, willemdebruijn.kernel, dw,
andrew+netdev, davem, edumazet, pabeni, horms, weibunny
In-Reply-To: <20260608233118.2694144-1-weibunny.kernel@gmail.com>
Hello:
This series was applied to netdev/net-next.git (main)
by Jakub Kicinski <kuba@kernel.org>:
On Mon, 8 Jun 2026 16:31:08 -0700 you wrote:
> From: Wei Wang <weibunny@fb.com>
>
> The main purpose of this feature is to associate virtual devices like
> veth or netkit with a real PSP device, so we could provide PSP
> functionality to the application running with virtual devices.
>
> A typical deployment that works with this feature is as follows:
> Host Namespace:
> psp_dev_local ←──physically linked──→ psp_dev_peer
> (PSP device)
> │
> │ BPF on psp_dev_local ingress: bpf_redirect_peer() to nk_guest
> │
> nk_host / veth_host
> │
> │ BPF on nk_host ingress: bpf_redirect_neigh() to psp_dev_local
> │
> Guest Namespace (netns):
> │
> nk_guest / veth_guest
> ★ PSP application run here
>
> [...]
Here is the summary with links:
- [v16,net-next,01/10] psp: add admin/non-admin version of psp_device_get_locked
https://git.kernel.org/netdev/net-next/c/1c88a4664779
- [v16,net-next,02/10] psp: add new netlink cmd for dev-assoc and dev-disassoc
https://git.kernel.org/netdev/net-next/c/06c2dce2d0f6
- [v16,net-next,03/10] psp: add a new netdev event for dev unregister
https://git.kernel.org/netdev/net-next/c/0ddb69e2406e
- [v16,net-next,04/10] selftests/net: psp: refactor test builders to use ksft_variants
https://git.kernel.org/netdev/net-next/c/89ed478a6c90
- [v16,net-next,05/10] selftests/net: add _find_bpf_obj() to search hw/ for BPF objects
https://git.kernel.org/netdev/net-next/c/b9d51f2e133c
- [v16,net-next,06/10] selftests/net: rename _nk_host_ifname to nk_host_ifname
https://git.kernel.org/netdev/net-next/c/593e22f6524b
- [v16,net-next,07/10] selftests/net: psp: support PSP in NetDrvContEnv infrastructure
https://git.kernel.org/netdev/net-next/c/1c1c2e5b1fe9
- [v16,net-next,08/10] selftests/net: psp: add dev-assoc data path test
https://git.kernel.org/netdev/net-next/c/43cf629700fa
- [v16,net-next,09/10] selftests/net: psp: add cross-namespace notification tests
https://git.kernel.org/netdev/net-next/c/5280303605bc
- [v16,net-next,10/10] selftests/net: psp: add dev-get, no-nsid, and cleanup tests
https://git.kernel.org/netdev/net-next/c/50d3bdfb84c8
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* Re: [PATCH net-next v3 3/4] selftests/net: ncdevmem: add -b option to set rx-buf-size on bind
From: Stanislav Fomichev @ 2026-06-13 2:03 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
Paolo Abeni, Simon Horman, Andrew Lunn, Gerd Hoffmann,
Vivek Kasireddy, Sumit Semwal, Christian König, Shuah Khan,
netdev, linux-kernel, dri-devel, linux-media, linaro-mm-sig,
linux-kselftest, sdf, razor, daniel, almasrymina, matttbe,
skhawaja, dw, Bobby Eshleman
In-Reply-To: <20260612-tcpdm-large-niovs-v3-3-a3b693e76fcb@meta.com>
On 06/12, Bobby Eshleman wrote:
> From: Bobby Eshleman <bobbyeshleman@meta.com>
>
> Add -b <bytes> to request a non-default niov size via
> NETDEV_A_DMABUF_RX_BUF_SIZE. When the value exceeds PAGE_SIZE,
> udmabuf_alloc() switches to an MFD_HUGETLB-backed memfd so each 2 MB
> hugepage produces one naturally-aligned sg entry.
>
> Reject values > 2 MB up front: MFD_HUGETLB + udmabuf can only guarantee
> 2 MB per sg entry (one hugepage), so a larger rx_buf_size would fail the
> per-sg length/alignment check.
>
> Add CONFIG_HUGETLBFS=y to drivers/net/hw/config so the new path is
> reachable in the CI kernels built for these tests.
>
> Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
^ permalink raw reply
* Re: [PATCH net-next v3 4/4] selftests/net: devmem.py: add check_rx_large_niov
From: Stanislav Fomichev @ 2026-06-13 2:03 UTC (permalink / raw)
To: Bobby Eshleman
Cc: Donald Hunter, Jakub Kicinski, David S. Miller, Eric Dumazet,
Paolo Abeni, Simon Horman, Andrew Lunn, Gerd Hoffmann,
Vivek Kasireddy, Sumit Semwal, Christian König, Shuah Khan,
netdev, linux-kernel, dri-devel, linux-media, linaro-mm-sig,
linux-kselftest, sdf, razor, daniel, almasrymina, matttbe,
skhawaja, dw, Bobby Eshleman
In-Reply-To: <20260612-tcpdm-large-niovs-v3-4-a3b693e76fcb@meta.com>
On 06/12, Bobby Eshleman wrote:
> From: Bobby Eshleman <bobbyeshleman@meta.com>
>
> Add a new devmem test case for binding the dmabuf with rx-buf-size=16K.
> The test sweeps RX payload sizes straddling the niov boundary to cover
> the sub-niov, exact-niov, and multi-niov RX paths.
>
> Signed-off-by: Bobby Eshleman <bobbyeshleman@meta.com>
Acked-by: Stanislav Fomichev <sdf@fomichev.me>
^ permalink raw reply
* Re: [PATCH net-next 2/2] udp: convert udp_lib_getsockopt to sockopt_t
From: Stanislav Fomichev @ 2026-06-13 2:10 UTC (permalink / raw)
To: Breno Leitao
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Willem de Bruijn, Shuah Khan, netdev, linux-kernel,
linux-kselftest, kernel-team
In-Reply-To: <20260612-getsockopt_phase2-v1-2-7b01f1f5d106@debian.org>
On 06/12, Breno Leitao wrote:
> In preparation for converting the proto-layer getsockopt callbacks to the
> sockopt_t interface, switch udp_lib_getsockopt() to take a sockopt_t.
>
> The thin udp_getsockopt()/udpv6_getsockopt() wrappers keep their __user
> signature for now: they build a user-backed sockopt_t with
> sockopt_init_user(), call the helper, and write the returned length back
> to optlen. The helper uses copy_to_iter() instead of copy_to_user().
> No functional change.
>
> Signed-off-by: Breno Leitao <leitao@debian.org>
> ---
> include/net/udp.h | 2 +-
> net/ipv4/udp.c | 40 ++++++++++++++++++++++++++--------------
> net/ipv6/udp.c | 17 ++++++++++++++---
> 3 files changed, 41 insertions(+), 18 deletions(-)
>
> diff --git a/include/net/udp.h b/include/net/udp.h
> index 8262e2b215b4e..1fee17274745f 100644
> --- a/include/net/udp.h
> +++ b/include/net/udp.h
> @@ -430,7 +430,7 @@ struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
> netdev_features_t features,
> bool is_ipv6);
> int udp_lib_getsockopt(struct sock *sk, int level, int optname,
> - char __user *optval, int __user *optlen);
> + sockopt_t *opt);
> int udp_lib_setsockopt(struct sock *sk, int level, int optname,
> sockptr_t optval, unsigned int optlen,
> int (*push_pending_frames)(struct sock *));
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index 70f6cbd4ef73b..0691f74db2c11 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -76,6 +76,7 @@
>
> #include <linux/bpf-cgroup.h>
> #include <linux/uaccess.h>
> +#include <linux/uio.h>
> #include <asm/ioctls.h>
> #include <linux/memblock.h>
> #include <linux/highmem.h>
> @@ -2995,18 +2996,12 @@ static int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t opt
> }
>
> int udp_lib_getsockopt(struct sock *sk, int level, int optname,
> - char __user *optval, int __user *optlen)
> + sockopt_t *opt)
> {
> struct udp_sock *up = udp_sk(sk);
> int val, len;
>
> - if (get_user(len, optlen))
> - return -EFAULT;
[..]
> - if (len < 0)
> - return -EINVAL;
I see this part now in sockopt_init_user, but you mention that it's a
transitional helper. When we drop it, will we lose this <0 check?
Maybe keep `if ((int)opt->optlen < 0)` here for backwards
compatibility?
^ permalink raw reply
* Re: [PATCH net-next 2/2] udp: convert udp_lib_getsockopt to sockopt_t
From: Willem de Bruijn @ 2026-06-13 2:13 UTC (permalink / raw)
To: Breno Leitao, Willem de Bruijn
Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Simon Horman, Shuah Khan, sdf.kernel, netdev, linux-kernel,
linux-kselftest, kernel-team
In-Reply-To: <aiwyvEbSHLHKNXBk@gmail.com>
Breno Leitao wrote:
> Hello Willem,
>
> On Fri, Jun 12, 2026 at 10:58:19AM -0400, Willem de Bruijn wrote:
> > Breno Leitao wrote:
> > > In preparation for converting the proto-layer getsockopt callbacks to the
> > > sockopt_t interface, switch udp_lib_getsockopt() to take a sockopt_t.
> >
> > What is the benefit of this conversion?
>
> This enables kernel callers (io_uring, bpf, etc.) to invoke getsockopt
> directly. The setsockopt() conversion previously used sockptr, but
> Linus objected to that approach and suggested iov_iter instead.
>
> For full context, see:
> https://lore.kernel.org/all/20260408-getsockopt-v3-0-061bb9cb355d@debian.org/
>
> > It does add a lot more complexity and makes the code less obvious.
>
> I agree this adds complexity. Unfortunately, I don't see a simpler way
> to enable getsockopt to work with non-__user addresses.
Oh right, this is already applied for other protocols such as l2tp_ppp,
rds and smc.
^ permalink raw reply
* Re: [PATCH iproute2-next] ipaddress: add support for showing IPv4 devconf attributes
From: Stephen Hemminger @ 2026-06-13 2:29 UTC (permalink / raw)
To: Fernando Fernandez Mancera
Cc: netdev, dsahern, davem, edumazet, kuba, pabeni, horms
In-Reply-To: <20260612231722.30579-1-fmancera@suse.de>
On Sat, 13 Jun 2026 01:17:22 +0200
Fernando Fernandez Mancera <fmancera@suse.de> wrote:
> +static void print_inet(FILE *fp, struct rtattr *inet_attr)
> +{
> + struct rtattr *tb[IFLA_INET_MAX + 1];
> +
> + parse_rtattr_nested(tb, IFLA_INET_MAX, inet_attr);
> +
> + if (tb[IFLA_INET_CONF] && show_details) {
> + int *conf = RTA_DATA(tb[IFLA_INET_CONF]);
> + int max_elements = RTA_PAYLOAD(tb[IFLA_INET_CONF]) / sizeof(int);
> +
> + if (max_elements >= IPV4_DEVCONF_FORWARDING)
> + print_string(PRINT_ANY, "forwarding", "forwarding %s ",
> + conf[IPV4_DEVCONF_FORWARDING - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_MC_FORWARDING)
> + print_string(PRINT_ANY, "mc_forwarding", "mc_forwarding %s ",
> + conf[IPV4_DEVCONF_MC_FORWARDING - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_PROXY_ARP)
> + print_string(PRINT_ANY, "proxy_arp", "proxy_arp %s ",
> + conf[IPV4_DEVCONF_PROXY_ARP - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_ACCEPT_REDIRECTS)
> + print_string(PRINT_ANY, "accept_redirects",
> + "accept_redirects %s ",
> + conf[IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_SECURE_REDIRECTS)
> + print_string(PRINT_ANY, "secure_redirects",
> + "secure_redirects %s ",
> + conf[IPV4_DEVCONF_SECURE_REDIRECTS - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_SEND_REDIRECTS)
> + print_string(PRINT_ANY, "send_redirects", "send_redirects %s ",
> + conf[IPV4_DEVCONF_SEND_REDIRECTS - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_SHARED_MEDIA)
> + print_string(PRINT_ANY, "shared_media", "shared_media %s ",
> + conf[IPV4_DEVCONF_SHARED_MEDIA - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_RP_FILTER)
> + print_int(PRINT_ANY, "rp_filter", "rp_filter %d ",
> + conf[IPV4_DEVCONF_RP_FILTER - 1]);
> +
> + if (max_elements >= IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE)
> + print_string(PRINT_ANY, "accept_source_route",
> + "accept_source_route %s ",
> + conf[IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_BOOTP_RELAY)
> + print_string(PRINT_ANY, "bootp_relay", "bootp_relay %s ",
> + conf[IPV4_DEVCONF_BOOTP_RELAY - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_LOG_MARTIANS)
> + print_string(PRINT_ANY, "log_martians", "log_martians %s ",
> + conf[IPV4_DEVCONF_LOG_MARTIANS - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_TAG)
> + print_int(PRINT_ANY, "tag", "tag %d ",
> + conf[IPV4_DEVCONF_TAG - 1]);
> +
> + if (max_elements >= IPV4_DEVCONF_ARPFILTER)
> + print_string(PRINT_ANY, "arpfilter", "arpfilter %s ",
> + conf[IPV4_DEVCONF_ARPFILTER - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_MEDIUM_ID)
> + print_int(PRINT_ANY, "medium_id", "medium_id %d ",
> + conf[IPV4_DEVCONF_MEDIUM_ID - 1]);
> +
> + if (max_elements >= IPV4_DEVCONF_NOXFRM)
> + print_string(PRINT_ANY, "noxfrm", "noxfrm %s ",
> + conf[IPV4_DEVCONF_NOXFRM - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_NOPOLICY)
> + print_string(PRINT_ANY, "nopolicy", "nopolicy %s ",
> + conf[IPV4_DEVCONF_NOPOLICY - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_FORCE_IGMP_VERSION)
> + print_int(PRINT_ANY, "force_igmp_version", "force_igmp_version %d ",
> + conf[IPV4_DEVCONF_FORCE_IGMP_VERSION - 1]);
> +
> + if (max_elements >= IPV4_DEVCONF_ARP_ANNOUNCE)
> + print_int(PRINT_ANY, "arp_announce", "arp_announce %d ",
> + conf[IPV4_DEVCONF_ARP_ANNOUNCE - 1]);
> +
> + if (max_elements >= IPV4_DEVCONF_ARP_IGNORE)
> + print_int(PRINT_ANY, "arp_ignore", "arp_ignore %d ",
> + conf[IPV4_DEVCONF_ARP_IGNORE - 1]);
> +
> + if (max_elements >= IPV4_DEVCONF_PROMOTE_SECONDARIES)
> + print_string(PRINT_ANY, "promote_secondaries",
> + "promote_secondaries %s ",
> + conf[IPV4_DEVCONF_PROMOTE_SECONDARIES - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_ARP_ACCEPT)
> + print_int(PRINT_ANY, "arp_accept", "arp_accept %d ",
> + conf[IPV4_DEVCONF_ARP_ACCEPT - 1]);
> +
> + if (max_elements >= IPV4_DEVCONF_ARP_NOTIFY)
> + print_string(PRINT_ANY, "arp_notify", "arp_notify %s ",
> + conf[IPV4_DEVCONF_ARP_NOTIFY - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_ACCEPT_LOCAL)
> + print_string(PRINT_ANY, "accept_local", "accept_local %s ",
> + conf[IPV4_DEVCONF_ACCEPT_LOCAL - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_SRC_VMARK)
> + print_string(PRINT_ANY, "src_vmark", " src_vmark %s",
> + conf[IPV4_DEVCONF_SRC_VMARK - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_PROXY_ARP_PVLAN)
> + print_string(PRINT_ANY, "proxy_arp_pvlan", "proxy_arp_pvlan %s ",
> + conf[IPV4_DEVCONF_PROXY_ARP_PVLAN - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_ROUTE_LOCALNET)
> + print_string(PRINT_ANY, "route_localnet", "route_localnet %s ",
> + conf[IPV4_DEVCONF_ROUTE_LOCALNET - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_BC_FORWARDING)
> + print_string(PRINT_ANY, "bc_forwarding", "bc_forwarding %s ",
> + conf[IPV4_DEVCONF_BC_FORWARDING - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL)
> + print_int(PRINT_ANY, "igmpv2_unsolicited_report_interval",
> + "igmpv2_unsolicited_report_interval %d ",
> + conf[IPV4_DEVCONF_IGMPV2_UNSOLICITED_REPORT_INTERVAL - 1]);
> +
> + if (max_elements >= IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL)
> + print_int(PRINT_ANY, "igmpv3_unsolicited_report_interval",
> + "igmpv3_unsolicited_report_interval %d ",
> + conf[IPV4_DEVCONF_IGMPV3_UNSOLICITED_REPORT_INTERVAL - 1]);
> +
> + if (max_elements >= IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN)
> + print_string(PRINT_ANY, "ignore_routes_with_linkdown",
> + "ignore_routes_with_linkdown %s ",
> + conf[IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1] ?
> + "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST)
> + print_string(PRINT_ANY, "drop_unicast_in_l2_multicast",
> + "drop_unicast_in_l2_multicast %s ",
> + conf[IPV4_DEVCONF_DROP_UNICAST_IN_L2_MULTICAST - 1] ?
> + "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_DROP_GRATUITOUS_ARP)
> + print_string(PRINT_ANY, "drop_gratuitous_arp",
> + "drop_gratuitous_arp %s ",
> + conf[IPV4_DEVCONF_DROP_GRATUITOUS_ARP - 1] ? "on" : "off");
> +
> + if (max_elements >= IPV4_DEVCONF_ARP_EVICT_NOCARRIER)
> + print_string(PRINT_ANY, "arp_evict_nocarrier",
> + "arp_evict_nocarrier %s ",
> + conf[IPV4_DEVCONF_ARP_EVICT_NOCARRIER - 1] ? "on" : "off");
> + }
> +}
> +
There are three different ways to display a flag value in JSON used in iproute2.
This one is my least favorite.
The three ways are:
- print_bool
- print_null (only if on)
- print_string
I would use the print_null pattern but print_bool would also be ok.
^ permalink raw reply
* Re: [PATCH v2 bpf-next/net 1/5] ethtool: Introduce ETHTOOL_MSG_TSINFO_SET for virtual interfaces.
From: Stanislav Fomichev @ 2026-06-13 2:29 UTC (permalink / raw)
To: Kuniyuki Iwashima
Cc: Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau,
Stanislav Fomichev, Andrii Nakryiko, John Fastabend,
Kumar Kartikeya Dwivedi, Eduard Zingerman, Song Liu,
Yonghong Song, Jiri Olsa, Andrew Lunn, David S . Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
Willem de Bruijn, Kuniyuki Iwashima, bpf, netdev
In-Reply-To: <20260613010039.1362312-2-kuniyu@google.com>
On 06/13, Kuniyuki Iwashima wrote:
> Before enabling SO_TIMESTAMPING, applications typically try to
> enable hardware timestamping on network interfaces via SIOCSHWTSTAMP
> (or ETHTOOL_MSG_TSCONFIG_SET).
>
> The timestamping capability on an interface can be checked via
> ETHTOOL_MSG_TSINFO_GET:
>
> # ethtool -T eth0
> Time stamping parameters for eth0:
> Capabilities:
> hardware-transmit
> software-transmit
> hardware-receive
> software-receive
> software-system-clock
> hardware-raw-clock
> PTP Hardware Clock: none
> Hardware Transmit Timestamp Modes:
> off
> on
> Hardware Receive Filter Modes:
> none
> all
>
> These operations rely on the driver implementing two callbacks,
> dev->netdev_ops->ndo_hwtstamp_{get,set}().
>
> However, among all virtual network interfaces, only bond and
> macvlan currently implement them.
>
> As a result, most virtual interfaces cannot advertise the
> capabilities of their underlying devices:
>
> # ip link add ipvl0 link eth0 type ipvlan mode l2 bridge
> # ethtool -T ipvl0
> Time stamping parameters for ipvl0:
> Capabilities:
> software-receive
> software-system-clock
> PTP Hardware Clock: none
> Hardware Transmit Timestamp Modes: none
> Hardware Receive Filter Modes: none
>
> While these callbacks could be implemented in each virtual
> interface, this approach is limited to those directly linked
> to a physical device.
>
> Not all virtual interfaces are tied to real hardware; for
> instance, packets from UDP tunnel devices eventually pass
> through physical devices and can be hardware-timestamped there.
>
> Let's allow configuring the hardware timestamping capability on
> virtual interfaces via ETHTOOL_MSG_TSINFO_SET.
I don't have a lot of state on this, but when adding xdp hw timestamping
I remember fighting a lot with those apis. Now we are adding another
one, but (seemingly?) for sw devices. Can you explain a bit more about
why can't existing SIOCGHWTSTAMP (or ETHTOOL_MSG_TSCONFIG_SET) fall back
to your new dev->tsinfo.enabled codepaths for the devices that
don't implement the ops?
^ permalink raw reply
* RE: [PATCH net] tipc: restrict socket queue dumps in enqueue tracepoints
From: Tung Quang Nguyen @ 2026-06-13 2:34 UTC (permalink / raw)
To: Li Xiasong
Cc: stable@vger.kernel.org, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, netdev@vger.kernel.org,
tipc-discussion@lists.sourceforge.net, yuehaibing@huawei.com,
zhangchangzhong@huawei.com, weiyongjun1@huawei.com, Jon Maloy
In-Reply-To: <20260611135647.3666727-1-lixiasong1@huawei.com>
>Subject: [PATCH net] tipc: restrict socket queue dumps in enqueue tracepoints
>
>tipc_sk_enqueue() runs with sk->sk_lock.slock held while the socket is owned
>by user context. The spinlock protects the backlog queue in this path, but it
>does not serialize against the socket owner consuming or purging
>sk_receive_queue.
>
>KASAN reported:
>
> CPU: 14 UID: 0 PID: 1050 Comm: tipc3 Not tainted 7.1.0-rc6+ #126
>PREEMPT(lazy)
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1
>04/01/2014
> Call Trace:
> <TASK>
> dump_stack_lvl+0x76/0xa0 lib/dump_stack.c:123
> print_report+0xce/0x5b0 mm/kasan/report.c:482
> kasan_report+0xc6/0x100 mm/kasan/report.c:597
> __asan_report_load4_noabort+0x14/0x30 mm/kasan/report_generic.c:380
> tipc_skb_dump+0x1327/0x16f0 net/tipc/trace.c:73
> tipc_list_dump+0x208/0x2e0 net/tipc/trace.c:187
> tipc_sk_dump+0xaf6/0xd60 net/tipc/socket.c:3996
> trace_event_raw_event_tipc_sk_class+0x312/0x5a0 net/tipc/trace.h:188
> tipc_sk_rcv+0xb1d/0x1d50 net/tipc/socket.c:2497
> tipc_node_xmit+0x1c3/0x1440 net/tipc/node.c:1689
> __tipc_sendmsg+0x97a/0x1440 net/tipc/socket.c:1512
> tipc_sendmsg+0x52/0x80 net/tipc/socket.c:1400
> sock_sendmsg+0x2f6/0x3e0 net/socket.c:825
> splice_to_socket+0x7f9/0x1010 fs/splice.c:884
> do_splice+0xe21/0x2330 fs/splice.c:936
> __do_splice+0x153/0x260 fs/splice.c:1431
> __x64_sys_splice+0x150/0x230 fs/splice.c:1616
> x64_sys_call+0xeb5/0x2790 arch/x86/entry/syscall_64.c:41
> do_syscall_64+0xf3/0x620 arch/x86/entry/syscall_64.c:63
> entry_SYSCALL_64_after_hwframe+0x76/0x7e
>arch/x86/entry/entry_64.S:130
> RIP: 0033:0x71624e8aafe2
> Code: 08 0f 85 71 3a ff ff 49 89 fb 48 89 f0 48 89 d7 48 89 ce 4c 89 c2 4d 89 ca
>4c 8b 44 24 08 4c 8b 4c 24 10 4c 89 5c 24 08 0f 05 <c3> 66 2e 0f 1f 84 00 00 00
>00 00 66 2e 0f 1f 84 00 00 00 00 00 66
> RSP: 002b:0000716157ffed68 EFLAGS: 00000246 ORIG_RAX:
>0000000000000113
> RAX: ffffffffffffffda RBX: 0000716157fff6c0 RCX: 000071624e8aafe2
> RDX: 000000000000005f RSI: 0000000000000000 RDI: 0000000000000066
> RBP: 0000716157ffed90 R08: 0000000000008000 R09: 0000000000000001
> R10: 0000000000000000 R11: 0000000000000246 R12: ffffffffffffff00
> R13: 0000000000000021 R14: 0000000000000000 R15: 00007fff89799c40
> </TASK>
>
>The TIPC_DUMP_ALL tracepoints in tipc_sk_enqueue() also dump
>sk_receive_queue and can therefore dereference skbs that the socket owner
>has already dequeued or freed. Restrict these dumps to
>TIPC_DUMP_SK_BKLGQ, which matches the queue protected by the held
>spinlock.
>
>Keep the change limited to the enqueue path, where the unsafe queue dump
>is reachable while the socket is owned by user context.
>
>Fixes: 01e661ebfbad ("tipc: add trace_events for tipc socket")
>Cc: stable@vger.kernel.org
>Signed-off-by: Li Xiasong <lixiasong1@huawei.com>
>---
> net/tipc/socket.c | 6 +++---
> 1 file changed, 3 insertions(+), 3 deletions(-)
>
>diff --git a/net/tipc/socket.c b/net/tipc/socket.c index
>9329919fb07f..6b761003bcd1 100644
>--- a/net/tipc/socket.c
>+++ b/net/tipc/socket.c
>@@ -2452,17 +2452,17 @@ static void tipc_sk_enqueue(struct sk_buff_head
>*inputq, struct sock *sk,
> atomic_set(dcnt, 0);
> lim = rcvbuf_limit(sk, skb) + atomic_read(dcnt);
> if (likely(!sk_add_backlog(sk, skb, lim))) {
>- trace_tipc_sk_overlimit1(sk, skb, TIPC_DUMP_ALL,
>+ trace_tipc_sk_overlimit1(sk, skb,
>TIPC_DUMP_SK_BKLGQ,
> "bklg & rcvq >90%
>allocated!");
> continue;
> }
>
>- trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL,
>"err_overload!");
>+ trace_tipc_sk_dump(sk, skb, TIPC_DUMP_SK_BKLGQ,
>"err_overload!");
> /* Overload => reject message back to sender */
> onode = tipc_own_addr(sock_net(sk));
> sk_drops_inc(sk);
> if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) {
>- trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL,
>+ trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_SK_BKLGQ,
> "@sk_enqueue!");
> __skb_queue_tail(xmitq, skb);
> }
>--
>2.34.1
>
Reviewed-by: Tung Nguyen <tung.quang.nguyen@est.tech>
^ permalink raw reply
* RE: [PATCH net] tipc: fix UAF in tipc_l2_send_msg()
From: Tung Quang Nguyen @ 2026-06-13 2:50 UTC (permalink / raw)
To: Eric Dumazet
Cc: Simon Horman, netdev@vger.kernel.org, eric.dumazet@gmail.com,
syzbot+64ec81389cbad56a8c35@syzkaller.appspotmail.com, Jon Maloy,
David S . Miller, Jakub Kicinski, Paolo Abeni
In-Reply-To: <20260612135949.4010482-1-edumazet@google.com>
>Subject: [PATCH net] tipc: fix UAF in tipc_l2_send_msg()
>
>Syzbot reported a slab-use-after-free in ipvlan_hard_header() when called
>from tipc_l2_send_msg().
>
>The root cause is that tipc_disable_l2_media() calls synchronize_net() while b-
>>media_ptr is still valid. This allows concurrent RCU readers to obtain the
>device pointer after synchronize_net() has finished.
>The pointer is cleared later in bearer_disable(), but without any subsequent
>synchronization, allowing the device to be freed while still in use by readers.
>
>Fix this by clearing b->media_ptr in tipc_disable_l2_media() before calling
>synchronize_net().
>
>This is safe to do now because the call order in bearer_disable() was reversed
>in 0d051bf93c06 ("tipc: make bearer packet filtering generic") to call
>tipc_node_delete_links() (which needs the pointer) before disable_media().
>
>Fixes: 282b3a056225 ("tipc: send out RESET immediately when link goes
>down")
>https://lore.kernel.org/netdev/6a2c1007.428ffe26.258b27.015d.GAE@google.c
>om/T/#u
>Reported-by: syzbot+64ec81389cbad56a8c35@syzkaller.appspotmail.com
>Signed-off-by: Eric Dumazet <edumazet@google.com>
>Cc: Jon Maloy <jmaloy@redhat.com>
>---
> net/tipc/bearer.c | 1 +
> 1 file changed, 1 insertion(+)
>
>diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index
>a3bd1ef17558a37787bb92f2c3805c0fda874d8a..05dcd2f9e887a6e5ca6665ab4
>1e4d5b5107f158c 100644
>--- a/net/tipc/bearer.c
>+++ b/net/tipc/bearer.c
>@@ -482,6 +482,7 @@ void tipc_disable_l2_media(struct tipc_bearer *b)
> dev = (struct net_device *)rtnl_dereference(b->media_ptr);
> dev_remove_pack(&b->pt);
> RCU_INIT_POINTER(dev->tipc_ptr, NULL);
>+ RCU_INIT_POINTER(b->media_ptr, NULL);
Since 'b->media_ptr' is reset here, should the same reset be removed in bearer_disable()?
bearer_disable()
{
...
RCU_INIT_POINTER(b->media_ptr, NULL);
...
}
> synchronize_net();
> dev_put(dev);
> }
>--
>2.54.0.1136.gdb2ca164c4-goog
>
^ permalink raw reply
* [PATCH net-next v6 0/4] net: dsa: mxl862xx: SerDes ports
From: Daniel Golle @ 2026-06-13 3:07 UTC (permalink / raw)
To: Daniel Golle, Andrew Lunn, Vladimir Oltean, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Russell King,
linux-kernel, netdev
Add support for the two SerDes PCS interfaces of the MxL862xx switch
ICs, which can both either be used to connect PHYs or SFP cages, or as
CPU port(s). 1000Base-X, 2500Base-X, 10GBase-R, 10GBase-KR, SGMII,
QSGMII and USXGMII (single 10G or quad 2.5G) are supported.
The firmware only added the API to directly control the PCS as of
version 1.0.84, so the PCS features are gated behind a version check.
As the driver is growing do some refactoring to break out the phylink
parts into mxl862xx-phylink.h.
---
Changes since v5:
* clear state->link on the pcs_get_state early-return error paths
* document in phylink_get_caps and in the commit message why the
single-lane SerDes modes stay advertised on old firmware
* document why pcs_config does not need to take serdes_lock
Changes since v4:
* use FIELD_GET/FIELD_PREP and macro definitions for the bitfields
instead of endian-aware structs with bit-sized members
* do not error out on old firmware, so driver at least probes and
CPU port keeps working in firmware-configured mode. Issue a
warning instead.
Changes since v3:
* replace atomic_t serdes_refcount with a plain int guarded by a new
serdes_lock mutex
Changes since v2:
* get rid of endian-specific union handling firmware version
* replace serdes_active bitmap with atomic_t serdes_refcount
* defer mpcs->interface assignment until after firmware ack
* handle firmware error codes in pcs_config
* set st.usx_lane_mode in pcs_get_state
* set lu.usx_subport and lu.usx_lane_mode in pcs_link_up
* use phylink_mii_c22_pcs_encode_advertisement() in CL37 adv
* rework commit message
Changes since v1:
* drop custom ethtool stats (former patch 5/5)
* add __{LE,BE}_BITFIELD layouts to ABI structs
* per-sub-port QSGMII AN restart via usx_subport / usx_lane_mode
* shared-SerDes refcount in pcs_disable via per-XPCS slot bitmap
* let every sub-port call pcs_config
* cache phy_interface_t instead of firmware type
* skip pcs_link_up when inband-AN enabled
* gate phylink_get_caps SerDes modes on same FW version as select_pcs
* interpret xpcs_pcs_cfg.result as signed (s16)
* drop dead MXL862XX_PCS_PORT macro
* drop misleading "downshift detection" line from commit message
Daniel Golle (5):
net: dsa: mxl862xx: store firmware version for feature gating
net: dsa: mxl862xx: move phylink stubs to mxl862xx-phylink.c
net: dsa: mxl862xx: move API macros to mxl862xx-host.h
net: dsa: mxl862xx: add support for SerDes ports
net: dsa: mxl862xx: add SerDes ethtool statistics
drivers/net/dsa/mxl862xx/Makefile | 2 +-
drivers/net/dsa/mxl862xx/mxl862xx-api.h | 392 +++++++++++++++++
drivers/net/dsa/mxl862xx/mxl862xx-cmd.h | 11 +
drivers/net/dsa/mxl862xx/mxl862xx-host.h | 8 +
drivers/net/dsa/mxl862xx/mxl862xx-phylink.c | 446 ++++++++++++++++++++
drivers/net/dsa/mxl862xx/mxl862xx-phylink.h | 27 ++
drivers/net/dsa/mxl862xx/mxl862xx.c | 60 +--
drivers/net/dsa/mxl862xx/mxl862xx.h | 58 +++
8 files changed, 957 insertions(+), 47 deletions(-)
create mode 100644 drivers/net/dsa/mxl862xx/mxl862xx-phylink.c
create mode 100644 drivers/net/dsa/mxl862xx/mxl862xx-phylink.h
--
2.54.0
Daniel Golle (4):
net: dsa: mxl862xx: store firmware version for feature gating
net: dsa: mxl862xx: move phylink stubs to mxl862xx-phylink.c
net: dsa: mxl862xx: move API macros to mxl862xx-host.h
net: dsa: mxl862xx: add support for SerDes ports
drivers/net/dsa/mxl862xx/Makefile | 2 +-
drivers/net/dsa/mxl862xx/mxl862xx-api.h | 215 ++++++++++
drivers/net/dsa/mxl862xx/mxl862xx-cmd.h | 9 +
drivers/net/dsa/mxl862xx/mxl862xx-host.h | 8 +
drivers/net/dsa/mxl862xx/mxl862xx-phylink.c | 446 ++++++++++++++++++++
drivers/net/dsa/mxl862xx/mxl862xx-phylink.h | 21 +
drivers/net/dsa/mxl862xx/mxl862xx.c | 55 +--
drivers/net/dsa/mxl862xx/mxl862xx.h | 57 +++
8 files changed, 767 insertions(+), 46 deletions(-)
create mode 100644 drivers/net/dsa/mxl862xx/mxl862xx-phylink.c
create mode 100644 drivers/net/dsa/mxl862xx/mxl862xx-phylink.h
base-commit: ec782be97d2d364fec730512259e6da259594109
prerequisite-patch-id: 0000000000000000000000000000000000000000
--
2.54.0
^ permalink raw reply
* [PATCH net-next v6 1/4] net: dsa: mxl862xx: store firmware version for feature gating
From: Daniel Golle @ 2026-06-13 3:07 UTC (permalink / raw)
To: Daniel Golle, Andrew Lunn, Vladimir Oltean, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Russell King,
linux-kernel, netdev
In-Reply-To: <cover.1781319534.git.daniel@makrotopia.org>
Query the firmware version at init (already done in wait_ready),
cache it in priv->fw_version, and provide MXL862XX_FW_VER_MIN()
for version-gated code paths throughout the driver.
MXL862XX_FW_VER() packs major/minor/revision into a u32 with
bitwise shifts so that versions compare with natural ordering,
independent of host endianness.
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
---
v6: no changes
v5: no changes
v4: no changes
v3: use bitwise shifts in macro instead of endian-specific union
v2: no changes
drivers/net/dsa/mxl862xx/mxl862xx.c | 3 +++
drivers/net/dsa/mxl862xx/mxl862xx.h | 23 +++++++++++++++++++++++
2 files changed, 26 insertions(+)
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx.c b/drivers/net/dsa/mxl862xx/mxl862xx.c
index b60482d93a85..2f22adedfbf6 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx.c
+++ b/drivers/net/dsa/mxl862xx/mxl862xx.c
@@ -257,6 +257,9 @@ static int mxl862xx_wait_ready(struct dsa_switch *ds)
ver.iv_major, ver.iv_minor,
le16_to_cpu(ver.iv_revision),
le32_to_cpu(ver.iv_build_num));
+ priv->fw_version.major = ver.iv_major;
+ priv->fw_version.minor = ver.iv_minor;
+ priv->fw_version.revision = le16_to_cpu(ver.iv_revision);
return 0;
not_ready_yet:
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx.h b/drivers/net/dsa/mxl862xx/mxl862xx.h
index 80053ab40e4c..e3db3711b245 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx.h
+++ b/drivers/net/dsa/mxl862xx/mxl862xx.h
@@ -3,6 +3,7 @@
#ifndef __MXL862XX_H
#define __MXL862XX_H
+#include <asm/byteorder.h>
#include <linux/mdio.h>
#include <linux/workqueue.h>
#include <net/dsa.h>
@@ -241,6 +242,25 @@ struct mxl862xx_port {
spinlock_t stats_lock; /* protects stats accumulators */
};
+/**
+ * struct mxl862xx_fw_version - firmware version for comparison and display
+ * @major: firmware major version
+ * @minor: firmware minor version
+ * @revision: firmware revision number
+ */
+struct mxl862xx_fw_version {
+ u8 major;
+ u8 minor;
+ u16 revision;
+};
+/* MXL862XX_FW_VER() packs a version into a u32: major in bits 31:24,
+ * minor in bits 23:16 and the revision OR-ed into the low bits, so that
+ * two packed versions compare numerically. MXL862XX_FW_VER_MIN() is true
+ * when the version cached in (priv)->fw_version is at least maj.min.rev.
+ */
+#define MXL862XX_FW_VER(maj, min, rev) \
+ (((u32)(maj) << 24) | ((u32)(min) << 16) | (rev))
+#define MXL862XX_FW_VER_MIN(priv, maj, min, rev) \
+ (MXL862XX_FW_VER((priv)->fw_version.major, (priv)->fw_version.minor, \
+ (priv)->fw_version.revision) >= \
+ MXL862XX_FW_VER(maj, min, rev))
+
/* Bit indices for struct mxl862xx_priv::flags */
#define MXL862XX_FLAG_CRC_ERR 0
#define MXL862XX_FLAG_WORK_STOPPED 1
@@ -258,6 +278,8 @@ struct mxl862xx_port {
* @drop_meter: index of the single shared zero-rate firmware meter
* used to unconditionally drop traffic (used to block
* flooding)
+ * @fw_version: cached firmware version, populated at probe and
+ * compared with MXL862XX_FW_VER_MIN()
* @ports: per-port state, indexed by switch port number
* @bridges: maps DSA bridge number to firmware bridge ID;
* zero means no firmware bridge allocated for that
@@ -275,6 +297,7 @@ struct mxl862xx_priv {
struct work_struct crc_err_work;
unsigned long flags;
u16 drop_meter;
+ struct mxl862xx_fw_version fw_version;
struct mxl862xx_port ports[MXL862XX_MAX_PORTS];
u16 bridges[MXL862XX_MAX_BRIDGES + 1];
u16 evlan_ingress_size;
--
2.54.0
^ permalink raw reply related
* [PATCH net-next v6 2/4] net: dsa: mxl862xx: move phylink stubs to mxl862xx-phylink.c
From: Daniel Golle @ 2026-06-13 3:07 UTC (permalink / raw)
To: Daniel Golle, Andrew Lunn, Vladimir Oltean, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Russell King,
linux-kernel, netdev
In-Reply-To: <cover.1781319534.git.daniel@makrotopia.org>
Move the phylink MAC operations and get_caps callback from mxl862xx.c
into a dedicated mxl862xx-phylink.c file. This prepares for the SerDes
PCS implementation which adds substantial phylink/PCS code -- keeping
it in a separate file avoids function-position churn in the main
driver file.
No functional change.
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
---
v6: no changes
v5: no changes
v4: no changes
v3: no changes
v2: no changes
drivers/net/dsa/mxl862xx/Makefile | 2 +-
drivers/net/dsa/mxl862xx/mxl862xx-phylink.c | 51 +++++++++++++++++++++
drivers/net/dsa/mxl862xx/mxl862xx-phylink.h | 14 ++++++
drivers/net/dsa/mxl862xx/mxl862xx.c | 38 +--------------
4 files changed, 67 insertions(+), 38 deletions(-)
create mode 100644 drivers/net/dsa/mxl862xx/mxl862xx-phylink.c
create mode 100644 drivers/net/dsa/mxl862xx/mxl862xx-phylink.h
diff --git a/drivers/net/dsa/mxl862xx/Makefile b/drivers/net/dsa/mxl862xx/Makefile
index d23dd3cd511d..a7be0e6669df 100644
--- a/drivers/net/dsa/mxl862xx/Makefile
+++ b/drivers/net/dsa/mxl862xx/Makefile
@@ -1,3 +1,3 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_NET_DSA_MXL862) += mxl862xx_dsa.o
-mxl862xx_dsa-y := mxl862xx.o mxl862xx-host.o
+mxl862xx_dsa-y := mxl862xx.o mxl862xx-host.o mxl862xx-phylink.o
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx-phylink.c b/drivers/net/dsa/mxl862xx/mxl862xx-phylink.c
new file mode 100644
index 000000000000..f17c429d1f1d
--- /dev/null
+++ b/drivers/net/dsa/mxl862xx/mxl862xx-phylink.c
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Phylink and PCS support for MaxLinear MxL862xx switch family
+ *
+ * Copyright (C) 2024 MaxLinear Inc.
+ * Copyright (C) 2025 John Crispin <john@phrozen.org>
+ * Copyright (C) 2025 Daniel Golle <daniel@makrotopia.org>
+ */
+
+#include <linux/phylink.h>
+#include <net/dsa.h>
+
+#include "mxl862xx.h"
+#include "mxl862xx-phylink.h"
+
+void mxl862xx_phylink_get_caps(struct dsa_switch *ds, int port,
+ struct phylink_config *config)
+{
+ config->mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 |
+ MAC_100 | MAC_1000 | MAC_2500FD;
+
+ __set_bit(PHY_INTERFACE_MODE_INTERNAL,
+ config->supported_interfaces);
+}
+
+static void mxl862xx_phylink_mac_config(struct phylink_config *config,
+ unsigned int mode,
+ const struct phylink_link_state *state)
+{
+}
+
+static void mxl862xx_phylink_mac_link_down(struct phylink_config *config,
+ unsigned int mode,
+ phy_interface_t interface)
+{
+}
+
+static void mxl862xx_phylink_mac_link_up(struct phylink_config *config,
+ struct phy_device *phydev,
+ unsigned int mode,
+ phy_interface_t interface,
+ int speed, int duplex,
+ bool tx_pause, bool rx_pause)
+{
+}
+
+const struct phylink_mac_ops mxl862xx_phylink_mac_ops = {
+ .mac_config = mxl862xx_phylink_mac_config,
+ .mac_link_down = mxl862xx_phylink_mac_link_down,
+ .mac_link_up = mxl862xx_phylink_mac_link_up,
+};
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx-phylink.h b/drivers/net/dsa/mxl862xx/mxl862xx-phylink.h
new file mode 100644
index 000000000000..c3d5215bdf60
--- /dev/null
+++ b/drivers/net/dsa/mxl862xx/mxl862xx-phylink.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#ifndef __MXL862XX_PHYLINK_H
+#define __MXL862XX_PHYLINK_H
+
+#include <linux/phylink.h>
+
+#include "mxl862xx.h"
+
+extern const struct phylink_mac_ops mxl862xx_phylink_mac_ops;
+void mxl862xx_phylink_get_caps(struct dsa_switch *ds, int port,
+ struct phylink_config *config);
+
+#endif /* __MXL862XX_PHYLINK_H */
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx.c b/drivers/net/dsa/mxl862xx/mxl862xx.c
index 2f22adedfbf6..a193f3c07d35 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx.c
+++ b/drivers/net/dsa/mxl862xx/mxl862xx.c
@@ -22,6 +22,7 @@
#include "mxl862xx-api.h"
#include "mxl862xx-cmd.h"
#include "mxl862xx-host.h"
+#include "mxl862xx-phylink.h"
#define MXL862XX_API_WRITE(dev, cmd, data) \
mxl862xx_api_wrap(dev, cmd, &(data), sizeof((data)), false, false)
@@ -1424,16 +1425,6 @@ static void mxl862xx_port_teardown(struct dsa_switch *ds, int port)
priv->ports[port].setup_done = false;
}
-static void mxl862xx_phylink_get_caps(struct dsa_switch *ds, int port,
- struct phylink_config *config)
-{
- config->mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 |
- MAC_100 | MAC_1000 | MAC_2500FD;
-
- __set_bit(PHY_INTERFACE_MODE_INTERNAL,
- config->supported_interfaces);
-}
-
static int mxl862xx_get_fid(struct dsa_switch *ds, struct dsa_db db)
{
struct mxl862xx_priv *priv = ds->priv;
@@ -2099,33 +2090,6 @@ static const struct dsa_switch_ops mxl862xx_switch_ops = {
.get_stats64 = mxl862xx_get_stats64,
};
-static void mxl862xx_phylink_mac_config(struct phylink_config *config,
- unsigned int mode,
- const struct phylink_link_state *state)
-{
-}
-
-static void mxl862xx_phylink_mac_link_down(struct phylink_config *config,
- unsigned int mode,
- phy_interface_t interface)
-{
-}
-
-static void mxl862xx_phylink_mac_link_up(struct phylink_config *config,
- struct phy_device *phydev,
- unsigned int mode,
- phy_interface_t interface,
- int speed, int duplex,
- bool tx_pause, bool rx_pause)
-{
-}
-
-static const struct phylink_mac_ops mxl862xx_phylink_mac_ops = {
- .mac_config = mxl862xx_phylink_mac_config,
- .mac_link_down = mxl862xx_phylink_mac_link_down,
- .mac_link_up = mxl862xx_phylink_mac_link_up,
-};
-
static int mxl862xx_probe(struct mdio_device *mdiodev)
{
struct device *dev = &mdiodev->dev;
--
2.54.0
^ permalink raw reply related
* [PATCH net-next v6 3/4] net: dsa: mxl862xx: move API macros to mxl862xx-host.h
From: Daniel Golle @ 2026-06-13 3:07 UTC (permalink / raw)
To: Daniel Golle, Andrew Lunn, Vladimir Oltean, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Russell King,
linux-kernel, netdev
In-Reply-To: <cover.1781319534.git.daniel@makrotopia.org>
Move the MXL862XX_API_WRITE, MXL862XX_API_READ and
MXL862XX_API_READ_QUIET convenience macros from mxl862xx.c to
mxl862xx-host.h next to the mxl862xx_api_wrap() prototype they wrap.
This makes them available to other compilation units that include
mxl862xx-host.h, which is needed once the SerDes PCS code in
mxl862xx-phylink.c also calls firmware commands.
No functional change.
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
---
v6: no changes
v5: no changes
v4: no changes
v3: no changes
v2: no changes
drivers/net/dsa/mxl862xx/mxl862xx-host.h | 8 ++++++++
drivers/net/dsa/mxl862xx/mxl862xx.c | 7 -------
2 files changed, 8 insertions(+), 7 deletions(-)
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx-host.h b/drivers/net/dsa/mxl862xx/mxl862xx-host.h
index 84512a30bc18..66d6ae198aff 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx-host.h
+++ b/drivers/net/dsa/mxl862xx/mxl862xx-host.h
@@ -9,6 +9,14 @@ void mxl862xx_host_init(struct mxl862xx_priv *priv);
void mxl862xx_host_shutdown(struct mxl862xx_priv *priv);
int mxl862xx_api_wrap(struct mxl862xx_priv *priv, u16 cmd, void *data, u16 size,
bool read, bool quiet);
+
+#define MXL862XX_API_WRITE(dev, cmd, data) \
+ mxl862xx_api_wrap(dev, cmd, &(data), sizeof((data)), false, false)
+#define MXL862XX_API_READ(dev, cmd, data) \
+ mxl862xx_api_wrap(dev, cmd, &(data), sizeof((data)), true, false)
+#define MXL862XX_API_READ_QUIET(dev, cmd, data) \
+ mxl862xx_api_wrap(dev, cmd, &(data), sizeof((data)), true, true)
+
int mxl862xx_reset(struct mxl862xx_priv *priv);
#endif /* __MXL862XX_HOST_H */
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx.c b/drivers/net/dsa/mxl862xx/mxl862xx.c
index a193f3c07d35..0b1a23364eb5 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx.c
+++ b/drivers/net/dsa/mxl862xx/mxl862xx.c
@@ -24,13 +24,6 @@
#include "mxl862xx-host.h"
#include "mxl862xx-phylink.h"
-#define MXL862XX_API_WRITE(dev, cmd, data) \
- mxl862xx_api_wrap(dev, cmd, &(data), sizeof((data)), false, false)
-#define MXL862XX_API_READ(dev, cmd, data) \
- mxl862xx_api_wrap(dev, cmd, &(data), sizeof((data)), true, false)
-#define MXL862XX_API_READ_QUIET(dev, cmd, data) \
- mxl862xx_api_wrap(dev, cmd, &(data), sizeof((data)), true, true)
-
/* Polling interval for RMON counter accumulation. At 2.5 Gbps with
* minimum-size (64-byte) frames, a 32-bit packet counter wraps in ~880s.
* 2s gives a comfortable margin.
--
2.54.0
^ permalink raw reply related
* [PATCH net-next v6 4/4] net: dsa: mxl862xx: add support for SerDes ports
From: Daniel Golle @ 2026-06-13 3:07 UTC (permalink / raw)
To: Daniel Golle, Andrew Lunn, Vladimir Oltean, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Russell King,
linux-kernel, netdev
In-Reply-To: <cover.1781319534.git.daniel@makrotopia.org>
The MxL862xx has two XPCS/SerDes interfaces (XPCS0 for ports 9-12,
XPCS1 for ports 13-16). Each can operate in various single-lane modes
(SGMII, 1000Base-X, 2500Base-X, 10GBase-R, 10GBase-KR, USXGMII) or as
QSGMII or 10G_QXGMII providing four sub-ports per interface.
Implement phylink PCS operations using the firmware's XPCS API:
- pcs_enable/pcs_disable: refcount the sub-ports sharing an XPCS
and power it down once the last sub-port is released.
- pcs_config: configure negotiation mode and CL37/SGMII advertising.
- pcs_get_state: read link state and the link-partner ability word
from firmware and decode using phylink's standard CL37, SGMII, and
USXGMII decoders.
- pcs_an_restart: restart CL37 or CL73 auto-negotiation.
- pcs_link_up: force speed/duplex for SGMII.
- pcs_inband_caps: report per-mode in-band status capabilities.
Register a PCS instance for each SerDes interface and
QSGMII/10G_QXGMII sub-ports during setup. Advertise the supported
interface modes in phylink_get_caps based on port number.
Firmware older than 1.0.84 lacks the XPCS API and instead configures
the SerDes itself, using defaults stored in flash. mac_select_pcs()
returns NULL in that case while the single-lane interface modes stay
advertised, so a CPU port keeps working in the firmware-configured
mode.
Since Linux currently has no way to express PHY-side role modes, only
the MAC side of SGMII, QSGMII, USXGMII and 10G_QXGMII is implemented
for now.
Signed-off-by: Daniel Golle <daniel@makrotopia.org>
---
v6:
* clear state->link on the pcs_get_state early-return error paths
* document in phylink_get_caps and in the commit message why the
single-lane SerDes modes stay advertised on old firmware
* document why pcs_config does not need to take serdes_lock
v5:
* use FIELD_GET/FIELD_PREP and macro definitions for the bitfields
instead of endian-aware structs with bit-sized members
* do not error out on old firmware, so driver at least probes and
CPU port keeps working in firmware-configured mode. Issue a
warning instead.
v4:
* replace atomic_t serdes_refcount with a plain int guarded by a new
serdes_lock mutex; pcs_disable now holds the lock across the count
and the XPCS power-down so a sibling sub-port enable cannot race the
transition to zero (the atomic only made the counter safe, not the
decision-and-act)
v3:
* replace serdes_active bitmap with atomic_t serdes_refcount
* defer mpcs->interface assignment until after firmware ack
* handle firmware error codes in pcs_config
* set st.usx_lane_mode in pcs_get_state
* set lu.usx_subport and lu.usx_lane_mode in pcs_link_up
* use phylink_mii_c22_pcs_encode_advertisement() in CL37 adv
* rework commit message
v2:
* add __{LE,BE}_BITFIELD layouts to ABI structs
* per-sub-port QSGMII AN restart via usx_subport / usx_lane_mode
* shared-SerDes refcount in pcs_disable via per-XPCS slot bitmap
* let every sub-port call pcs_config
* cache phy_interface_t instead of firmware type
* skip pcs_link_up when inband-AN enabled
* gate phylink_get_caps SerDes modes on same FW version as select_pcs
* interpret xpcs_pcs_cfg.result as signed (s16)
* drop dead MXL862XX_PCS_PORT macro
* drop misleading "downshift detection" line from commit message
drivers/net/dsa/mxl862xx/mxl862xx-api.h | 215 +++++++++++
drivers/net/dsa/mxl862xx/mxl862xx-cmd.h | 9 +
drivers/net/dsa/mxl862xx/mxl862xx-phylink.c | 399 +++++++++++++++++++-
drivers/net/dsa/mxl862xx/mxl862xx-phylink.h | 7 +
drivers/net/dsa/mxl862xx/mxl862xx.c | 7 +-
drivers/net/dsa/mxl862xx/mxl862xx.h | 34 ++
6 files changed, 668 insertions(+), 3 deletions(-)
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx-api.h b/drivers/net/dsa/mxl862xx/mxl862xx-api.h
index fb21ddc1bf1c..a180a5decffc 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx-api.h
+++ b/drivers/net/dsa/mxl862xx/mxl862xx-api.h
@@ -1366,4 +1366,219 @@ struct mxl862xx_rmon_port_cnt {
__le64 tx_good_bytes;
} __packed;
+/* XPCS interface mode, MXL862XX_XPCS_*_INTERFACE field values */
+#define MXL862XX_XPCS_IF_SGMII 0
+#define MXL862XX_XPCS_IF_1000BASEX 1
+#define MXL862XX_XPCS_IF_2500BASEX 2
+#define MXL862XX_XPCS_IF_USXGMII 3 /* single or quad */
+#define MXL862XX_XPCS_IF_10GBASER 4
+#define MXL862XX_XPCS_IF_10GKR 5 /* 10GBASE-KR */
+#define MXL862XX_XPCS_IF_5GBASER 6
+#define MXL862XX_XPCS_IF_QSGMII 7
+
+/* PCS negotiation mode, MXL862XX_XPCS_CFG_NEG_MODE field values */
+#define MXL862XX_XPCS_NEG_NONE 0 /* no inband negotiation */
+#define MXL862XX_XPCS_NEG_INBAND_AN_OFF 1 /* inband, AN disabled */
+#define MXL862XX_XPCS_NEG_INBAND_AN_ON 2 /* inband, AN enabled */
+
+/*
+ * PCS protocol role, MXL862XX_XPCS_CFG_ROLE field value. Selects the role
+ * the XPCS plays in protocols with an asymmetric AN code word (Cisco SGMII
+ * / QSGMII / USXGMII), driving VR_MII_AN_CTRL.TX_CONFIG: MAC means the
+ * local end receives the partner's AN word, PHY means it sources one.
+ * Ignored for symmetric protocols (1000BASE-X, 2500BASE-X, 10GBASE-R/KR).
+ */
+#define MXL862XX_XPCS_ROLE_MAC 0 /* local end is MAC side */
+#define MXL862XX_XPCS_ROLE_PHY 1 /* local end is PHY side */
+
+/* USXGMII lane mode, MXL862XX_XPCS_*_USX_LANE_MODE field values */
+#define MXL862XX_XPCS_USX_SINGLE 0 /* single USXGMII lane */
+#define MXL862XX_XPCS_USX_QUAD 1 /* quad USXGMII, 4 ports/lane */
+
+/**
+ * union mxl862xx_xpcs_an_word - XPCS AN code word, tagged by interface mode
+ * @cl37: 16-bit base-page word exchanged over the CL37 hardware AN path
+ * (SR_MII_AN_ADV on write, SR_MII_LP_BABL on read). Carries the
+ * 802.3 CL37 base page for 1000BASE-X/2500BASE-X and the Cisco
+ * SGMII config word for SGMII/QSGMII.
+ * @usx: USXGMII 16-bit AN code word, MDIO_USXGMII_* layout
+ * @cl73: CL73 48-bit base page (10GBASE-KR), three 16-bit registers per
+ * 802.3 Annex 28C
+ * @cl73.adv1: CL73 SR_AN_ADV1 / SR_AN_LP_ABL1
+ * @cl73.adv2: CL73 SR_AN_ADV2 / SR_AN_LP_ABL2
+ * @cl73.adv3: CL73 SR_AN_ADV3 / SR_AN_LP_ABL3
+ *
+ * The host picks the right member based on the interface field of the
+ * surrounding struct (and, for the asymmetric protocols, on the role).
+ */
+union mxl862xx_xpcs_an_word {
+ __le16 cl37;
+ __le16 usx;
+ struct {
+ __le16 adv1;
+ __le16 adv2;
+ __le16 adv3;
+ } cl73;
+} __packed;
+
+/* PCS duplex mode, MXL862XX_XPCS_*_DUPLEX field values */
+#define MXL862XX_XPCS_DUPLEX_HALF 0
+#define MXL862XX_XPCS_DUPLEX_FULL 1
+
+/**
+ * enum mxl862xx_xpcs_loopback_mode - XPCS loopback mode
+ * @MXL862XX_XPCS_LB_DISABLE: disable all loopback
+ * @MXL862XX_XPCS_LB_PCS_SERIAL: PCS TX-to-RX serial loopback
+ * @MXL862XX_XPCS_LB_PCS_PARALLEL: PCS RX-to-TX parallel loopback
+ * @MXL862XX_XPCS_LB_PMA_SERIAL: PMA TX-to-RX serial loopback
+ * @MXL862XX_XPCS_LB_PMA_PARALLEL: PMA RX-to-TX parallel loopback
+ */
+enum mxl862xx_xpcs_loopback_mode {
+ MXL862XX_XPCS_LB_DISABLE = 0,
+ MXL862XX_XPCS_LB_PCS_SERIAL = 1,
+ MXL862XX_XPCS_LB_PCS_PARALLEL = 2,
+ MXL862XX_XPCS_LB_PMA_SERIAL = 3,
+ MXL862XX_XPCS_LB_PMA_PARALLEL = 4,
+};
+
+/* Fields of mxl862xx_xpcs_pcs_cfg.mode */
+#define MXL862XX_XPCS_CFG_PORT_ID GENMASK(1, 0)
+#define MXL862XX_XPCS_CFG_INTERFACE GENMASK(7, 2)
+#define MXL862XX_XPCS_CFG_NEG_MODE GENMASK(9, 8)
+#define MXL862XX_XPCS_CFG_PERMIT_PAUSE BIT(10)
+#define MXL862XX_XPCS_CFG_USX_LANE_MODE GENMASK(12, 11)
+#define MXL862XX_XPCS_CFG_ROLE BIT(13)
+#define MXL862XX_XPCS_CFG_USX_SUBPORT GENMASK(15, 14)
+
+/**
+ * struct mxl862xx_xpcs_pcs_cfg - PCS configuration parameters
+ * @mode: Packed interface and negotiation parameters, see
+ * MXL862XX_XPCS_CFG_*. port_id is the XPCS port index (0-3);
+ * interface is the PCS interface mode (MXL862XX_XPCS_IF_*);
+ * neg_mode is the negotiation mode (MXL862XX_XPCS_NEG_*);
+ * permit_pause allows pause to MAC; usx_lane_mode is the USXGMII
+ * lane mode (MXL862XX_XPCS_USX_*); role is the protocol role
+ * (MXL862XX_XPCS_ROLE_*); usx_subport is the sub-port (0-3) within
+ * the XPCS -- despite the name it also identifies the QSGMII
+ * sub-port -- used by the firmware to set MAC pause per sub-port
+ * and ignored for the XPCS-wide bringup, which is idempotent across
+ * slots.
+ * @advertising: AN code word the local end transmits. The active union
+ * member is selected by the interface field (and, for the
+ * asymmetric protocols, by role). Ignored when the local end
+ * does not transmit an AN word (role=MAC for SGMII/QSGMII/
+ * USXGMII, 10GBASE-R, 5GBASE-R) or when neg_mode is not
+ * INBAND_AN_ON. Pass all-zero to keep the firmware default
+ * advertisement.
+ * @result: Firmware result, a signed 16-bit value carried in a __le16
+ * (the driver reads it back through an s16 cast). >0 means the
+ * host must follow with an AN restart, 0 means no host follow-up
+ * is needed, <0 is a negative errno in the firmware's own
+ * numbering, which the driver translates to a kernel errno in
+ * mxl862xx_xpcs_errno().
+ */
+struct mxl862xx_xpcs_pcs_cfg {
+ __le16 mode;
+ union mxl862xx_xpcs_an_word advertising;
+ __le16 result;
+} __packed;
+
+/* Fields of mxl862xx_xpcs_pcs_state.mode */
+#define MXL862XX_XPCS_ST_PORT_ID GENMASK(1, 0)
+#define MXL862XX_XPCS_ST_INTERFACE GENMASK(7, 2)
+#define MXL862XX_XPCS_ST_USX_LANE_MODE GENMASK(9, 8)
+#define MXL862XX_XPCS_ST_USX_SUBPORT GENMASK(11, 10)
+#define MXL862XX_XPCS_ST_LINK BIT(12)
+#define MXL862XX_XPCS_ST_AN_COMPLETE BIT(13)
+#define MXL862XX_XPCS_ST_DUPLEX BIT(14)
+#define MXL862XX_XPCS_ST_PCS_FAULT BIT(15)
+#define MXL862XX_XPCS_ST_PAUSE GENMASK(17, 16)
+#define MXL862XX_XPCS_ST_LP_EEE_CAP BIT(18)
+#define MXL862XX_XPCS_ST_LP_EEE_CS_CAP BIT(19)
+
+/**
+ * struct mxl862xx_xpcs_pcs_state - PCS link state
+ * @mode: Packed input parameters and firmware status, see
+ * MXL862XX_XPCS_ST_*. The host writes port_id (XPCS port index 0-3),
+ * interface (MXL862XX_XPCS_IF_*), usx_lane_mode
+ * (MXL862XX_XPCS_USX_*) and usx_subport (0-3); the firmware fills in
+ * link, an_complete, duplex (MXL862XX_XPCS_DUPLEX_*), pcs_fault,
+ * pause (bit 0 symmetric, bit 1 asymmetric), lp_eee_cap and
+ * lp_eee_cs_cap.
+ * @speed: Resolved speed in Mbit/s (output)
+ * @lpa: Link partner ability word (output). Same union as
+ * &union mxl862xx_xpcs_an_word; the host picks the member based on
+ * the interface field.
+ */
+struct mxl862xx_xpcs_pcs_state {
+ __le32 mode;
+ __le16 speed; /* Mbit/s */
+ union mxl862xx_xpcs_an_word lpa;
+} __packed;
+
+/**
+ * struct mxl862xx_xpcs_pcs_disable - PCS disable parameters
+ * @port_id: XPCS port index
+ * @__pad: padding
+ * @result: Firmware result. 0 on success, <0 on error.
+ *
+ * Asserts IDDQ + PHY + XPCS resets to power down the SERDES when the
+ * port is admin-down or no module is plugged in. The next PCS config
+ * implicitly powers it back up and reprograms the desired interface.
+ */
+struct mxl862xx_xpcs_pcs_disable {
+ u8 port_id;
+ u8 __pad;
+ __le16 result;
+} __packed;
+
+/* Fields of mxl862xx_xpcs_an_restart.mode */
+#define MXL862XX_XPCS_ANR_PORT_ID GENMASK(1, 0)
+#define MXL862XX_XPCS_ANR_INTERFACE GENMASK(7, 2)
+#define MXL862XX_XPCS_ANR_USX_LANE_MODE GENMASK(9, 8)
+#define MXL862XX_XPCS_ANR_USX_SUBPORT GENMASK(11, 10)
+
+/**
+ * struct mxl862xx_xpcs_an_restart - AN restart parameters
+ * @mode: Packed input parameters, see MXL862XX_XPCS_ANR_*. port_id is the
+ * XPCS port index (0-3); interface is the PCS interface mode
+ * (MXL862XX_XPCS_IF_*); usx_lane_mode is the USX lane mode
+ * (MXL862XX_XPCS_USX_*); usx_subport (0-3) selects the lane whose
+ * AN is restarted for QSGMII and QUSXGMII and is ignored by
+ * single-lane modes.
+ * @result: Firmware result. 0 on success, <0 on error.
+ *
+ * Restarts auto-negotiation on a single sub-port of the XPCS. The
+ * SERDES must already be configured.
+ */
+struct mxl862xx_xpcs_an_restart {
+ __le16 mode;
+ __le16 result;
+} __packed;
+
+/* Fields of mxl862xx_xpcs_pcs_link_up.mode */
+#define MXL862XX_XPCS_LU_PORT_ID GENMASK(1, 0)
+#define MXL862XX_XPCS_LU_INTERFACE GENMASK(7, 2)
+#define MXL862XX_XPCS_LU_DUPLEX BIT(8)
+#define MXL862XX_XPCS_LU_USX_LANE_MODE GENMASK(10, 9)
+#define MXL862XX_XPCS_LU_USX_SUBPORT GENMASK(12, 11)
+
+/**
+ * struct mxl862xx_xpcs_pcs_link_up - PCS link-up parameters
+ * @mode: Packed input parameters, see MXL862XX_XPCS_LU_*. port_id is the
+ * XPCS port index (0-3); interface is the PCS interface mode
+ * (MXL862XX_XPCS_IF_*); duplex is the duplex mode
+ * (MXL862XX_XPCS_DUPLEX_*); usx_lane_mode is the USX lane mode
+ * (USXGMII only, ignored otherwise, MXL862XX_XPCS_USX_*);
+ * usx_subport (0-3) selects the sub-port for QUSXGMII and QSGMII
+ * (despite the name) and is ignored otherwise.
+ * @speed: Resolved speed in Mbit/s
+ * @result: Firmware result. 0 on success, <0 is errno.
+ *
+ * Called once per link-up event after the host has resolved the
+ * line-side speed/duplex (from the PHY's read_status, from a preceding
+ * PCS get-state, or from a fixed-link description).
+ */
+struct mxl862xx_xpcs_pcs_link_up {
+ __le16 mode;
+ __le16 speed; /* Mbit/s */
+ __le16 result;
+} __packed;
+
#endif /* __MXL862XX_API_H */
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx-cmd.h b/drivers/net/dsa/mxl862xx/mxl862xx-cmd.h
index f1ea40aa7ea0..c87a955c13c4 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx-cmd.h
+++ b/drivers/net/dsa/mxl862xx/mxl862xx-cmd.h
@@ -24,6 +24,7 @@
#define MXL862XX_SS_MAGIC 0x1600
#define GPY_GPY2XX_MAGIC 0x1800
#define SYS_MISC_MAGIC 0x1900
+#define MXL862XX_XPCS_MAGIC 0x1a00
#define MXL862XX_COMMON_CFGGET (MXL862XX_COMMON_MAGIC + 0x9)
#define MXL862XX_COMMON_CFGSET (MXL862XX_COMMON_MAGIC + 0xa)
@@ -71,6 +72,14 @@
#define SYS_MISC_FW_VERSION (SYS_MISC_MAGIC + 0x2)
+#define MXL862XX_XPCS_PCS_CONFIG (MXL862XX_XPCS_MAGIC + 0x1)
+#define MXL862XX_XPCS_PCS_GET_STATE (MXL862XX_XPCS_MAGIC + 0x2)
+#define MXL862XX_XPCS_PCS_DISABLE (MXL862XX_XPCS_MAGIC + 0x4)
+#define MXL862XX_XPCS_AN_RESTART (MXL862XX_XPCS_MAGIC + 0x5)
+#define MXL862XX_XPCS_PCS_LINK_UP (MXL862XX_XPCS_MAGIC + 0x7)
+#define MXL862XX_XPCS_LOOPBACK (MXL862XX_XPCS_MAGIC + 0x8)
+#define MXL862XX_XPCS_RESET (MXL862XX_XPCS_MAGIC + 0x9)
+
#define MMD_API_MAXIMUM_ID 0x7fff
#endif /* __MXL862XX_CMD_H */
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx-phylink.c b/drivers/net/dsa/mxl862xx/mxl862xx-phylink.c
index f17c429d1f1d..b689652aa9b9 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx-phylink.c
+++ b/drivers/net/dsa/mxl862xx/mxl862xx-phylink.c
@@ -7,20 +7,414 @@
* Copyright (C) 2025 Daniel Golle <daniel@makrotopia.org>
*/
+#include <linux/bitfield.h>
+#include <linux/mutex.h>
#include <linux/phylink.h>
#include <net/dsa.h>
#include "mxl862xx.h"
+#include "mxl862xx-api.h"
+#include "mxl862xx-cmd.h"
+#include "mxl862xx-host.h"
#include "mxl862xx-phylink.h"
void mxl862xx_phylink_get_caps(struct dsa_switch *ds, int port,
struct phylink_config *config)
{
+ struct mxl862xx_priv *priv = ds->priv;
+
config->mac_capabilities = MAC_ASYM_PAUSE | MAC_SYM_PAUSE | MAC_10 |
MAC_100 | MAC_1000 | MAC_2500FD;
- __set_bit(PHY_INTERFACE_MODE_INTERNAL,
- config->supported_interfaces);
+ switch (port) {
+ case 1 ... 8:
+ __set_bit(PHY_INTERFACE_MODE_INTERNAL,
+ config->supported_interfaces);
+ break;
+ case 9:
+ case 13:
+ /* Advertised also on old firmware lacking the XPCS API:
+ * there the SerDes runs in its flash-configured mode
+ * without host control (mac_select_pcs returns NULL),
+ * keeping the CPU port working.
+ *
+ * The quad modes set after the fallthrough (QSGMII and
+ * 10G_QXGMII) are, by contrast, only advertised when
+ * MXL862XX_FW_VER_MIN(priv, 1, 0, 84) holds.
+ */
+ __set_bit(PHY_INTERFACE_MODE_SGMII, config->supported_interfaces);
+ __set_bit(PHY_INTERFACE_MODE_1000BASEX, config->supported_interfaces);
+ __set_bit(PHY_INTERFACE_MODE_2500BASEX, config->supported_interfaces);
+ __set_bit(PHY_INTERFACE_MODE_10GBASER, config->supported_interfaces);
+ __set_bit(PHY_INTERFACE_MODE_10GKR, config->supported_interfaces);
+ __set_bit(PHY_INTERFACE_MODE_USXGMII, config->supported_interfaces);
+ fallthrough;
+ case 10 ... 12:
+ case 14 ... 16:
+ if (!MXL862XX_FW_VER_MIN(priv, 1, 0, 84))
+ break;
+ __set_bit(PHY_INTERFACE_MODE_QSGMII, config->supported_interfaces);
+ __set_bit(PHY_INTERFACE_MODE_10G_QXGMII, config->supported_interfaces);
+
+ break;
+ default:
+ break;
+ }
+
+ if (port == 9 || port == 13)
+ config->mac_capabilities |= MAC_10000FD | MAC_5000FD;
+}
+
+static struct mxl862xx_pcs *pcs_to_mxl862xx_pcs(struct phylink_pcs *pcs)
+{
+ return container_of(pcs, struct mxl862xx_pcs, pcs);
+}
+/* Map a phylink interface mode to the firmware's MXL862XX_XPCS_IF_*
+ * value. USXGMII and 10G_QXGMII both map to MXL862XX_XPCS_IF_USXGMII;
+ * the callers tell them apart through the separate USX lane mode field.
+ * Returns -EINVAL for any interface not listed here.
+ */
+static int mxl862xx_xpcs_if_mode(phy_interface_t interface)
+{
+ switch (interface) {
+ case PHY_INTERFACE_MODE_SGMII:
+ return MXL862XX_XPCS_IF_SGMII;
+ case PHY_INTERFACE_MODE_QSGMII:
+ return MXL862XX_XPCS_IF_QSGMII;
+ case PHY_INTERFACE_MODE_1000BASEX:
+ return MXL862XX_XPCS_IF_1000BASEX;
+ case PHY_INTERFACE_MODE_2500BASEX:
+ return MXL862XX_XPCS_IF_2500BASEX;
+ case PHY_INTERFACE_MODE_USXGMII:
+ case PHY_INTERFACE_MODE_10G_QXGMII:
+ return MXL862XX_XPCS_IF_USXGMII;
+ case PHY_INTERFACE_MODE_10GBASER:
+ return MXL862XX_XPCS_IF_10GBASER;
+ case PHY_INTERFACE_MODE_10GKR:
+ return MXL862XX_XPCS_IF_10GKR;
+ default:
+ return -EINVAL;
+ }
+}
+/* Translate phylink's PHYLINK_PCS_NEG_* bits into the firmware's
+ * MXL862XX_XPCS_NEG_* value: without the in-band bit the result is
+ * NEG_NONE; with in-band and PHYLINK_PCS_NEG_ENABLED it is
+ * INBAND_AN_ON; with in-band alone it is INBAND_AN_OFF.
+ */
+static int mxl862xx_xpcs_neg_mode(unsigned int neg_mode)
+{
+ if (!(neg_mode & PHYLINK_PCS_NEG_INBAND))
+ return MXL862XX_XPCS_NEG_NONE;
+ if (neg_mode & PHYLINK_PCS_NEG_ENABLED)
+ return MXL862XX_XPCS_NEG_INBAND_AN_ON;
+ return MXL862XX_XPCS_NEG_INBAND_AN_OFF;
+}
+
+static int mxl862xx_pcs_enable(struct phylink_pcs *pcs)
+{
+ struct mxl862xx_pcs *mpcs = pcs_to_mxl862xx_pcs(pcs);
+
+ /* Bringup is done idempotently by pcs_config; just account this
+ * sub-port so pcs_disable powers the shared XPCS down only after
+ * the last sub-port has been released.
+ *
+ * No firmware command is issued here, so this always returns 0.
+ */
+ mutex_lock(&mpcs->priv->serdes_lock);
+ mpcs->priv->serdes_refcount[mpcs->serdes_id]++;
+ mutex_unlock(&mpcs->priv->serdes_lock);
+
+ return 0;
+}
+
+static void mxl862xx_pcs_disable(struct phylink_pcs *pcs)
+{
+ struct mxl862xx_pcs *mpcs = pcs_to_mxl862xx_pcs(pcs);
+ struct mxl862xx_xpcs_pcs_disable dis = {};
+ struct mxl862xx_priv *priv = mpcs->priv;
+
+ dis.port_id = mpcs->serdes_id;
+
+ /* The SerDes is shared across QSGMII/QUSXGMII sub-ports; only
+ * power it down once the last active sub-port goes away. Hold
+ * serdes_lock across the count and the power-down so a sibling
+ * sub-port enable cannot race the transition to zero.
+ *
+ * The return value of the firmware call is not checked: this
+ * callback returns void.
+ * NOTE(review): the decrement is unconditional, so this relies on
+ * every pcs_disable being preceded by a matching pcs_enable --
+ * confirm that phylink guarantees this pairing.
+ */
+ mutex_lock(&priv->serdes_lock);
+ if (--priv->serdes_refcount[mpcs->serdes_id] == 0)
+ MXL862XX_API_WRITE(priv, MXL862XX_XPCS_PCS_DISABLE, dis);
+ mutex_unlock(&priv->serdes_lock);
+}
+
+/* The XPCS firmware reports failures in the result field using its own
+ * libc errno values; ENOTSUP (134) in particular has no kernel errno.
+ * Translate the codes the firmware can actually return.
+ */
+static int mxl862xx_xpcs_errno(int result)
+{
+ switch (result) {
+ case -5: /* firmware -EIO */
+ return -EIO;
+ case -134: /* firmware -ENOTSUP */
+ return -EOPNOTSUPP;
+ default: /* firmware -EINVAL and anything unexpected */
+ return -EINVAL;
+ }
+}
+/* phylink pcs_config callback: program the interface mode, negotiation
+ * mode, pause permission and (for in-band modes) the CL37/SGMII
+ * advertisement through the MXL862XX_XPCS_PCS_CONFIG firmware command.
+ * The role is always set to MXL862XX_XPCS_ROLE_MAC.
+ *
+ * Return: a negative errno on failure, 1 if the firmware result was
+ * positive, 0 otherwise.
+ */
+static int mxl862xx_pcs_config(struct phylink_pcs *pcs, unsigned int neg_mode,
+ phy_interface_t interface,
+ const unsigned long *advertising,
+ bool permit_pause_to_mac)
+{
+ struct mxl862xx_pcs *mpcs = pcs_to_mxl862xx_pcs(pcs);
+ struct mxl862xx_priv *priv = mpcs->priv;
+ struct mxl862xx_xpcs_pcs_cfg cfg = {};
+ int if_mode, lane, ret, adv;
+
+ if_mode = mxl862xx_xpcs_if_mode(interface);
+ if (if_mode < 0) {
+ dev_err(priv->ds->dev, "unsupported interface: %s\n",
+ phy_modes(interface));
+ return if_mode;
+ }
+
+ /* The XPCS bringup is per-instance and idempotent in the
+ * firmware: every QSGMII/QUSXGMII sub-port may call pcs_config
+ * and the firmware will skip the bringup if the requested mode
+ * matches the cached one, then update MAC pause for the
+ * sub-port indicated by @usx_subport. No serdes_lock is needed
+ * here: the refcount held since pcs_enable keeps a sibling
+ * pcs_disable from powering the XPCS down, and pcs_disable
+ * invalidates the firmware's cached mode so the next pcs_config
+ * redoes the bringup.
+ */
+ lane = (interface == PHY_INTERFACE_MODE_10G_QXGMII) ?
+ MXL862XX_XPCS_USX_QUAD : MXL862XX_XPCS_USX_SINGLE;
+
+ cfg.mode = cpu_to_le16(FIELD_PREP(MXL862XX_XPCS_CFG_PORT_ID,
+ mpcs->serdes_id) |
+ FIELD_PREP(MXL862XX_XPCS_CFG_USX_SUBPORT,
+ mpcs->slot) |
+ FIELD_PREP(MXL862XX_XPCS_CFG_USX_LANE_MODE, lane) |
+ FIELD_PREP(MXL862XX_XPCS_CFG_INTERFACE, if_mode) |
+ FIELD_PREP(MXL862XX_XPCS_CFG_NEG_MODE,
+ mxl862xx_xpcs_neg_mode(neg_mode)) |
+ FIELD_PREP(MXL862XX_XPCS_CFG_ROLE,
+ MXL862XX_XPCS_ROLE_MAC) |
+ FIELD_PREP(MXL862XX_XPCS_CFG_PERMIT_PAUSE,
+ permit_pause_to_mac));
+
+ if (neg_mode & PHYLINK_PCS_NEG_INBAND) {
+ adv = phylink_mii_c22_pcs_encode_advertisement(interface,
+ advertising);
+ if (adv >= 0) /* on a negative return the advertisement stays all-zero */
+ cfg.advertising.cl37 = cpu_to_le16(adv);
+ }
+
+ ret = MXL862XX_API_READ(priv, MXL862XX_XPCS_PCS_CONFIG, cfg);
+ if (ret)
+ return ret;
+ /* The firmware result is a signed 16-bit value carried in a
+ * __le16; negative values are firmware errno codes and get
+ * translated to kernel errnos.
+ */
+ ret = (s16)le16_to_cpu(cfg.result);
+ if (ret < 0)
+ return mxl862xx_xpcs_errno(ret);
+ /* Cache the interface only once the firmware has accepted the
+ * configuration; pcs_an_restart reads it back from mpcs.
+ */
+ mpcs->interface = interface;
+ return ret > 0 ? 1 : 0;
+}
+/* phylink pcs_get_state callback: query MXL862XX_XPCS_PCS_GET_STATE and
+ * decode the reply into @state.
+ *
+ * An unsupported interface or a failed firmware call reports link down.
+ * Link is reported up only if the firmware signals link without a PCS
+ * fault. 1000BASE-X, 2500BASE-X, SGMII and QSGMII feed a BMSR synthesised
+ * from link/an_complete plus the cl37 LPA word to
+ * phylink_mii_c22_pcs_decode_state(); USXGMII and 10G_QXGMII decode the
+ * usx LPA word while link is up; 10GBASE-R and 10GBASE-KR report a fixed
+ * 10G full duplex while link is up. The speed and duplex fields of the
+ * firmware reply are not consumed here.
+ */
+static void mxl862xx_pcs_get_state(struct phylink_pcs *pcs,
+ unsigned int neg_mode,
+ struct phylink_link_state *state)
+{
+ struct mxl862xx_pcs *mpcs = pcs_to_mxl862xx_pcs(pcs);
+ struct mxl862xx_priv *priv = mpcs->priv;
+ struct mxl862xx_xpcs_pcs_state st = {};
+ int if_mode, lane, ret;
+ u32 mode;
+ u16 bmsr;
+
+ if_mode = mxl862xx_xpcs_if_mode(state->interface);
+ if (if_mode < 0) {
+ state->link = false;
+ return;
+ }
+
+ lane = (state->interface == PHY_INTERFACE_MODE_10G_QXGMII) ?
+ MXL862XX_XPCS_USX_QUAD : MXL862XX_XPCS_USX_SINGLE;
+
+ st.mode = cpu_to_le32(FIELD_PREP(MXL862XX_XPCS_ST_PORT_ID,
+ mpcs->serdes_id) |
+ FIELD_PREP(MXL862XX_XPCS_ST_INTERFACE, if_mode) |
+ FIELD_PREP(MXL862XX_XPCS_ST_USX_SUBPORT,
+ mpcs->slot) |
+ FIELD_PREP(MXL862XX_XPCS_ST_USX_LANE_MODE, lane));
+
+ ret = MXL862XX_API_READ(priv, MXL862XX_XPCS_PCS_GET_STATE, st);
+ if (ret) {
+ state->link = false;
+ return;
+ }
+
+ mode = le32_to_cpu(st.mode);
+ state->link = FIELD_GET(MXL862XX_XPCS_ST_LINK, mode) &&
+ !FIELD_GET(MXL862XX_XPCS_ST_PCS_FAULT, mode);
+ state->an_complete = FIELD_GET(MXL862XX_XPCS_ST_AN_COMPLETE, mode);
+
+ switch (state->interface) {
+ case PHY_INTERFACE_MODE_1000BASEX:
+ case PHY_INTERFACE_MODE_2500BASEX:
+ case PHY_INTERFACE_MODE_SGMII:
+ case PHY_INTERFACE_MODE_QSGMII:
+ bmsr = (state->link ? BMSR_LSTATUS : 0) |
+ (state->an_complete ? BMSR_ANEGCOMPLETE : 0);
+ phylink_mii_c22_pcs_decode_state(state, neg_mode, bmsr,
+ le16_to_cpu(st.lpa.cl37));
+ break;
+
+ case PHY_INTERFACE_MODE_USXGMII:
+ case PHY_INTERFACE_MODE_10G_QXGMII:
+ if (state->link)
+ phylink_decode_usxgmii_word(state,
+ le16_to_cpu(st.lpa.usx));
+ break;
+
+ case PHY_INTERFACE_MODE_10GBASER:
+ case PHY_INTERFACE_MODE_10GKR:
+ if (state->link) {
+ state->speed = SPEED_10000;
+ state->duplex = DUPLEX_FULL;
+ }
+ break;
+
+ default:
+ state->link = false;
+ break;
+ }
+}
+
+/* Ask the firmware to restart autonegotiation on the SerDes
+ * instance/slot behind @pcs.
+ *
+ * The interface cached in mpcs->interface selects the XPCS interface and
+ * lane mode; nothing is sent when that interface has no XPCS mode. The
+ * result of the firmware call is not checked, as the callback returns
+ * void.
+ */
+static void mxl862xx_pcs_an_restart(struct phylink_pcs *pcs)
+{
+	struct mxl862xx_pcs *mpcs = pcs_to_mxl862xx_pcs(pcs);
+	struct mxl862xx_xpcs_an_restart req = {};
+	struct mxl862xx_priv *priv = mpcs->priv;
+	int xpcs_if, lane_mode;
+	u16 word;
+
+	xpcs_if = mxl862xx_xpcs_if_mode(mpcs->interface);
+	if (xpcs_if < 0)
+		return;
+
+	if (mpcs->interface == PHY_INTERFACE_MODE_10G_QXGMII)
+		lane_mode = MXL862XX_XPCS_USX_QUAD;
+	else
+		lane_mode = MXL862XX_XPCS_USX_SINGLE;
+
+	word = FIELD_PREP(MXL862XX_XPCS_ANR_PORT_ID, mpcs->serdes_id) |
+	       FIELD_PREP(MXL862XX_XPCS_ANR_INTERFACE, xpcs_if) |
+	       FIELD_PREP(MXL862XX_XPCS_ANR_USX_SUBPORT, mpcs->slot) |
+	       FIELD_PREP(MXL862XX_XPCS_ANR_USX_LANE_MODE, lane_mode);
+	req.mode = cpu_to_le16(word);
+
+	MXL862XX_API_WRITE(priv, MXL862XX_XPCS_AN_RESTART, req);
+}
+
+/* Pass the resolved speed and duplex to the firmware once the link is
+ * up.
+ *
+ * Nothing is sent when inband autonegotiation is enabled or when
+ * @interface has no XPCS mode. The result of the firmware call is not
+ * checked, as the callback returns void.
+ */
+static void mxl862xx_pcs_link_up(struct phylink_pcs *pcs, unsigned int neg_mode,
+				 phy_interface_t interface, int speed,
+				 int duplex)
+{
+	struct mxl862xx_pcs *mpcs = pcs_to_mxl862xx_pcs(pcs);
+	struct mxl862xx_priv *priv = mpcs->priv;
+	struct mxl862xx_xpcs_pcs_link_up req = {};
+	int xpcs_if, lane_mode, dup_mode;
+	u16 word;
+
+	/* In role=MAC with inband-AN enabled the XPCS resolves speed and
+	 * duplex from the partner's AN word itself and the firmware
+	 * short-circuits link_up, so the round-trip is skipped (same
+	 * approach as pcs-mtk-lynxi).
+	 */
+	if (neg_mode == PHYLINK_PCS_NEG_INBAND_ENABLED)
+		return;
+
+	xpcs_if = mxl862xx_xpcs_if_mode(interface);
+	if (xpcs_if < 0)
+		return;
+
+	if (interface == PHY_INTERFACE_MODE_10G_QXGMII)
+		lane_mode = MXL862XX_XPCS_USX_QUAD;
+	else
+		lane_mode = MXL862XX_XPCS_USX_SINGLE;
+
+	/* Anything other than DUPLEX_FULL is sent as half duplex */
+	if (duplex == DUPLEX_FULL)
+		dup_mode = MXL862XX_XPCS_DUPLEX_FULL;
+	else
+		dup_mode = MXL862XX_XPCS_DUPLEX_HALF;
+
+	word = FIELD_PREP(MXL862XX_XPCS_LU_PORT_ID, mpcs->serdes_id) |
+	       FIELD_PREP(MXL862XX_XPCS_LU_INTERFACE, xpcs_if) |
+	       FIELD_PREP(MXL862XX_XPCS_LU_USX_SUBPORT, mpcs->slot) |
+	       FIELD_PREP(MXL862XX_XPCS_LU_USX_LANE_MODE, lane_mode) |
+	       FIELD_PREP(MXL862XX_XPCS_LU_DUPLEX, dup_mode);
+	req.mode = cpu_to_le16(word);
+	req.speed = cpu_to_le16(speed);
+
+	MXL862XX_API_WRITE(priv, MXL862XX_XPCS_PCS_LINK_UP, req);
+}
+
+/* Return the LINK_INBAND_* capability mask for @interface.
+ *
+ * SGMII, QSGMII, 1000BASE-X and 2500BASE-X can run with inband
+ * signalling enabled or disabled; USXGMII, 10G-QXGMII and 10GKR report
+ * enable only; 10GBASE-R reports disable only. Any other interface
+ * yields 0. @pcs is not used.
+ */
+static unsigned int mxl862xx_pcs_inband_caps(struct phylink_pcs *pcs,
+					     phy_interface_t interface)
+{
+	unsigned int caps = 0;
+
+	switch (interface) {
+	case PHY_INTERFACE_MODE_SGMII:
+	case PHY_INTERFACE_MODE_QSGMII:
+	case PHY_INTERFACE_MODE_1000BASEX:
+	case PHY_INTERFACE_MODE_2500BASEX:
+		caps = LINK_INBAND_DISABLE | LINK_INBAND_ENABLE;
+		break;
+	case PHY_INTERFACE_MODE_USXGMII:
+	case PHY_INTERFACE_MODE_10G_QXGMII:
+	case PHY_INTERFACE_MODE_10GKR:
+		caps = LINK_INBAND_ENABLE;
+		break;
+	case PHY_INTERFACE_MODE_10GBASER:
+		caps = LINK_INBAND_DISABLE;
+		break;
+	default:
+		break;
+	}
+
+	return caps;
+}
+
+/* phylink PCS callbacks of the SerDes ports. mxl862xx_setup_pcs()
+ * installs this table on every struct mxl862xx_pcs.
+ */
+static const struct phylink_pcs_ops mxl862xx_pcs_ops = {
+ .pcs_enable = mxl862xx_pcs_enable,
+ .pcs_disable = mxl862xx_pcs_disable,
+ .pcs_config = mxl862xx_pcs_config,
+ .pcs_get_state = mxl862xx_pcs_get_state,
+ .pcs_an_restart = mxl862xx_pcs_an_restart,
+ .pcs_link_up = mxl862xx_pcs_link_up,
+ .pcs_inband_caps = mxl862xx_pcs_inband_caps,
+};
+
+/* Initialise the PCS instance for switch port @port.
+ *
+ * Derives the SerDes instance and slot from @port, installs the PCS
+ * callbacks with link polling enabled, and fills in the supported
+ * interface mask: every slot supports QSGMII and 10G-QXGMII, while the
+ * single-port modes are only offered on slot 0.
+ */
+void mxl862xx_setup_pcs(struct mxl862xx_priv *priv, struct mxl862xx_pcs *pcs,
+			int port)
+{
+	pcs->priv = priv;
+	pcs->interface = PHY_INTERFACE_MODE_NA;
+	pcs->serdes_id = MXL862XX_SERDES_PORT_ID(port);
+	pcs->slot = MXL862XX_SERDES_SLOT(port);
+
+	pcs->pcs.poll = true;
+	pcs->pcs.ops = &mxl862xx_pcs_ops;
+
+	/* Quad modes are available on every slot */
+	__set_bit(PHY_INTERFACE_MODE_QSGMII, pcs->pcs.supported_interfaces);
+	__set_bit(PHY_INTERFACE_MODE_10G_QXGMII, pcs->pcs.supported_interfaces);
+
+	/* Single-port modes are only offered on the first slot */
+	if (pcs->slot == 0) {
+		__set_bit(PHY_INTERFACE_MODE_SGMII,
+			  pcs->pcs.supported_interfaces);
+		__set_bit(PHY_INTERFACE_MODE_1000BASEX,
+			  pcs->pcs.supported_interfaces);
+		__set_bit(PHY_INTERFACE_MODE_2500BASEX,
+			  pcs->pcs.supported_interfaces);
+		__set_bit(PHY_INTERFACE_MODE_10GBASER,
+			  pcs->pcs.supported_interfaces);
+		__set_bit(PHY_INTERFACE_MODE_10GKR,
+			  pcs->pcs.supported_interfaces);
+		__set_bit(PHY_INTERFACE_MODE_USXGMII,
+			  pcs->pcs.supported_interfaces);
+	}
+}
+
+/* Select the PCS for a port.
+ *
+ * Only ports that map into priv->serdes_ports[] have a PCS; the index is
+ * the port number relative to MXL862XX_FIRST_SERDES_PORT, bounded by the
+ * array itself so the lookup cannot run past it. All other ports get
+ * NULL. @interface is not used: a port always maps to the same PCS.
+ *
+ * Returns NULL (with a one-time warning) as well when
+ * MXL862XX_FW_VER_MIN(priv, 1, 0, 84) is not satisfied.
+ */
+static struct phylink_pcs *
+mxl862xx_phylink_mac_select_pcs(struct phylink_config *config,
+				phy_interface_t interface)
+{
+	struct dsa_port *dp = dsa_phylink_to_port(config);
+	struct mxl862xx_priv *priv = dp->ds->priv;
+	int idx = dp->index - MXL862XX_FIRST_SERDES_PORT;
+
+	if (idx < 0 || idx >= (int)ARRAY_SIZE(priv->serdes_ports))
+		return NULL;
+
+	if (!MXL862XX_FW_VER_MIN(priv, 1, 0, 84)) {
+		dev_warn_once(dp->ds->dev,
+			      "SerDes PCS unsupported on old firmware.\n");
+		return NULL;
+	}
+
+	return &priv->serdes_ports[idx].pcs;
 }
static void mxl862xx_phylink_mac_config(struct phylink_config *config,
@@ -48,4 +442,5 @@ const struct phylink_mac_ops mxl862xx_phylink_mac_ops = {
.mac_config = mxl862xx_phylink_mac_config,
.mac_link_down = mxl862xx_phylink_mac_link_down,
.mac_link_up = mxl862xx_phylink_mac_link_up,
+ .mac_select_pcs = mxl862xx_phylink_mac_select_pcs,
};
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx-phylink.h b/drivers/net/dsa/mxl862xx/mxl862xx-phylink.h
index c3d5215bdf60..03bb9caad9aa 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx-phylink.h
+++ b/drivers/net/dsa/mxl862xx/mxl862xx-phylink.h
@@ -7,8 +7,15 @@
#include "mxl862xx.h"
+/* Split a switch port number into its SerDes position: the offset from
+ * MXL862XX_FIRST_SERDES_PORT modulo MXL862XX_SERDES_SLOTS is the slot,
+ * the same offset divided by MXL862XX_SERDES_SLOTS is the SerDes
+ * instance index. Presumably only meaningful for
+ * port >= MXL862XX_FIRST_SERDES_PORT -- no range check is done here.
+ */
+#define MXL862XX_SERDES_SLOT(port) \
+ (((port) - MXL862XX_FIRST_SERDES_PORT) % MXL862XX_SERDES_SLOTS)
+#define MXL862XX_SERDES_PORT_ID(port) \
+ (((port) - MXL862XX_FIRST_SERDES_PORT) / MXL862XX_SERDES_SLOTS)
+
extern const struct phylink_mac_ops mxl862xx_phylink_mac_ops;
void mxl862xx_phylink_get_caps(struct dsa_switch *ds, int port,
struct phylink_config *config);
+void mxl862xx_setup_pcs(struct mxl862xx_priv *priv, struct mxl862xx_pcs *pcs,
+ int port);
#endif /* __MXL862XX_PHYLINK_H */
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx.c b/drivers/net/dsa/mxl862xx/mxl862xx.c
index 0b1a23364eb5..45d237b3a40f 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx.c
+++ b/drivers/net/dsa/mxl862xx/mxl862xx.c
@@ -622,7 +622,7 @@ static int mxl862xx_setup(struct dsa_switch *ds)
int n_user_ports = 0, max_vlans;
int ingress_finals, vid_rules;
struct dsa_port *dp;
- int ret;
+ int ret, i;
ret = mxl862xx_reset(priv);
if (ret)
@@ -632,6 +632,11 @@ static int mxl862xx_setup(struct dsa_switch *ds)
if (ret)
return ret;
+ mutex_init(&priv->serdes_lock);
+ for (i = 0; i < ARRAY_SIZE(priv->serdes_ports); i++)
+ mxl862xx_setup_pcs(priv, &priv->serdes_ports[i],
+ i + MXL862XX_FIRST_SERDES_PORT);
+
/* Calculate Extended VLAN block sizes.
* With VLAN Filter handling VID membership checks:
* Ingress: only final catchall rules (PVID insertion, 802.1Q
diff --git a/drivers/net/dsa/mxl862xx/mxl862xx.h b/drivers/net/dsa/mxl862xx/mxl862xx.h
index e3db3711b245..432a5f3f2e08 100644
--- a/drivers/net/dsa/mxl862xx/mxl862xx.h
+++ b/drivers/net/dsa/mxl862xx/mxl862xx.h
@@ -11,6 +11,9 @@
struct mxl862xx_priv;
#define MXL862XX_MAX_PORTS 17
+/* Switch port number that corresponds to serdes_ports[0] */
+#define MXL862XX_FIRST_SERDES_PORT 9
+/* Number of sub-port slots per SerDes instance */
+#define MXL862XX_SERDES_SLOTS 4
+
#define MXL862XX_DEFAULT_BRIDGE 0
#define MXL862XX_MAX_BRIDGES 48
#define MXL862XX_MAX_BRIDGE_PORTS 128
@@ -242,6 +245,26 @@ struct mxl862xx_port {
spinlock_t stats_lock; /* protects stats accumulators */
};
+/**
+ * struct mxl862xx_pcs - per-port PCS state tying a switch port to a
+ *                       SerDes instance and slot
+ * @pcs: embedded &struct phylink_pcs instance handed to phylink
+ * @priv: pointer to the owning &struct mxl862xx_priv
+ * @serdes_id: SerDes instance index (0 or 1)
+ * @slot: slot within the SerDes (0-3 for QSGMII/QUSXGMII, 0 otherwise)
+ * @interface: cached PHY interface, last value passed to pcs_config().
+ *             %PHY_INTERFACE_MODE_NA before the first successful
+ *             pcs_config(). Used by pcs_an_restart() to populate the
+ *             firmware command and by pcs_disable() to skip the
+ *             firmware power-down for shared (QSGMII/QUSXGMII) modes.
+ */
+struct mxl862xx_pcs {
+ struct phylink_pcs pcs;
+ struct mxl862xx_priv *priv;
+ int serdes_id;
+ int slot;
+ phy_interface_t interface;
+};
+
/**
* struct mxl862xx_fw_version - firmware version for comparison and display
* @major: firmware major version
@@ -280,6 +303,14 @@ struct mxl862xx_fw_version {
* flooding)
* @fw_version: cached firmware version, populated at probe and
* compared with MXL862XX_FW_VER_MIN()
+ * @serdes_ports: SerDes interfaces incl. sub-interfaces in case of
+ * 10G_QXGMII or QSGMII
+ * @serdes_refcount: per-XPCS count of sub-ports enabled by phylink;
+ * pcs_disable powers an XPCS down when the count
+ * reaches zero. Protected by @serdes_lock.
+ * @serdes_lock: serializes the @serdes_refcount transitions with
+ * the XPCS power-down so a sibling sub-port enable
+ * cannot race a power-down to zero
* @ports: per-port state, indexed by switch port number
* @bridges: maps DSA bridge number to firmware bridge ID;
* zero means no firmware bridge allocated for that
@@ -298,6 +329,9 @@ struct mxl862xx_priv {
unsigned long flags;
u16 drop_meter;
struct mxl862xx_fw_version fw_version;
+ struct mxl862xx_pcs serdes_ports[8];
+ int serdes_refcount[2];
+ struct mutex serdes_lock;
struct mxl862xx_port ports[MXL862XX_MAX_PORTS];
u16 bridges[MXL862XX_MAX_BRIDGES + 1];
u16 evlan_ingress_size;
--
2.54.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox