* [PATCH net 3/5] net: hns3: fix permanent link down deadlock after reset
From: Jijie Shao @ 2026-06-17 11:27 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
Cc: shenjian15, liuyonglong, chenhao418, huangdonghua3, yangshuaisong,
netdev, linux-kernel, shaojijie
In-Reply-To: <20260617112721.75186-1-shaojijie@huawei.com>
From: Shuaisong Yang <yangshuaisong@h-partners.com>
Fix a critical race condition deadlock where the network interface
remains permanently Link Down after a hardware reset under specific
ethtool sequences.
This issue exclusively manifests in firmware-controlled PHY topologies
where the driver relies on the IMP firmware to arbitrate link parameters.
Standard devices driven by the kernel's native PHY_LIB are unaffected.
The deadlock occurs via the following path:
1. User disables autoneg and forces an unmatched speed, forcing link
down: `ethtool -s ethx autoneg off speed 10 duplex full`
2. User re-enables autoneg: `ethtool -s ethx autoneg on`. The netdev
stack passes cmd->base.speed as SPEED_UNKNOWN (0xffffffff).
3. Driver saves req_autoneg=1, but before the interface can link up,
a hardware reset is triggered.
4. During reset recovery, MAC init reads the un-synchronized runtime
state mac.autoneg (which is still 0/OFF), misinterprets it as
forced mode, and pushes the cached SPEED_UNKNOWN into the hardware
registers, causing the MAC firmware state machine to freeze.
Meanwhile, PHY init reads req_autoneg=1 and enables PHY autoneg.
Since the MAC is frozen with 0xffffffff and PHY is running autoneg,
they mismatch permanently.
Fix this by:
1. Intercepting SPEED_UNKNOWN/DUPLEX_UNKNOWN in
hclge_set_phy_link_ksettings() and hclge_cfg_mac_speed_dup_h() to
prevent it from corrupting the driver's cached valid configuration.
2. Saving req_autoneg in hclge_set_autoneg().
3. Aligning the state judgment in hclge_set_autoneg_speed_dup() to use
req_autoneg instead of the un-synchronized runtime mac.autoneg,
ensuring both MAC and PHY consistently enter the autoneg branch to
eliminate configuration discrepancies during reset recovery.
Fixes: 05eb60e9648c ("net: hns3: using user configure after hardware reset")
Signed-off-by: Shuaisong Yang <yangshuaisong@h-partners.com>
Signed-off-by: Jijie Shao <shaojijie@huawei.com>
---
.../hisilicon/hns3/hns3pf/hclge_main.c | 22 +++++++++++++------
1 file changed, 15 insertions(+), 7 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 2c74675b149f..63e7b7458de0 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2652,8 +2652,10 @@ static int hclge_cfg_mac_speed_dup_h(struct hnae3_handle *handle, int speed,
if (ret)
return ret;
- hdev->hw.mac.req_speed = (u32)speed;
- hdev->hw.mac.req_duplex = duplex;
+ if (speed != SPEED_UNKNOWN)
+ hdev->hw.mac.req_speed = (u32)speed;
+ if (duplex != DUPLEX_UNKNOWN)
+ hdev->hw.mac.req_duplex = duplex;
return 0;
}
@@ -2684,6 +2686,7 @@ static int hclge_set_autoneg(struct hnae3_handle *handle, bool enable)
{
struct hclge_vport *vport = hclge_get_vport(handle);
struct hclge_dev *hdev = vport->back;
+ int ret;
if (!hdev->hw.mac.support_autoneg) {
if (enable) {
@@ -2695,7 +2698,10 @@ static int hclge_set_autoneg(struct hnae3_handle *handle, bool enable)
}
}
- return hclge_set_autoneg_en(hdev, enable);
+ ret = hclge_set_autoneg_en(hdev, enable);
+ if (!ret)
+ hdev->hw.mac.req_autoneg = enable;
+ return ret;
}
static int hclge_get_autoneg(struct hnae3_handle *handle)
@@ -3406,8 +3412,10 @@ hclge_set_phy_link_ksettings(struct hnae3_handle *handle,
return ret;
hdev->hw.mac.req_autoneg = cmd->base.autoneg;
- hdev->hw.mac.req_speed = cmd->base.speed;
- hdev->hw.mac.req_duplex = cmd->base.duplex;
+ if (cmd->base.speed != SPEED_UNKNOWN)
+ hdev->hw.mac.req_speed = cmd->base.speed;
+ if (cmd->base.duplex != DUPLEX_UNKNOWN)
+ hdev->hw.mac.req_duplex = cmd->base.duplex;
return 0;
}
@@ -11731,12 +11739,12 @@ static int hclge_set_autoneg_speed_dup(struct hclge_dev *hdev)
int ret;
if (hdev->hw.mac.support_autoneg) {
- ret = hclge_set_autoneg_en(hdev, hdev->hw.mac.autoneg);
+ ret = hclge_set_autoneg_en(hdev, hdev->hw.mac.req_autoneg);
if (ret)
return ret;
}
- if (!hdev->hw.mac.autoneg) {
+ if (!hdev->hw.mac.req_autoneg) {
ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.req_speed,
hdev->hw.mac.req_duplex,
hdev->hw.mac.lane_num);
--
2.33.0
^ permalink raw reply related
* [PATCH net 2/5] net: hns3: refactor MAC autoneg and speed configuration
From: Jijie Shao @ 2026-06-17 11:27 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
Cc: shenjian15, liuyonglong, chenhao418, huangdonghua3, yangshuaisong,
netdev, linux-kernel, shaojijie
In-Reply-To: <20260617112721.75186-1-shaojijie@huawei.com>
From: Shuaisong Yang <yangshuaisong@h-partners.com>
Extract the MAC autoneg and speed/duplex/lane configuration logic out
of hclge_mac_init() and encapsulate it into a new dedicated helper
function hclge_set_autoneg_speed_dup().
Currently, hclge_mac_init() handles various heterogeneous operations
including MTU settings, buffer allocation, and loopback initialization.
Stripping the complex link state machine configuration improves code
readability and reduces cyclomatic complexity. This helper function
will also be invoked during the hardware reset recovery path to
re-apply link settings without repeating unnecessary buffer or MTU
initializations.
Signed-off-by: Shuaisong Yang <yangshuaisong@h-partners.com>
Signed-off-by: Jijie Shao <shaojijie@huawei.com>
---
.../hisilicon/hns3/hns3pf/hclge_main.c | 49 +++++++++++++------
1 file changed, 35 insertions(+), 14 deletions(-)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 5a00797d9252..2c74675b149f 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2957,20 +2957,6 @@ static int hclge_mac_init(struct hclge_dev *hdev)
if (!test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state))
hdev->hw.mac.duplex = HCLGE_MAC_FULL;
- if (hdev->hw.mac.support_autoneg) {
- ret = hclge_set_autoneg_en(hdev, hdev->hw.mac.autoneg);
- if (ret)
- return ret;
- }
-
- if (!hdev->hw.mac.autoneg) {
- ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.req_speed,
- hdev->hw.mac.req_duplex,
- hdev->hw.mac.lane_num);
- if (ret)
- return ret;
- }
-
mac->link = 0;
if (mac->user_fec_mode & BIT(HNAE3_FEC_USER_DEF)) {
@@ -11740,6 +11726,27 @@ static int hclge_set_wol(struct hnae3_handle *handle,
return ret;
}
+static int hclge_set_autoneg_speed_dup(struct hclge_dev *hdev)
+{
+ int ret;
+
+ if (hdev->hw.mac.support_autoneg) {
+ ret = hclge_set_autoneg_en(hdev, hdev->hw.mac.autoneg);
+ if (ret)
+ return ret;
+ }
+
+ if (!hdev->hw.mac.autoneg) {
+ ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.req_speed,
+ hdev->hw.mac.req_duplex,
+ hdev->hw.mac.lane_num);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
{
struct pci_dev *pdev = ae_dev->pdev;
@@ -11901,6 +11908,13 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
if (ret)
goto err_ptp_uninit;
+ ret = hclge_set_autoneg_speed_dup(hdev);
+ if (ret) {
+ dev_err(&pdev->dev,
+ "failed to set autoneg speed duplex, ret = %d\n", ret);
+ goto err_ptp_uninit;
+ }
+
INIT_KFIFO(hdev->mac_tnl_log);
hclge_dcb_ops_set(hdev);
@@ -12231,6 +12245,13 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
return ret;
}
+ ret = hclge_set_autoneg_speed_dup(hdev);
+ if (ret) {
+ dev_err(&pdev->dev,
+ "failed to set autoneg speed duplex, ret = %d\n", ret);
+ return ret;
+ }
+
ret = hclge_tp_port_init(hdev);
if (ret) {
dev_err(&pdev->dev, "failed to init tp port, ret = %d\n",
--
2.33.0
^ permalink raw reply related
* [PATCH net 0/5] net: hns3: fix configuration deadlocks and refactor link setup
From: Jijie Shao @ 2026-06-17 11:27 UTC (permalink / raw)
To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
Cc: shenjian15, liuyonglong, chenhao418, huangdonghua3, yangshuaisong,
netdev, linux-kernel, shaojijie
This patch series addresses a sequence of link configuration deadlocks
and parameter contamination issues in the hns3 network driver, which
typically occur during hardware resets or driver initialization under
specific user-configured scenarios.
The bugs stem from asynchronous discrepancies between the MAC state machine
and cached user requests during sudden hardware resets, leading to invalid
parameter combinations or frozen registers.
The series is organized as follows:
- Patch 1 refactors the ethtool link settings entry path to unify copper
port handling (both native kernel PHY_LIB and firmware-controlled PHY)
and ensures req_xxx configurations are uniformly saved across all modes.
- Patch 2 refactors the MAC initialization by extracting the autoneg and
speed configuration logic out of hclge_mac_init() into a dedicated
helper function.
- Patch 3 fixes a permanent link-down deadlock after a reset by
ensuring that the driver caches and uses the user's intended
autoneg/speed settings (req_***) rather than un-synchronized runtime
states or SPEED_UNKNOWN tokens.
- Patch 4 fixes a link loss issue on optical ports during
initialization by differentiating autoneg default values between
copper and fiber media types.
- Patch 5 fixes an initialization (probe) failure caused by lane_num
contamination from a previous active lifecycle by introducing
req_lane_num=0, which leverages firmware automatic lane matching.
Shuaisong Yang (5):
net: hns3: unify copper port ksettings configuration path
net: hns3: refactor MAC autoneg and speed configuration
net: hns3: fix permanent link down deadlock after reset
net: hns3: differentiate autoneg default values between copper and
fiber
net: hns3: fix init failure caused by lane_num contamination
.../ethernet/hisilicon/hns3/hns3_ethtool.c | 26 ++---
.../hisilicon/hns3/hns3pf/hclge_main.c | 100 ++++++++++++++----
.../hisilicon/hns3/hns3pf/hclge_main.h | 1 +
3 files changed, 90 insertions(+), 37 deletions(-)
base-commit: 406e8a651a7b854c41fecd5117bb282b3a6c2c6b
--
2.33.0
^ permalink raw reply
* Re: [PATCH net-next v7 2/2] net: ti: icssg-prueth: Add ethtool ops for Frame Preemption MAC Merge
From: Meghana Malladi @ 2026-06-17 11:25 UTC (permalink / raw)
To: MD Danish Anwar, Jakub Kicinski
Cc: elfring, haokexin, vadim.fedorenko, devnexen, horms,
jacob.e.keller, arnd, basharath, afd, parvathi, vladimir.oltean,
rogerq, pabeni, edumazet, davem, andrew+netdev, linux-arm-kernel,
netdev, linux-kernel, srk, vigneshr
In-Reply-To: <a62d5243-d641-48e7-a1f5-88150513be48@ti.com>
On 6/17/26 10:58, MD Danish Anwar wrote:
> Meghana,
>
> On 16/06/26 6:24 pm, Meghana Malladi wrote:
>> Hi Jakub,
>>
>> On 6/16/26 05:09, Jakub Kicinski wrote:
>>> On Mon, 15 Jun 2026 16:10:41 -0700 Jakub Kicinski wrote:
>>>>> diff --git a/drivers/net/ethernet/ti/icssg/icssg_stats.h b/drivers/
>>>>> net/ethernet/ti/icssg/icssg_stats.h
>>>>> index 5ec0b38e0c67..8073deac35c3 100644
>>>>> --- a/drivers/net/ethernet/ti/icssg/icssg_stats.h
>>>>> +++ b/drivers/net/ethernet/ti/icssg/icssg_stats.h
>>>>> @@ -189,6 +187,11 @@ static const struct icssg_pa_stats
>>>>> icssg_all_pa_stats[] = {
>>>>> ICSSG_PA_STATS(FW_INF_DROP_PRIOTAGGED),
>>>>> ICSSG_PA_STATS(FW_INF_DROP_NOTAG),
>>>>> ICSSG_PA_STATS(FW_INF_DROP_NOTMEMBER),
>>>>> + ICSSG_PA_STATS(FW_PREEMPT_BAD_FRAG),
>>>>> + ICSSG_PA_STATS(FW_PREEMPT_ASSEMBLY_ERR),
>>>>> + ICSSG_PA_STATS(FW_PREEMPT_FRAG_CNT_TX),
>>>>> + ICSSG_PA_STATS(FW_PREEMPT_ASSEMBLY_OK),
>>>>> + ICSSG_PA_STATS(FW_PREEMPT_FRAG_CNT_RX),
>>>>> ICSSG_PA_STATS(FW_RX_EOF_SHORT_FRMERR),
>>>>> ICSSG_PA_STATS(FW_RX_B0_DROP_EARLY_EOF),
>>>>> ICSSG_PA_STATS(FW_TX_JUMBO_FRM_CUTOFF),
>>>>
>>>> [Medium]
>>>> Are these five new entries duplicating values that already have a
>>>> standard uAPI?
>>>>
>>>> The same five firmware counters are exposed through the new
>>>> .get_mm_stats callback as the standardized MAC Merge stats
>>>> (MACMergeFrameAssOkCount, MACMergeFrameAssErrorCount,
>>>> MACMergeFragCountRx,
>>>> MACMergeFragCountTx, MACMergeFrameSmdErrorCount in struct
>>>> ethtool_mm_stats), and adding them to icssg_all_pa_stats[] also
>>>> publishes them via emac_get_strings() / emac_get_ethtool_stats() as
>>>> ethtool -S strings.
>>>>
>>>> Documentation/networking/statistics.rst describes ethtool -S as the
>>>> private-driver-stats interface; counters that have a standard uAPI are
>>>> expected to flow only through that uAPI.
>>>>
>>>> Could the firmware-register lookup table used by emac_get_stat_by_name()
>>>> be separated from the ethtool -S string table, so the new preemption
>>>> counters feed get_mm_stats without also showing up under ethtool -S?
>>>
>>> This -- not sure about the other complaints but this one looks legit.
>>
>> I agree that this is legit, but right now there is no other placeholder
>> other than pa stats to put the mac merge firmware counters. I believe
>
> You can put a boolean is_standard_stats. Only those where
> is_standard_stats=false will be populated via ethtool. Others will be
> populated via the standard interface.
>
> Look at icssg_miig_stats for reference.
>
Sure, since you were already doing some refactoring w.r.t HSR standard
stats I thought this could also be covered there.
I will send out another version addressing this then.
>> the effort needs to go in re-structuring the hardware and firmware stats
>> implementation to address this issue.
>>
>
^ permalink raw reply
* Re: [PATCH v2 4/4] drm/xe/hw_error: Use HW_ERR prefix in log
From: Raag Jadav @ 2026-06-17 11:21 UTC (permalink / raw)
To: Michal Wajdeczko
Cc: intel-xe, dri-devel, netdev, rodrigo.vivi, riana.tauro, dev,
airlied, simona, kuba
In-Reply-To: <ah16Hfwq7goxBm27@black.igk.intel.com>
On Mon, Jun 01, 2026 at 02:25:06PM +0200, Raag Jadav wrote:
> On Mon, Jun 01, 2026 at 01:13:12PM +0200, Michal Wajdeczko wrote:
> > On 5/23/2026 7:00 AM, Raag Jadav wrote:
> > > Hardware errors should be logged with HW_ERR prefix. Make them
> > > consistent with existing logs.
> > >
> > > Fixes: 01aab7e1c9d4 ("drm/xe/xe_hw_error: Add support for PVC SoC errors")
> > > Signed-off-by: Raag Jadav <raag.jadav@intel.com>
> > > ---
> > > drivers/gpu/drm/xe/xe_hw_error.c | 12 ++++++------
> > > 1 file changed, 6 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/xe/xe_hw_error.c b/drivers/gpu/drm/xe/xe_hw_error.c
> > > index 5135e8e4093f..4b72959b2276 100644
> > > --- a/drivers/gpu/drm/xe/xe_hw_error.c
> > > +++ b/drivers/gpu/drm/xe/xe_hw_error.c
> > > @@ -223,9 +223,9 @@ static void log_hw_error(struct xe_tile *tile, const char *name,
> > > struct xe_device *xe = tile_to_xe(tile);
> > >
> > > if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
> > > - drm_warn(&xe->drm, "%s %s detected\n", name, severity_str);
> > > + drm_warn(&xe->drm, HW_ERR "%s %s detected\n", name, severity_str);
> >
> > function is per-tile, so shouldn't we use tile-oriented logs instead?
>
> Agree, but then it needs to be done file-wide which I'll pursue once
> the fix lands and has propagated.
Even better, there's a driver-wide refactor[1] incoming.
Raag
[1] https://patchwork.freedesktop.org/series/168333/
> > xe_tile_warn(tile, HW_ERR ...)
> >
> > > else
> > > - drm_err_ratelimited(&xe->drm, "%s %s detected\n", name, severity_str);
> > > + drm_err_ratelimited(&xe->drm, HW_ERR "%s %s detected\n", name, severity_str);
> >
> > xe_tile_err_ratelimited(tile, HW_ERR ...)
> >
> > > }
> > >
> > > static void log_gt_err(struct xe_tile *tile, const char *name, int i, u32 err,
> > > @@ -235,10 +235,10 @@ static void log_gt_err(struct xe_tile *tile, const char *name, int i, u32 err,
> > > struct xe_device *xe = tile_to_xe(tile);
> > >
> > > if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
> > > - drm_warn(&xe->drm, "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
> > > + drm_warn(&xe->drm, HW_ERR "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
> > > name, severity_str, i, err);
> > > else
> > > - drm_err_ratelimited(&xe->drm, "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
> > > + drm_err_ratelimited(&xe->drm, HW_ERR "%s %s detected, ERROR_STAT_GT_VECTOR%d:0x%08x\n",
> > > name, severity_str, i, err);
> > > }
> > >
> > > @@ -255,9 +255,9 @@ static void log_soc_error(struct xe_tile *tile, const char * const *reg_info,
> > >
> > > if (strcmp(name, "Undefined")) {
> > > if (severity == DRM_XE_RAS_ERR_SEV_CORRECTABLE)
> > > - drm_warn(&xe->drm, "%s SOC %s detected", name, severity_str);
> > > + drm_warn(&xe->drm, HW_ERR "%s SOC %s detected", name, severity_str);
> > > else
> > > - drm_err_ratelimited(&xe->drm, "%s SOC %s detected", name, severity_str);
> > > + drm_err_ratelimited(&xe->drm, HW_ERR "%s SOC %s detected", name, severity_str);
> > > atomic_inc(&info[index].counter);
> > > }
> > > }
> >
^ permalink raw reply
* Re: [PATCH net] netpoll: run NAPI poll in softirq context to avoid rq->lock self-deadlock
From: Peter Zijlstra @ 2026-06-17 11:19 UTC (permalink / raw)
To: Petr Mladek
Cc: Jakub Kicinski, Sebastian Andrzej Siewior, John Ogness,
Sergey Senozhatsky, Vlad Poenaru, Thomas Gleixner, netdev,
David S . Miller, Eric Dumazet, Paolo Abeni, Simon Horman,
Breno Leitao, Clark Williams, Steven Rostedt, linux-rt-devel,
linux-kernel, stable, Frederic Weisbecker, Ingo Molnar,
Vincent Guittot, Dietmar Eggemann, K Prateek Nayak
In-Reply-To: <ajJ46o4fomfxY5CX@pathway.suse.cz>
On Wed, Jun 17, 2026 at 12:37:30PM +0200, Petr Mladek wrote:
> On Tue 2026-06-16 14:17:19, Jakub Kicinski wrote:
> > On Tue, 16 Jun 2026 19:02:57 +0200 Peter Zijlstra wrote:
> > > > So this is not an issue since commit 7eab73b18630e ("netconsole: convert
> > > > to NBCON console infrastructure"). Because from here now on writes are
> > > > deferred to the nbcon thread. So this purely about -stable in this case.
> > >
> > > Hmm, I thought netconsole had some reserved skbs and could do writes
> > > 'atomic' like? That said, it was 2.6 era the last time I looked at
> > > netconsole.
> >
> > Yes, that part is fine. The problem is that netconsole tries
> > to reap Tx completions if the Tx queue is full. We can't call
> > skb destructor in irq context so we put the completed skbs on
> > a queue and try to arm softirq to get to them later.
> > Arming softirq causes a ksoftirq wake up.
> >
> > We already skip the completion polling if we detect getting called
> > from the same networking driver. It's best effort, anyway.
> > Networking-side fix would be to toss another OR condition into
> > the skip. But we don't have one that'd work cleanly :S
>
> Alternative solution might be to offload the ksoftirq wake up
> to an irq_work. It might make this part safe for the
> console->write_atomic() call.
>
> Well, my understanding is that there are more problems.
> AFAIK, some drivers do not use an IRQ safe locking, see
> https://lore.kernel.org/all/oth5t27z6acp7qxut7u45ekyil7djirg2ny3bnsvnzeqasavxb@nhwdxahvcosh/
But anything using locking is not ->write_atomic() and should be driven
from a kthread, no?
^ permalink raw reply
* [PATCH 2/2] selftests/bpf: Add test for bpf_sock_read_xattr() kfunc
From: Christian Brauner @ 2026-06-17 11:18 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann
Cc: Alexander Viro, Jan Kara, Simon Horman, Kuniyuki Iwashima,
Willem de Bruijn, linux-fsdevel, netdev, bpf, Andrii Nakryiko,
Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
Song Liu, Yonghong Song, Jiri Olsa, Christian Brauner (Amutable)
In-Reply-To: <20260617-work-bpf-sock-xattr-v1-0-a1276f7c9da3@kernel.org>
Add a selftest that loads the kfunc in sleepable and non-sleepable
lsm/socket_connect programs and checks that a value set via fsetxattr()
on a socket is read back.
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
tools/testing/selftests/bpf/bpf_experimental.h | 3 +
.../testing/selftests/bpf/prog_tests/sock_xattr.c | 67 ++++++++++++++++++++++
.../testing/selftests/bpf/progs/sock_read_xattr.c | 54 +++++++++++++++++
3 files changed, 124 insertions(+)
diff --git a/tools/testing/selftests/bpf/bpf_experimental.h b/tools/testing/selftests/bpf/bpf_experimental.h
index 2234bd6bc9d3..5b825157b125 100644
--- a/tools/testing/selftests/bpf/bpf_experimental.h
+++ b/tools/testing/selftests/bpf/bpf_experimental.h
@@ -446,6 +446,9 @@ extern void bpf_iter_dmabuf_destroy(struct bpf_iter_dmabuf *it) __weak __ksym;
extern int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__str,
struct bpf_dynptr *value_p) __weak __ksym;
+extern int bpf_sock_read_xattr(struct socket *sock, const char *name__str,
+ struct bpf_dynptr *value_p) __weak __ksym;
+
#define PREEMPT_BITS 8
#define SOFTIRQ_BITS 8
#define HARDIRQ_BITS 4
diff --git a/tools/testing/selftests/bpf/prog_tests/sock_xattr.c b/tools/testing/selftests/bpf/prog_tests/sock_xattr.c
new file mode 100644
index 000000000000..b5816e90f01a
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/sock_xattr.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2026 Christian Brauner */
+
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/xattr.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <test_progs.h>
+
+#include "sock_read_xattr.skel.h"
+
+static const char xattr_value[] = "bpf_sock_value";
+static const char xattr_name[] = "user.bpf_test";
+
+static void test_read_sock_xattr(void)
+{
+ struct sockaddr_in addr = {};
+ struct sock_read_xattr *skel = NULL;
+ struct bpf_link *link = NULL;
+ int sock_fd = -1, err;
+
+ sock_fd = socket(AF_INET, SOCK_STREAM, 0);
+ if (!ASSERT_OK_FD(sock_fd, "socket"))
+ return;
+
+ err = fsetxattr(sock_fd, xattr_name, xattr_value, sizeof(xattr_value), 0);
+ if (!ASSERT_OK(err, "fsetxattr"))
+ goto out;
+
+ skel = sock_read_xattr__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "sock_read_xattr__open_and_load"))
+ goto out;
+
+ skel->bss->monitored_pid = sys_gettid();
+
+ /* Only attach the functional program; the verifier-only programs
+ * above are not pid-gated and would clobber the shared globals.
+ */
+ link = bpf_program__attach(skel->progs.read_sock_xattr);
+ if (!ASSERT_OK_PTR(link, "attach read_sock_xattr"))
+ goto out;
+
+ addr.sin_family = AF_INET;
+ addr.sin_port = htons(1234);
+ addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ /* Only the lsm/socket_connect hook matters; the connect may fail. */
+ connect(sock_fd, (struct sockaddr *)&addr, sizeof(addr));
+
+ ASSERT_EQ(skel->data->read_ret, sizeof(xattr_value), "read_ret");
+ ASSERT_STREQ(skel->bss->value, xattr_value, "value");
+
+out:
+ bpf_link__destroy(link);
+ if (sock_fd >= 0)
+ close(sock_fd);
+ sock_read_xattr__destroy(skel);
+}
+
+void test_sock_xattr(void)
+{
+ RUN_TESTS(sock_read_xattr);
+
+ if (test__start_subtest("read_sock_xattr"))
+ test_read_sock_xattr();
+}
diff --git a/tools/testing/selftests/bpf/progs/sock_read_xattr.c b/tools/testing/selftests/bpf/progs/sock_read_xattr.c
new file mode 100644
index 000000000000..c4a8eae8cc3c
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/sock_read_xattr.c
@@ -0,0 +1,54 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2026 Christian Brauner */
+
+#include <vmlinux.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_core_read.h>
+#include "bpf_experimental.h"
+#include "bpf_misc.h"
+
+char _license[] SEC("license") = "GPL";
+
+char value[16];
+int read_ret = -1;
+__u32 monitored_pid = 0;
+
+static __always_inline void read_xattr(struct socket *sock)
+{
+ struct bpf_dynptr value_ptr;
+
+ bpf_dynptr_from_mem(value, sizeof(value), 0, &value_ptr);
+ bpf_sock_read_xattr(sock, "user.bpf_test", &value_ptr);
+}
+
+SEC("lsm.s/socket_connect")
+__success
+int BPF_PROG(trusted_sock_ptr_sleepable, struct socket *sock)
+{
+ read_xattr(sock);
+ return 0;
+}
+
+SEC("lsm/socket_connect")
+__success
+int BPF_PROG(trusted_sock_ptr_non_sleepable, struct socket *sock)
+{
+ read_xattr(sock);
+ return 0;
+}
+
+SEC("lsm.s/socket_connect")
+__success
+int BPF_PROG(read_sock_xattr, struct socket *sock)
+{
+ struct bpf_dynptr value_ptr;
+ __u32 pid = bpf_get_current_pid_tgid() >> 32;
+
+ if (pid != monitored_pid)
+ return 0;
+
+ bpf_dynptr_from_mem(value, sizeof(value), 0, &value_ptr);
+ read_ret = bpf_sock_read_xattr(sock, "user.bpf_test", &value_ptr);
+ return 0;
+}
--
2.47.3
^ permalink raw reply related
* [PATCH 1/2] fs: Add bpf_sock_read_xattr() kfunc to read socket xattrs
From: Christian Brauner @ 2026-06-17 11:18 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann
Cc: Alexander Viro, Jan Kara, Simon Horman, Kuniyuki Iwashima,
Willem de Bruijn, linux-fsdevel, netdev, bpf, Andrii Nakryiko,
Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
Song Liu, Yonghong Song, Jiri Olsa, Christian Brauner (Amutable)
In-Reply-To: <20260617-work-bpf-sock-xattr-v1-0-a1276f7c9da3@kernel.org>
In c8db08110cbe ("Merge tag 'vfs-7.1-rc1.xattr' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs")
we added support for extended attributes for sockets. This comes in two
flavors: sockfs and non-sockfs/filesystem sockets. Filesystem sockets
are actual filesystem objects so reading xattrs must use dedicated fs
helpers such as bpf_get_dentry_xattr() and bpf_get_file_xattr(). Those
are inherently sleeping operations. Sockfs sockets on the other hand
don't need to use sleeping operations as the underlying data structure
is lockless. In addition, retrieval of sockfs extended attributes often
happens from LSM hooks that only provide struct socket and it's
completely nonsensical to grab a reference to a file, then force a
sleeping operation to retrieve the xattr and drop the reference. We know
that the sockfs file cannot go away while the LSM hook runs.
This series adds a bpf_sock_read_xattr() kfunc that, given a struct
socket, reads a user.* extended attribute from the socket's sockfs inode
into a bpf_dynptr. Together with fsetxattr() from userspace this lets a
process label a socket with a user.* xattr and have a BPF LSM program
retrieve that label locklessly. The kfunc mirrors the existing
bpf_cgroup_read_xattr(), including the restriction to the user.*
namespace.
systemd uses user.* xattrs on sockets to implement socket rate limiting
and to tag sockets for other purposes [1] such as implementing a varlink
registry. There is currently no efficient way for a BPF program to read
those labels back. The new helper allows a listening socket marked with
an extended attribute to be read back during bind/connect and then act
on the connect()ing socket. Extended attributes make it possible to
allow an unprivileged user manager such as systemd --user to mark
sockets from userspace and then rediscover them or implement policies.
The kfunc is registered KF_RCU and only for BPF LSM programs. A struct
socket is only guaranteed to live in sockfs when an LSM socket hook hands
it out, which is what keeps SOCK_INODE() valid. Sockets that embed struct
socket outside sockfs (tun, tap) are only reachable from tracing programs
and are excluded by the registration. (Btw, for consistency it would
be nice to force allocation of struct socket from sockfs instead of
simply embedding it in e.g., struct tun_file which makes the SOCKFS_I()
pattern a hazard - at least outside of sockfs functions.)
The read never sleeps and takes no lock. For sockfs the value lives in
the inode's in-memory xattr store and simple_xattr_get() resolves it
with an RCU-protected rhashtable lookup, taking neither the inode lock
nor any xattr lock. The kfunc is therefore usable from both sleepable
and non-sleepable LSM hooks.
Link: https://github.com/systemd/systemd/pull/40559 [1]
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
fs/bpf_fs_kfuncs.c | 37 +++++++++++++++++++++++++++++++++++++
include/linux/net.h | 1 +
net/socket.c | 25 +++++++++++++++++++++++++
3 files changed, 63 insertions(+)
diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c
index 11841c3d4260..85fc9519d1ff 100644
--- a/fs/bpf_fs_kfuncs.c
+++ b/fs/bpf_fs_kfuncs.c
@@ -11,6 +11,7 @@
#include <linux/file.h>
#include <linux/kernfs.h>
#include <linux/mm.h>
+#include <linux/net.h>
#include <linux/xattr.h>
__bpf_kfunc_start_defs();
@@ -359,6 +360,39 @@ __bpf_kfunc int bpf_cgroup_read_xattr(struct cgroup *cgroup, const char *name__s
}
#endif /* CONFIG_CGROUPS */
+#ifdef CONFIG_NET
+/**
+ * bpf_sock_read_xattr - read xattr of a socket's inode in sockfs
+ * @sock: socket to get xattr from
+ * @name__str: name of the xattr
+ * @value_p: output buffer of the xattr value
+ *
+ * Get xattr *name__str* of *sock* and store the output in *value_p*.
+ *
+ * For security reasons, only *name__str* with prefix "user." is allowed.
+ *
+ * Return: length of the xattr value on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_sock_read_xattr(struct socket *sock, const char *name__str,
+ struct bpf_dynptr *value_p)
+{
+ struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p;
+ u32 value_len;
+ void *value;
+
+ /* Only allow reading "user.*" xattrs */
+ if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
+ return -EPERM;
+
+ value_len = __bpf_dynptr_size(value_ptr);
+ value = __bpf_dynptr_data_rw(value_ptr, value_len);
+ if (!value)
+ return -EINVAL;
+
+ return sock_read_xattr(sock, name__str, value, value_len);
+}
+#endif /* CONFIG_NET */
+
/**
* bpf_real_inode - get the real inode backing a dentry
* @dentry: dentry to resolve
@@ -385,6 +419,9 @@ BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_real_inode, KF_SLEEPABLE | KF_RET_NULL)
+#ifdef CONFIG_NET
+BTF_ID_FLAGS(func, bpf_sock_read_xattr, KF_RCU)
+#endif
BTF_KFUNCS_END(bpf_fs_kfunc_set_ids)
static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id)
diff --git a/include/linux/net.h b/include/linux/net.h
index f268f395ce47..fdcf9956805c 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -285,6 +285,7 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags);
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname);
struct socket *sockfd_lookup(int fd, int *err);
struct socket *sock_from_file(struct file *file);
+int sock_read_xattr(struct socket *sock, const char *name, void *value, size_t size);
#define sockfd_put(sock) fput(sock->file)
int net_ratelimit(void);
diff --git a/net/socket.c b/net/socket.c
index 9e8dc769ff7a..3566f8c8ea3f 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -465,6 +465,31 @@ static const struct xattr_handler sockfs_user_xattr_handler = {
.set = sockfs_user_xattr_set,
};
+/**
+ * sock_read_xattr - read a user.* xattr from a socket's sockfs inode
+ * @sock: socket whose inode holds the xattr
+ * @name: full xattr name, e.g. "user.bpf_test"
+ * @value: output buffer
+ * @size: size of @value in bytes
+ *
+ * SOCK_INODE() is valid only for sockfs sockets; sock_from_file() rejects
+ * anything else (e.g. tun, tap).
+ * Lockless: simple_xattr_get() looks up the value under RCU, no inode lock.
+ *
+ * Return: length of the value on success, a negative errno on error.
+ */
+int sock_read_xattr(struct socket *sock, const char *name, void *value, size_t size)
+{
+ struct file *file = sock->file;
+ struct sockfs_inode *si;
+
+ if (!file || sock_from_file(file) != sock)
+ return -EOPNOTSUPP;
+
+ si = SOCKFS_I(SOCK_INODE(sock));
+ return simple_xattr_get(&sockfs_xa_cache, &si->xattrs, name, value, size);
+}
+
static const struct xattr_handler * const sockfs_xattr_handlers[] = {
&sockfs_xattr_handler,
&sockfs_security_xattr_handler,
--
2.47.3
^ permalink raw reply related
* [PATCH 0/2] Add bpf_sock_read_xattr() kfunc to read socket xattrs
From: Christian Brauner @ 2026-06-17 11:18 UTC (permalink / raw)
To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
Alexei Starovoitov, Daniel Borkmann
Cc: Alexander Viro, Jan Kara, Simon Horman, Kuniyuki Iwashima,
Willem de Bruijn, linux-fsdevel, netdev, bpf, Andrii Nakryiko,
Martin KaFai Lau, Eduard Zingerman, Kumar Kartikeya Dwivedi,
Song Liu, Yonghong Song, Jiri Olsa, Christian Brauner (Amutable)
In c8db08110cbe ("Merge tag 'vfs-7.1-rc1.xattr' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs")
we added support for extended attributes for sockets. This comes in two
flavors: sockfs and non-sockfs/filesystem sockets. Filesystem sockets
are actual filesystem objects so reading xattrs must use dedicated fs
helpers such as bpf_get_dentry_xattr() and bpf_get_file_xattr(). Those
are inherently sleeping operations. Sockfs sockets on the other hand
don't need to use sleeping operations as the underlying data structure
is lockless. In addition, retrieval of sockfs extended attributes often
happens from LSM hooks that only provide struct socket and it's
completely nonsensical to grab a reference to a file, then force a
sleeping operation to retrieve the xattr and drop the reference. We know
that the sockfs file cannot go away while the LSM hook runs.
This series adds a bpf_sock_read_xattr() kfunc that, given a struct
socket, reads a user.* extended attribute from the socket's sockfs inode
into a bpf_dynptr. Together with fsetxattr() from userspace this lets a
process label a socket with a user.* xattr and have a BPF LSM program
retrieve that label locklessly. The kfunc mirrors the existing
bpf_cgroup_read_xattr(), including the restriction to the user.*
namespace.
systemd uses user.* xattrs on sockets to implement socket rate limiting
and to tag sockets for other purposes [1] such as implementing a varlink
registry. There is currently no efficient way for a BPF program to read
those labels back. The new helper allows a listening socket marked with
an extended attribute to be read back during bind/connect and then act
on the connect()ing socket. Extended attributes make it possible to
allow an unprivileged user manager such as systemd --user to mark
sockets from userspace and then rediscover them or implement policies.
The kfunc is registered KF_RCU and only for BPF LSM programs. A struct
socket is only guaranteed to live in sockfs when an LSM socket hook hands
it out, which is what keeps SOCK_INODE() valid. Sockets that embed struct
socket outside sockfs (tun, tap) are only reachable from tracing programs
and are excluded by the registration. (Btw, for consistency it would
be nice to force allocation of struct socket from sockfs instead of
simply embedding it in e.g., struct tun_file which makes the SOCKFS_I()
pattern a hazard - at least outside of sockfs functions.)
The read never sleeps and takes no lock. For sockfs the value lives in
the inode's in-memory xattr store and simple_xattr_get() resolves it
with an RCU-protected rhashtable lookup, taking neither the inode lock
nor any xattr lock. The kfunc is therefore usable from both sleepable
and non-sleepable LSM hooks.
Link: https://github.com/systemd/systemd/pull/40559 [1]
Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
---
Christian Brauner (2):
fs: Add bpf_sock_read_xattr() kfunc to read socket xattrs
selftests/bpf: Add test for bpf_sock_read_xattr() kfunc
fs/bpf_fs_kfuncs.c | 37 ++++++++++++
include/linux/net.h | 1 +
net/socket.c | 25 ++++++++
tools/testing/selftests/bpf/bpf_experimental.h | 3 +
.../testing/selftests/bpf/prog_tests/sock_xattr.c | 67 ++++++++++++++++++++++
.../testing/selftests/bpf/progs/sock_read_xattr.c | 54 +++++++++++++++++
6 files changed, 187 insertions(+)
---
base-commit: 6b5a2b7d9bc156e505f09e698d85d6a1547c1206
change-id: 20260617-work-bpf-sock-xattr-37ec4c991886
^ permalink raw reply
* Re: [RESEND PATCH v1] net: dsa: motorcomm: add yt92xx dsa driver
From: David Yang @ 2026-06-17 11:15 UTC (permalink / raw)
To: Kyle Switch
Cc: andrew, olteanv, davem, edumazet, kuba, pabeni, horms, netdev,
linux-kernel, ming.xu, xiaolin.xu, jianmin.wang, de.ge
In-Reply-To: <88f726d5-1617-4d2e-8fbb-d3da9478b386@motor-comm.com>
On Wed, Jun 17, 2026 at 10:37 AM Kyle Switch <kyle.switch@motor-comm.com> wrote:
> >> +/* To define the from cpu tag format 8 bytes:
> >> + *
> >> + * 0 1 2 3 4 5 6 7 | 0 1 2 3 4 5 6 7
> >> + *|<----------TPID 0x9988---------->|
> >> + *|<--RESERVE-->|<-----DST PORT---->|
> >> + *|-|<---------RESERVE------------->|
> >> + *|<------------------------------->|
> >> + */
> >> +#define YT922X_TAG_FORMAT2_NAME "yt922x-8b"
> >> +#define YT922X_FORMAT2_TAG_LEN 8
> >> +#define YT922X_PKT_TYPE GENMASK(15, 14)
> >> +#define YT922X_8B_CPUTAG_PKT_FROM_CPU 0x1
> >> +#define YT922X_8B_CPUTAG_SRC_PORT GENMASK(6, 2)
> >> +#define YT922X_8B_CPUTAG_DST_PORTMASK GENMASK(8, 0)
> >> +#define YT922X_8B_CPUTAG_DST_PORTMASK_0 BIT(15)
> >> +#define YT922X_8B_CPUTAG_DST_PORTMASK_0_EN 0x1
> >> +#define YT922X_8B_CPUTAG_FORCE_DST BIT(9)
> >> +#define YT922X_8B_CPUTAG_FORCE_DST_EN 0x1
> >
> > If yt922x tag format shares no common with yt921x, make a new tag driver.
>
> Ans: thank you for your suggestion, we will consider whether to create a new driver in the new file.
I'm not an expert in this, but if yt922x tag does support cpu codes
and priority, please consider updating yt921x tagger to support it,
even if you don't use or test these features for now.
> >
> >> +static struct dsa_tag_driver *dsa_tag_driver_array[] = {
> >> + &DSA_TAG_DRIVER_NAME(yt921x_netdev_ops),
> >> + &DSA_TAG_DRIVER_NAME(yt922x_4b_netdev_ops),
> >> + &DSA_TAG_DRIVER_NAME(yt922x_8b_netdev_ops),
> >> +};
> >
> > If both are supported by the chip and 4b does nothing more than 8b
> > does, do not bother with it.
>
> Ans: 4b and 8b dsa tag may have different application scenarios. from my opinion,
> 1. 4b dsa tag can save 4 bytes of payload
> 2. 8b dsa tag carry more package info.
We do not support every tag protocol. For DSA switches,
- the conduit interface supports jumbo frames so there is room for
the DSA header, or
- you end up with MTU less than 1500 anyway.
A 4-byte reduction does not make a practical difference here. An
alternative protocol imposes 2x work on everyone else, and unnecessarily
exposes your driver to interoperability issues, as pointed out by Andrew.
As I've commented before, if there is a particular reason to add
a 4-byte protocol, leave it behind for the moment, and focus on a
minimal yt922x_dsa_switch_ops + yt922x_netdev_ops for your first
patchset without any offloading support. This way, others can easily
see your changes and move the work forward efficiently.
^ permalink raw reply
* Re: [PATCH net] netpoll: run NAPI poll in softirq context to avoid rq->lock self-deadlock
From: Peter Zijlstra @ 2026-06-17 11:15 UTC (permalink / raw)
To: Petr Mladek
Cc: Sebastian Andrzej Siewior, Jakub Kicinski, John Ogness,
Sergey Senozhatsky, Vlad Poenaru, Thomas Gleixner, netdev,
David S . Miller, Eric Dumazet, Paolo Abeni, Simon Horman,
Breno Leitao, Clark Williams, Steven Rostedt, linux-rt-devel,
linux-kernel, stable, Frederic Weisbecker, Ingo Molnar,
Vincent Guittot, Dietmar Eggemann, K Prateek Nayak
In-Reply-To: <ajJy92ES-Q8ro97A@pathway.suse.cz>
On Wed, Jun 17, 2026 at 12:12:07PM +0200, Petr Mladek wrote:
> On Tue 2026-06-16 17:31:22, Sebastian Andrzej Siewior wrote:
> > On 2026-06-16 08:11:28 [-0700], Jakub Kicinski wrote:
> > > >
> > > > Adding sched and printk folks for opinions while eyeballing
> > > > WARN_ON_DEFERRED().
> > >
> > > Thanks a lot for looking into this! To be clear - the printk_deferred /
> > > WARN_DEFERRED would be just for stable? Or there's still some
> > > sensitivity even with nbcon?
> >
> > We already have printk_deferred(). WARN_DEFERRED() would be new. I
> > *think* this is not limited netpoll/ netconsole but all console drivers
> > not using CON_NBCON if the printk (via WARN) occurs with the rq held.
> > I don't remember all the details but printk_deferred() was introduced to
> > circumvent this until printk is fixed.
>
> Just to make it clear. The problem with the legacy consoles is that
> they are called under console_lock() which is a semaphore. And it
> calls wake_up_process() in console_unlock() when there is another
> waiter on the lock.
>
> > Once we get rid of those legacy drivers and NBCON is the default we can
> > get rid of printk_deferred() :)
>
> Yup.
Can't we push all the legacy consoles into a single legacy kthread? I
mean, converting all consoles is of course awesome, but should we really
wait for that?
^ permalink raw reply
* Re: [PATCH net-next v6 7/7] net: macb: introduce ndo_xdp_xmit support
From: Koehrer Mathias (ETAS-ICA/XPC-Fe1) @ 2026-06-17 11:08 UTC (permalink / raw)
To: Paolo Valerio, netdev@vger.kernel.org
Cc: Nicolas Ferre, Claudiu Beznea, Andrew Lunn, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Lorenzo Bianconi,
Théo Lebrun, Nicolai Buchwitz
In-Reply-To: <GVXPR10MB8360DB5D2F374574CE33C5779F1B2@GVXPR10MB8360.EURPRD10.PROD.OUTLOOK.COM>
Hi,
> within the function gem_xdp_xmit, there should be a call to "xdp_return_frame" for each successful processing in
> "macb_xdp_submit_frame".
> Otherwise, this driver does not work properly with Ethernet drivers that XDP_REDIRECT to this driver and use
> page_pools. In this case, the pages from the pools are not returned to the pool.
Please ignore the complaint.
I think the existing code is fine, I found a bug on my side. Sorry for the confusion.
Best regards
Mathias
^ permalink raw reply
* Re: [PATCH 0/3] vmsplice: make vmsplice a trivial wrapper for preadv2/pwritev2
From: Christian Brauner @ 2026-06-17 11:07 UTC (permalink / raw)
To: Joanne Koong
Cc: Val Packett, Al Viro, Linus Torvalds, Askar Safin, linux-kernel,
linux-mm, linux-api, netdev, Matthew Wilcox, Jens Axboe,
Christoph Hellwig, David Howells, Andrew Morton,
David Hildenbrand, Pedro Falcato, Miklos Szeredi, patches,
linux-fsdevel, Jan Kara, Steven Rostedt, fuse-devel,
Bernd Schubert
In-Reply-To: <CAJnrk1Y9egYizkx1H9K0cqxSYuB+7vLvQbV7Tf4C5eHFqnnC-A@mail.gmail.com>
> After this patch, step b) is a straight copy which means step d)'s
> fixup doesn't modify what's in the pipe. This could be fixed up in
> libfuse to not depend on modify-after-vmsplice, but I don't think this
> helps for applications using already-released libfuse versions. I
> think this patch needs to be reverted.
Note, nothing was merged. I deliberately kept it in -next though for a long
time to see how quickly we'd see regressions.
^ permalink raw reply
* [GIT PULL] virtio,vhost,vdpa: features, fixes
From: Michael S. Tsirkin @ 2026-06-17 10:55 UTC (permalink / raw)
To: Linus Torvalds, kvm, virtualization, netdev, linux-kernel, a0yami,
ammarfaizi2, arnd, chenhuacai, chenhuacai, christfontanez,
Damir.Shaikhutdinov, david, den, enelsonmoore, eperezma, ethan,
evg28bur, filip.hejsek, francesco, graf, harald.mommer, jasowang,
jiri, johan, johannes.thumshirn, lingshan.zhu, luis.hernandez093,
lulu, mhi, michael.bommarito, mikhail.golubev-ciuchea, mkl, mst,
mvaralar, nathan, oleg, pawel.moll, physicalmtea, polina.vishneva,
q.h.hack.winter, rosenp, schalla, shuangyu, stefanha, vattunuru,
yanlonglong, yichun, yui.washidu, yuka, zhangtianci.1997
The following changes since commit e43ffb69e0438cddd72aaa30898b4dc446f664f8:
Linux 7.1-rc6 (2026-05-31 15:14:24 -0700)
are available in the Git repository at:
https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git tags/for_linus
for you to fetch changes up to 8cb2c9285e4ce9154f45fb15633ebd45dfd8d9cf:
can: virtio: Fix comment in UAPI header (2026-06-10 02:17:00 -0400)
----------------------------------------------------------------
virtio,vhost,vdpa: features, fixes
- new virtio CAN driver
- support for LoongArch architecture in fw_cfg
- support for firmware notifications in vdpa/octeon_ep
- support for VFs in virtio core
- fixes, cleanups all over the place, notably
- vhost: fix vhost_get_avail_idx for a non empty ring
fixing a significant old perf regression
- plus READ_ONCE annotations mean virtio ring is now
free of KCSAN warnings
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
----------------------------------------------------------------
Alexander Graf (1):
virtio_ring: Add READ_ONCE annotations for device-writable fields
Ammar Faizi (1):
virtio_pci: fix vq info pointer lookup via wrong index
Arnd Bergmann (1):
vduse: fix compat handling for VDUSE_IOTLB_GET_FD/VDUSE_VQ_GET_INFO
Christian Fontanez (1):
virtio: add missing kernel-doc for map and vmap members
Cindy Lu (2):
vdpa/mlx5: update mlx_features with driver state check
vdpa/mlx5: update MAC address handling in mlx5_vdpa_set_attr()
Denis V. Lunev (1):
vhost/vsock: Refuse the connection immediately when guest isn't ready
Ethan Carter Edwards (1):
virtio_console: Fix spelling mistake "colums" -> "columns"
Ethan Nelson-Moore (1):
vhost: remove unnecessary module_init/exit functions
Evgenii Burenchev (1):
vdpa/ifcvf: handle dev_set_name() failure in ifcvf_vdpa_dev_add()
Filip Hejsek (1):
virtio_console: read size from config space during device init
Huacai Chen (1):
fw_cfg: Add support for LoongArch architecture
Jason Wang (1):
VDUSE: avoid leaking information to userspace
Jia Jia (1):
virtio: rtc: tear down old virtqueues before restore
Johan Hovold (3):
virtio-mmio: fix device release warning on module unload
vdpa_sim_blk: switch to dynamic root device
vdpa_sim_net: switch to dynamic root device
Matias Ezequiel Vara Larsen (1):
can: virtio: Add virtio CAN driver
Maurice Hieronymus (2):
virtio-balloon: Destroy mutex before freeing virtio_balloon
virtio-mem: Destroy mutex before freeing virtio_mem
Michael Bommarito (1):
hwrng: virtio: clamp device-reported used.len at copy_data()
Michael S. Tsirkin (2):
vhost: fix vhost_get_avail_idx for a non empty ring
tools/virtio: fix build for kmalloc_obj API and missing stubs
Nathan Chancellor (1):
can: virtio: Fix comment in UAPI header
Oleg Nesterov (1):
vhost_task_create: kill unnecessary .exit_signal initialization
Qihang Tang (2):
vduse: hold vduse_lock across IDR lookup in open path
vhost/vdpa: validate virtqueue index in mmap and fault paths
Qing Ming (1):
vhost/net: complete zerocopy ubufs only once
Rosen Penev (1):
vdpa/mlx5: Use kvzalloc_flex() for MTT command memory
Srujana Challa (2):
vdpa/octeon_ep: Fix PF->VF mailbox data address calculation
vdpa/octeon_ep: fix IRQ-to-ring mapping in interrupt handler
Vamsi Attunuru (2):
vdpa/octeon_ep: Use 4 bytes for mailbox signature
vdpa/octeon_ep: Add vDPA device event handling for firmware notifications
Yui Washizu (1):
virtio: add num_vf callback to virtio_bus
Zhang Tianci (2):
vduse: Requeue failed read to send_list head
vduse: Fix race in vduse_dev_msg_sync and vduse_dev_read_iter
longlong yan (1):
tools/virtio: check mmap return value in vringh_test
MAINTAINERS | 9 +
drivers/char/hw_random/virtio-rng.c | 23 +-
drivers/char/virtio_console.c | 52 +-
drivers/firmware/Kconfig | 2 +-
drivers/firmware/qemu_fw_cfg.c | 2 +-
drivers/net/can/Kconfig | 12 +
drivers/net/can/Makefile | 1 +
drivers/net/can/virtio_can.c | 1022 ++++++++++++++++++++++++++++++
drivers/vdpa/ifcvf/ifcvf_main.c | 11 +-
drivers/vdpa/mlx5/core/mr.c | 7 +-
drivers/vdpa/octeon_ep/octep_vdpa.h | 22 +-
drivers/vdpa/octeon_ep/octep_vdpa_main.c | 131 +++-
drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 24 +-
drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 23 +-
drivers/vdpa/vdpa_user/iova_domain.c | 2 +-
drivers/vdpa/vdpa_user/vduse_dev.c | 197 +++++-
drivers/vhost/net.c | 15 +-
drivers/vhost/vdpa.c | 29 +-
drivers/vhost/vhost.c | 23 +-
drivers/vhost/vsock.c | 16 +
drivers/virtio/virtio.c | 9 +
drivers/virtio/virtio_balloon.c | 2 +
drivers/virtio/virtio_mem.c | 2 +
drivers/virtio/virtio_mmio.c | 26 +-
drivers/virtio/virtio_pci_common.c | 10 +-
drivers/virtio/virtio_ring.c | 77 ++-
drivers/virtio/virtio_rtc_driver.c | 28 +-
include/linux/virtio.h | 2 +
include/uapi/linux/virtio_can.h | 78 +++
include/uapi/linux/virtio_console.h | 2 +-
kernel/vhost_task.c | 1 -
tools/virtio/linux/dma-mapping.h | 2 +
tools/virtio/linux/err.h | 1 +
tools/virtio/linux/kernel.h | 6 +
tools/virtio/vringh_test.c | 5 +
35 files changed, 1690 insertions(+), 184 deletions(-)
create mode 100644 drivers/net/can/virtio_can.c
create mode 100644 include/uapi/linux/virtio_can.h
^ permalink raw reply
* Re: [PATCH net] selftests: vlan_bridge_binding: Fix flaky operational state check
From: Nikolay Aleksandrov @ 2026-06-17 10:52 UTC (permalink / raw)
To: Ido Schimmel, netdev; +Cc: davem, kuba, pabeni, edumazet, petrm, horms
In-Reply-To: <20260617104323.1069457-1-idosch@nvidia.com>
On 17/06/2026 13:43, Ido Schimmel wrote:
> check_operstate() busy waits for up to one second for the operational
> state to change to the expected state. This is not enough since carrier
> loss events can be delayed by the kernel for up to one second (see
> __linkwatch_run_queue()), leading to sporadic failures.
>
> Fix by increasing the busy wait period to two seconds.
>
> Fixes: dca12e9ab760 ("selftests: net: Add a VLAN bridge binding selftest")
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Closes: https://lore.kernel.org/netdev/20260616092733.3a31be4d@kernel.org/
> Signed-off-by: Ido Schimmel <idosch@nvidia.com>
> ---
> tools/testing/selftests/net/vlan_bridge_binding.sh | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tools/testing/selftests/net/vlan_bridge_binding.sh b/tools/testing/selftests/net/vlan_bridge_binding.sh
> index e8c02c64e03a..d04caa14202d 100755
> --- a/tools/testing/selftests/net/vlan_bridge_binding.sh
> +++ b/tools/testing/selftests/net/vlan_bridge_binding.sh
> @@ -64,7 +64,7 @@ check_operstate()
> local expect=$1; shift
> local operstate
>
> - operstate=$(busywait 1000 \
> + operstate=$(busywait 2000 \
> operstate_is "$dev" "$expect")
> check_err $? "Got operstate of $operstate, expected $expect"
> }
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
^ permalink raw reply
* Re: [PATCH net] net: llc: make empty have static storage duration
From: Simon Horman @ 2026-06-17 10:50 UTC (permalink / raw)
To: Wentao Guan; +Cc: kuba, joel.granados, netdev, linux-kernel, zhanjun, niecheng1
In-Reply-To: <20260616064053.690154-1-guanwentao@uniontech.com>
On Tue, Jun 16, 2026 at 02:40:53PM +0800, Wentao Guan wrote:
> Make empty have static storage duration (like net/sysctl_net.c does) to
> avoid a potential use-after-return and keep consistent with
> __register_sysctl_table @table 'should not be free'd after registration'.
>
> Fixes: 73dbd8cf7947 ("net: Remove ctl_table sentinel elements from several networking subsystems")
> Signed-off-by: Wentao Guan <guanwentao@uniontech.com>
Reviewed-by: Simon Horman <horms@kernel.org>
^ permalink raw reply
* Re: [PATCH v3 net] net: watchdog: fix refcount tracking races
From: Marek Szyprowski @ 2026-06-17 10:48 UTC (permalink / raw)
To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Simon Horman, netdev, eric.dumazet, syzbot+381d82bbf0253710b35d,
syzbot+3479efbc2821cb2a79f2
In-Reply-To: <20260611152737.2580480-1-edumazet@google.com>
Dear All,
On 11.06.2026 17:27, Eric Dumazet wrote:
> Blamed commit converted the untracked dev_hold()/dev_put() calls
> in the watchdog code to use the tracked dev_hold_track()/dev_put_track()
> (which were later renamed/interfaced to netdev_hold() and netdev_put()).
>
> By introducing dev->watchdog_dev_tracker to store the
> reference tracking information without adding synchronization
> between netdev_watchdog_up() and dev_watchdog(), it enabled the
> race condition where this pointer could be overwritten or freed
> concurrently, leading to the list corruption crash syzbot reported:
>
> list_del corruption, ffff888114a18c00->next is NULL
> kernel BUG at lib/list_debug.c:52 !
> Oops: invalid opcode: 0000 [#1] SMP KASAN PTI
> CPU: 1 UID: 0 PID: 91 Comm: kworker/u8:5 Not tainted syzkaller #0 PREEMPT(lazy)
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/09/2026
> Workqueue: events_unbound linkwatch_event
> RIP: 0010:__list_del_entry_valid_or_report.cold+0x22/0x2a lib/list_debug.c:52
> Call Trace:
> <TASK>
> __list_del_entry_valid include/linux/list.h:132 [inline]
> __list_del_entry include/linux/list.h:246 [inline]
> list_move_tail include/linux/list.h:341 [inline]
> ref_tracker_free+0x1a7/0x6c0 lib/ref_tracker.c:329
> netdev_tracker_free include/linux/netdevice.h:4491 [inline]
> netdev_put include/linux/netdevice.h:4508 [inline]
> netdev_put include/linux/netdevice.h:4504 [inline]
> netdev_watchdog_down net/sched/sch_generic.c:600 [inline]
> dev_deactivate_many+0x28c/0xfe0 net/sched/sch_generic.c:1363
> dev_deactivate+0x109/0x1d0 net/sched/sch_generic.c:1397
> linkwatch_do_dev net/core/link_watch.c:184 [inline]
> linkwatch_do_dev+0xd3/0x120 net/core/link_watch.c:166
> __linkwatch_run_queue+0x3a5/0x810 net/core/link_watch.c:240
> linkwatch_event+0x8f/0xc0 net/core/link_watch.c:314
> process_one_work+0xa0e/0x1980 kernel/workqueue.c:3314
> process_scheduled_works kernel/workqueue.c:3397 [inline]
> worker_thread+0x5ef/0xe50 kernel/workqueue.c:3478
> kthread+0x370/0x450 kernel/kthread.c:436
> ret_from_fork+0x69a/0xc80 arch/x86/kernel/process.c:158
> ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
>
> This patch has three coordinated parts:
>
> 1) Add dev->watchdog_lock and dev->watchdog_ref_held to serialize watchdog operations.
>
> 2) Remove netdev_watchdog_up() call from netif_carrier_on():
> This ensures netdev_watchdog_up() is only called from process/BH context
> (via linkwatch workqueue dev_activate()), allowing us to use
> spin_lock_bh() for synchronization.
>
> 3) Synchronize watchdog up and watchdog timer:
> Protect netdev_watchdog_up() with tx_global_lock and watchdog_lock.
> Only allocate a new tracker in netdev_watchdog_up() if one is
> not already present.
> In dev_watchdog(), ensure we don't release the tracker if the
> timer was rescheduled either by dev_watchdog() itself or concurrently
> by netdev_watchdog_up().
>
> Fixes: f12bf6f3f942 ("net: watchdog: add net device refcount tracker")
> Reported-by: syzbot+381d82bbf0253710b35d@syzkaller.appspotmail.com
> Closes: https://lore.kernel.org/netdev/6a26b751.c25708ab.1b19ef.0013.GAE@google.com/T/#u
> Tested-by: syzbot+3479efbc2821cb2a79f2@syzkaller.appspotmail.com
> Signed-off-by: Eric Dumazet <edumazet@google.com>
This patch landed recently in linux-next as commit 8eed5519e496 ("net: watchdog:
fix refcount tracking races"). In my tests I found that it causes the following
deadlock during system suspend/resume on QEMU's ARM64 'virt' machine:
root@target:~# time rtcwake -s10 -mmem
rtcwake: assuming RTC uses UTC ...
rtcwake: wakeup from "mem" using /dev/rtc0 at Wed Jun 17 10:46:12 2026
PM: suspend entry (s2idle)
Filesystems sync: 0.055 seconds
Freezing user space processes
Freezing user space processes completed (elapsed 0.006 seconds)
OOM killer disabled.
Freezing remaining freezable tasks
Freezing remaining freezable tasks completed (elapsed 0.003 seconds)
============================================
WARNING: possible recursive locking detected
7.1.0-rc7+ #13003 Not tainted
--------------------------------------------
rtcwake/254 is trying to acquire lock:
ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netdev_watchdog_up+0x40/0x108
but task is already holding lock:
ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netif_tx_lock+0x1c/0x34
other info that might help us debug this:
Possible unsafe locking scenario:
CPU0
----
lock(&dev->tx_global_lock);
lock(&dev->tx_global_lock);
*** DEADLOCK ***
May be due to missing lock nesting notation
6 locks held by rtcwake/254:
#0: ffff0000071ab3e8 (sb_writers#5){.+.+}-{0:0}, at: vfs_write+0x1ec/0x35c
#1: ffff00000d22c480 (&of->mutex#2){+.+.}-{4:4}, at: kernfs_fop_write_iter+0xf0/0x1c4
#2: ffff0000049162c8 (kn->active#61){.+.+}-{0:0}, at: kernfs_fop_write_iter+0x100/0x1c4
#3: ffffaa79533c03b0 (system_transition_mutex){+.+.}-{4:4}, at: pm_suspend+0x98/0x608
#4: ffff000005e3a138 (&dev->mutex){....}-{4:4}, at: device_resume+0xb4/0x254
#5: ffff000006de64e8 (&dev->tx_global_lock){+.-.}-{3:3}, at: netif_tx_lock+0x1c/0x34
stack backtrace:
CPU: 1 UID: 0 PID: 254 Comm: rtcwake Not tainted 7.1.0-rc7+ #13003 PREEMPT
Hardware name: linux,dummy-virt (DT)
Call trace:
show_stack+0x18/0x24 (C)
dump_stack_lvl+0x90/0xd0
dump_stack+0x18/0x24
print_deadlock_bug+0x260/0x350
__lock_acquire+0x11b8/0x225c
lock_acquire+0x1c4/0x3f0
_raw_spin_lock_bh+0x50/0x68
netdev_watchdog_up+0x40/0x108
netif_device_attach+0x9c/0xb0
virtnet_restore+0x100/0x21c
virtio_device_restore_priv+0x11c/0x1d0
virtio_device_restore+0x14/0x20
virtio_mmio_restore+0x34/0x40
platform_pm_resume+0x2c/0x68
dpm_run_callback+0xa0/0x240
device_resume+0x120/0x254
dpm_resume+0x1f8/0x2ec
dpm_resume_end+0x18/0x34
suspend_devices_and_enter+0x1d0/0x990
pm_suspend+0x1ec/0x608
state_store+0x8c/0x110
kobj_attr_store+0x18/0x2c
sysfs_kf_write+0x50/0x7c
kernfs_fop_write_iter+0x130/0x1c4
vfs_write+0x2b8/0x35c
ksys_write+0x6c/0x104
__arm64_sys_write+0x1c/0x28
invoke_syscall+0x54/0x110
el0_svc_common.constprop.0+0x40/0xe8
do_el0_svc+0x20/0x2c
el0_svc+0x54/0x338
el0t_64_sync_handler+0xa0/0xe4
el0t_64_sync+0x198/0x19c
Reverting $subject on top of linux-next fixes this issue.
> ---
> v3: added dev->watchdog_lock and dev->watchdog_ref_held instead of messing
> with ref tracker infra. (after sashiko feedback on v2)
> v2: fix compile error when CONFIG_NET_DEV_REFCNT_TRACKER is not set (Jakub and build bots)
>
> include/linux/netdevice.h | 4 ++++
> net/core/dev.c | 3 ++-
> net/sched/sch_generic.c | 44 +++++++++++++++++++++++++++++----------
> 3 files changed, 39 insertions(+), 12 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 0e1e581efc5ac264259b2f0fdfe41c50a6f47239..4a0e83709f29e4bcf12f479e464e6bedecc61c69 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -1980,6 +1980,8 @@ enum netdev_reg_state {
> * @qdisc_hash: qdisc hash table
> * @watchdog_timeo: Represents the timeout that is used by
> * the watchdog (see dev_watchdog())
> + * @watchdog_lock: protect watchdog_ref_held
> + * @watchdog_ref_held: True if the watchdog device ref is taken.
> * @watchdog_timer: List of timers
> *
> * @proto_down_reason: reason a netdev interface is held down
> @@ -2392,6 +2394,8 @@ struct net_device {
> /* These may be needed for future network-power-down code. */
> struct timer_list watchdog_timer;
> int watchdog_timeo;
> + spinlock_t watchdog_lock;
> + bool watchdog_ref_held;
>
> u32 proto_down_reason;
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 0c6c270d9f7d115feb824f4ebe6be122c40d745f..731e661d7be6574d5eca4a600e0a5623be4c2485 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -11217,7 +11217,8 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
>
> netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
> spin_lock_init(&dev->tx_global_lock);
> -
> + spin_lock_init(&dev->watchdog_lock);
> + dev->watchdog_ref_held = false;
> return 0;
> }
>
> diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
> index a93321db8fd75d30c61e146c290bbc139c37c913..6cdf2ccfb0937e45271f8690a0c09d48a24ce769 100644
> --- a/net/sched/sch_generic.c
> +++ b/net/sched/sch_generic.c
> @@ -568,16 +568,24 @@ static void dev_watchdog(struct timer_list *t)
> dev->netdev_ops->ndo_tx_timeout(dev, i);
> netif_unfreeze_queues(dev);
> }
> - if (!mod_timer(&dev->watchdog_timer,
> - round_jiffies(oldest_start +
> - dev->watchdog_timeo)))
> - release = false;
> + spin_lock(&dev->watchdog_lock);
> + mod_timer(&dev->watchdog_timer,
> + round_jiffies(oldest_start +
> + dev->watchdog_timeo));
> + release = false;
> + spin_unlock(&dev->watchdog_lock);
> }
> }
> spin_unlock(&dev->tx_global_lock);
>
> - if (release)
> + spin_lock(&dev->watchdog_lock);
> + if (timer_pending(&dev->watchdog_timer))
> + release = false;
> + if (release && dev->watchdog_ref_held) {
> netdev_put(dev, &dev->watchdog_dev_tracker);
> + dev->watchdog_ref_held = false;
> + }
> + spin_unlock(&dev->watchdog_lock);
> }
>
> void netdev_watchdog_up(struct net_device *dev)
> @@ -586,18 +594,34 @@ void netdev_watchdog_up(struct net_device *dev)
> return;
> if (dev->watchdog_timeo <= 0)
> dev->watchdog_timeo = 5*HZ;
> + spin_lock_bh(&dev->tx_global_lock);
> +
> + spin_lock(&dev->watchdog_lock);
> if (!mod_timer(&dev->watchdog_timer,
> - round_jiffies(jiffies + dev->watchdog_timeo)))
> - netdev_hold(dev, &dev->watchdog_dev_tracker,
> - GFP_ATOMIC);
> + round_jiffies(jiffies + dev->watchdog_timeo))) {
> + if (!dev->watchdog_ref_held) {
> + netdev_hold(dev, &dev->watchdog_dev_tracker,
> + GFP_ATOMIC);
> + dev->watchdog_ref_held = true;
> + }
> + }
> + spin_unlock(&dev->watchdog_lock);
> +
> + spin_unlock_bh(&dev->tx_global_lock);
> }
> EXPORT_SYMBOL_GPL(netdev_watchdog_up);
>
> static void netdev_watchdog_down(struct net_device *dev)
> {
> netif_tx_lock_bh(dev);
> - if (timer_delete(&dev->watchdog_timer))
> +
> + spin_lock(&dev->watchdog_lock);
> + if (timer_delete(&dev->watchdog_timer)) {
> netdev_put(dev, &dev->watchdog_dev_tracker);
> + dev->watchdog_ref_held = false;
> + }
> + spin_unlock(&dev->watchdog_lock);
> +
> netif_tx_unlock_bh(dev);
> }
>
> @@ -614,8 +638,6 @@ void netif_carrier_on(struct net_device *dev)
> return;
> atomic_inc(&dev->carrier_up_count);
> linkwatch_fire_event(dev);
> - if (netif_running(dev))
> - netdev_watchdog_up(dev);
> }
> }
> EXPORT_SYMBOL(netif_carrier_on);
Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland
^ permalink raw reply
* [PATCH net] octeontx2-af: fix CGX debugfs RVU AF PCI reference leaks
From: Ratheesh Kannoth @ 2026-06-17 10:45 UTC (permalink / raw)
To: davem, hkelam, lcherian, linux-kernel, netdev, pabeni, sgoutham
Cc: andrew+netdev, edumazet, kuba, Ratheesh Kannoth, Yuho Choi
CGX per-lmac debugfs seq readers obtained struct rvu via
pci_get_drvdata(pci_get_device(..., PCI_DEVID_OCTEONTX2_RVU_AF, ...)),
which leaks a PCI device reference on every read. Store rvu and the CGX
handle in debugfs inode private data when creating stats, mac_filter,
and fwdata files (one context per CGX), and use debugfs aux numbers for
fwdata so lmac_id matches the other CGX debugfs entries.
Fixes: f967488d095e ("octeontx2-af: Add per CGX port level NIX Rx/Tx counters")
Fixes: dbc52debf95f ("octeontx2-af: Debugfs support for DMAC filters")
Fixes: 49f02e6877d1 ("Octeontx2-af: Debugfs support for firmware data")
Cc: Linu Cherian <lcherian@marvell.com>
Reported-by: Yuho Choi <dbgh9129@gmail.com>
Signed-off-by: Ratheesh Kannoth <rkannoth@marvell.com>
---
.../marvell/octeontx2/af/rvu_debugfs.c | 77 ++++++++++---------
1 file changed, 42 insertions(+), 35 deletions(-)
diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
index fa461489acdd..77ff734438cd 100644
--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
@@ -2809,6 +2809,14 @@ static void rvu_dbg_npa_init(struct rvu *rvu)
&rvu_dbg_npa_ndc_hits_miss_fops);
}
+/* Per-lmac CGX debugfs files need both RVU and CGX handle; inode->i_private
+ * points here so seq_file ops avoid pci_get_device(PCI_DEVID_OCTEONTX2_RVU_AF).
+ */
+struct rvu_cgx_lmac_dbgfs_ctx {
+ struct rvu *rvu;
+ void *cgxd;
+};
+
#define PRINT_CGX_CUML_NIXRX_STATUS(idx, name) \
({ \
u64 cnt; \
@@ -2831,18 +2839,14 @@ static void rvu_dbg_npa_init(struct rvu *rvu)
static int cgx_print_stats(struct seq_file *s, int lmac_id)
{
+ struct rvu_cgx_lmac_dbgfs_ctx *dctx = s->private;
struct cgx_link_user_info linfo;
struct mac_ops *mac_ops;
- void *cgxd = s->private;
+ void *cgxd = dctx->cgxd;
+ struct rvu *rvu = dctx->rvu;
u64 ucast, mcast, bcast;
int stat = 0, err = 0;
u64 tx_stat, rx_stat;
- struct rvu *rvu;
-
- rvu = pci_get_drvdata(pci_get_device(PCI_VENDOR_ID_CAVIUM,
- PCI_DEVID_OCTEONTX2_RVU_AF, NULL));
- if (!rvu)
- return -ENODEV;
mac_ops = get_mac_ops(cgxd);
/* There can be no CGX devices at all */
@@ -2949,20 +2953,16 @@ RVU_DEBUG_SEQ_FOPS(cgx_stat, cgx_stat_display, NULL);
static int cgx_print_dmac_flt(struct seq_file *s, int lmac_id)
{
+ struct rvu_cgx_lmac_dbgfs_ctx *dctx = s->private;
struct pci_dev *pdev = NULL;
- void *cgxd = s->private;
+ void *cgxd = dctx->cgxd;
+ struct rvu *rvu = dctx->rvu;
char *bcast, *mcast;
u16 index, domain;
u8 dmac[ETH_ALEN];
- struct rvu *rvu;
u64 cfg, mac;
int pf;
- rvu = pci_get_drvdata(pci_get_device(PCI_VENDOR_ID_CAVIUM,
- PCI_DEVID_OCTEONTX2_RVU_AF, NULL));
- if (!rvu)
- return -ENODEV;
-
pf = cgxlmac_to_pf(rvu, cgx_get_cgxid(cgxd), lmac_id);
domain = 2;
@@ -3009,17 +3009,13 @@ RVU_DEBUG_SEQ_FOPS(cgx_dmac_flt, cgx_dmac_flt_display, NULL);
static int cgx_print_fwdata(struct seq_file *s, int lmac_id)
{
+ struct rvu_cgx_lmac_dbgfs_ctx *dctx = s->private;
struct cgx_lmac_fwdata_s *fwdata;
- void *cgxd = s->private;
+ void *cgxd = dctx->cgxd;
+ struct rvu *rvu = dctx->rvu;
struct phy_s *phy;
- struct rvu *rvu;
int cgx_id, i;
- rvu = pci_get_drvdata(pci_get_device(PCI_VENDOR_ID_CAVIUM,
- PCI_DEVID_OCTEONTX2_RVU_AF, NULL));
- if (!rvu)
- return -ENODEV;
-
if (!rvu->fwdata)
return -EAGAIN;
@@ -3126,20 +3122,31 @@ static void rvu_dbg_cgx_init(struct rvu *rvu)
rvu->rvu_dbg.cgx = debugfs_create_dir(dname,
rvu->rvu_dbg.cgx_root);
- for_each_set_bit(lmac_id, &lmac_bmap, rvu->hw->lmac_per_cgx) {
- /* lmac debugfs dir */
- sprintf(dname, "lmac%d", lmac_id);
- rvu->rvu_dbg.lmac =
- debugfs_create_dir(dname, rvu->rvu_dbg.cgx);
-
- debugfs_create_file_aux_num("stats", 0600, rvu->rvu_dbg.lmac,
- cgx, lmac_id, &rvu_dbg_cgx_stat_fops);
- debugfs_create_file_aux_num("mac_filter", 0600,
- rvu->rvu_dbg.lmac, cgx, lmac_id,
- &rvu_dbg_cgx_dmac_flt_fops);
- debugfs_create_file("fwdata", 0600,
- rvu->rvu_dbg.lmac, cgx,
- &rvu_dbg_cgx_fwdata_fops);
+ {
+ struct rvu_cgx_lmac_dbgfs_ctx *ctx;
+
+ ctx = devm_kzalloc(rvu->dev, sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ continue;
+
+ ctx->rvu = rvu;
+ ctx->cgxd = cgx;
+
+ for_each_set_bit(lmac_id, &lmac_bmap, rvu->hw->lmac_per_cgx) {
+ /* lmac debugfs dir */
+ sprintf(dname, "lmac%d", lmac_id);
+ rvu->rvu_dbg.lmac =
+ debugfs_create_dir(dname, rvu->rvu_dbg.cgx);
+
+ debugfs_create_file_aux_num("stats", 0600, rvu->rvu_dbg.lmac,
+ ctx, lmac_id, &rvu_dbg_cgx_stat_fops);
+ debugfs_create_file_aux_num("mac_filter", 0600,
+ rvu->rvu_dbg.lmac, ctx, lmac_id,
+ &rvu_dbg_cgx_dmac_flt_fops);
+ debugfs_create_file_aux_num("fwdata", 0600,
+ rvu->rvu_dbg.lmac, ctx,
+ lmac_id, &rvu_dbg_cgx_fwdata_fops);
+ }
}
}
}
--
2.43.0
^ permalink raw reply related
* [PATCH net] selftests: vlan_bridge_binding: Fix flaky operational state check
From: Ido Schimmel @ 2026-06-17 10:43 UTC (permalink / raw)
To: netdev; +Cc: davem, kuba, pabeni, edumazet, petrm, horms, razor, Ido Schimmel
check_operstate() busy waits for up to one second for the operational
state to change to the expected state. This is not enough since carrier
loss events can be delayed by the kernel for up to one second (see
__linkwatch_run_queue()), leading to sporadic failures.
Fix by increasing the busy wait period to two seconds.
Fixes: dca12e9ab760 ("selftests: net: Add a VLAN bridge binding selftest")
Reported-by: Jakub Kicinski <kuba@kernel.org>
Closes: https://lore.kernel.org/netdev/20260616092733.3a31be4d@kernel.org/
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
---
tools/testing/selftests/net/vlan_bridge_binding.sh | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/testing/selftests/net/vlan_bridge_binding.sh b/tools/testing/selftests/net/vlan_bridge_binding.sh
index e8c02c64e03a..d04caa14202d 100755
--- a/tools/testing/selftests/net/vlan_bridge_binding.sh
+++ b/tools/testing/selftests/net/vlan_bridge_binding.sh
@@ -64,7 +64,7 @@ check_operstate()
local expect=$1; shift
local operstate
- operstate=$(busywait 1000 \
+ operstate=$(busywait 2000 \
operstate_is "$dev" "$expect")
check_err $? "Got operstate of $operstate, expected $expect"
}
--
2.54.0
^ permalink raw reply related
* Re: [PATCH net 2/2] sctp: add INIT verification after cookie unpacking
From: Simon Horman @ 2026-06-17 10:40 UTC (permalink / raw)
To: lucien.xin
Cc: Simon Horman, netdev, linux-sctp, davem, kuba, edumazet, pabeni,
marcelo.leitner
In-Reply-To: <13a54c38586e0e1c7d46dfc16c3f6e1a7685001b.1781570014.git.lucien.xin@gmail.com>
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://netdev-ai.bots.linux.dev/sashiko/
---
Now I have enough verified information to produce the report.
commit 9d74027bfd0cb22a103cca034883f0fcb4e2535a
Author: Xin Long <lucien.xin@gmail.com>
sctp: add INIT verification after cookie unpacking
This commit adds sctp_verify_init() checks after cookie unpacking in
the COOKIE-ECHO processing paths when cookie_auth_enable is disabled.
It also changes the bounds comparison inside sctp_verify_init() from
chunk->chunk_end to peer_init + ntohs(peer_init->chunk_hdr.length).
> diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
> index 41958b8e59fd..21b9eb1c02e9 100644
> --- a/net/sctp/sm_make_chunk.c
> +++ b/net/sctp/sm_make_chunk.c
> @@ -2298,7 +2298,7 @@ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep,
> * VIOLATION error. We build the ERROR chunk here and let the normal
> * error handling code build and send the packet.
> */
> - if (param.v != (void *)chunk->chunk_end)
> + if (param.v != (void *)peer_init + ntohs(peer_init->chunk_hdr.length))
> return sctp_process_inv_paramlength(asoc, param.p, chunk, errp);
[High]
Can this new comparison reject INIT chunks whose final parameter has a
length that is not a multiple of 4?
sctp_walk_params() advances pos.v by SCTP_PAD4(ntohs(pos.p->length)) on
every iteration:
include/net/sctp/sctp.h:
#define _sctp_walk_params(pos, chunk, end)\
for (pos.v = (u8 *)(chunk + 1);\
...
pos.v += SCTP_PAD4(ntohs(pos.p->length)))
so when the final parameter has an unpadded length L that is not 4-byte
aligned, pos.v ends at peer_init + offset_of_last_param + SCTP_PAD4(L),
which exceeds peer_init + ntohs(peer_init->chunk_hdr.length) by 1..3.
The previous comparison used chunk->chunk_end, which inqueue.c computes
with the padding included:
net/sctp/inqueue.c:
chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length));
so the old check accepted unaligned-length final parameters; the new
check returns 0 from sctp_verify_init(), and the callers then invoke
sctp_abort_on_init_err() and emit an ABORT.
Note that this change is in sctp_verify_init() itself, so it affects
every caller, not just the new COOKIE-ECHO sites: sctp_sf_do_5_1B_init,
sctp_sf_do_5_1C_ack, sctp_sf_do_unexpected_init, sctp_sf_do_5_1D_ce
and sctp_sf_do_5_2_4_dupcook all reach this comparison.
Realistic triggers include:
- Locally-generated INITs constructed by sctp_make_init() when both
auth_enable and asconf_enable are set: with the default chunk IDs
ASCONF + ASCONF_ACK the AUTH-CHUNKS parameter has length 6 and is
typically the last parameter, leaving chunk_hdr.length unaligned.
- Peer INITs containing any final parameter with non-4-aligned
length, e.g. SCTP_PARAM_SUPPORTED_ADDRESS_TYPES with 1 or 3 types
last, AUTH-CHUNKS with N=1,2,3 entries last, or any unknown TLV.
Even in the COOKIE-ECHO context the new bound seems too strict, since
sctp_pack_cookie() copies the INIT verbatim with memcpy(cookie + 1,
init_chunk->chunk_hdr, ntohs(...->length)) followed immediately by
raw_addrs, so unpacked cookies recovered from peers that sent an INIT
with an unaligned-length final parameter would also be spuriously
rejected.
Would comparing against peer_init + SCTP_PAD4(ntohs(peer_init->
chunk_hdr.length)) preserve the long-standing semantics of chunk_end
while still validating the cookie-derived peer_init bounds?
> diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
[ ... ]
^ permalink raw reply
* Re: [PATCH net 1/2] sctp: factor out INIT verification failure handling
From: Simon Horman @ 2026-06-17 10:39 UTC (permalink / raw)
To: lucien.xin
Cc: Simon Horman, netdev, linux-sctp, davem, kuba, edumazet, pabeni,
marcelo.leitner
In-Reply-To: <6fb546c80126a410349e724045ce16a41413c8a6.1781570014.git.lucien.xin@gmail.com>
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
Full review at: https://netdev-ai.bots.linux.dev/sashiko/
---
sctp: factor out INIT verification failure handling
This refactor moves the duplicated INIT/INIT-ACK error handling out of
sctp_sf_do_5_1B_init(), sctp_sf_do_5_1C_ack(), and
sctp_sf_do_unexpected_init() into a new helper sctp_abort_on_init_err().
The commit message states "No functional change intended."
> diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
> index 9b23c11cbb9e..544f308ee527 100644
> --- a/net/sctp/sm_statefuns.c
> +++ b/net/sctp/sm_statefuns.c
[ ... ]
> @@ -548,50 +531,9 @@ enum sctp_disposition sctp_sf_do_5_1C_ack(struct net *net,
> err_chunk = NULL;
> if (!sctp_verify_init(net, ep, asoc, chunk->chunk_hdr->type,
> (struct sctp_init_chunk *)chunk->chunk_hdr, chunk,
> - &err_chunk)) {
> -
> - enum sctp_error error = SCTP_ERROR_NO_RESOURCE;
> -
> - /* This chunk contains fatal error. It is to be discarded.
> - * Send an ABORT, with causes. If there are no causes,
> - * then there wasn't enough memory. Just terminate
> - * the association.
> - */
> - if (err_chunk) {
> - packet = sctp_abort_pkt_new(net, ep, asoc, arg,
> - (__u8 *)(err_chunk->chunk_hdr) +
> - sizeof(struct sctp_chunkhdr),
> - ntohs(err_chunk->chunk_hdr->length) -
> - sizeof(struct sctp_chunkhdr));
> -
> - sctp_chunk_free(err_chunk);
> -
> - if (packet) {
> - sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
> - SCTP_PACKET(packet));
> - SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
> - error = SCTP_ERROR_INV_PARAM;
> - }
> - }
> -
> - /* SCTP-AUTH, Section 6.3:
> - * It should be noted that if the receiver wants to tear
> - * down an association in an authenticated way only, the
> - * handling of malformed packets should not result in
> - * tearing down the association.
> - *
> - * This means that if we only want to abort associations
> - * in an authenticated way (i.e AUTH+ABORT), then we
> - * can't destroy this association just because the packet
> - * was malformed.
> - */
> - if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
> - return sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
> -
> - SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
> - return sctp_stop_t1_and_abort(net, commands, error, ECONNREFUSED,
> - asoc, chunk->transport);
> - }
> + &err_chunk))
> + return sctp_abort_on_init_err(net, ep, asoc, chunk, arg,
> + commands, err_chunk);
[High]
In the original sctp_sf_do_5_1C_ack() the "if (err_chunk) { ... }"
block is non-controlling: when err_chunk is NULL the code skips the
block, falls through with error == SCTP_ERROR_NO_RESOURCE, consults
sctp_auth_recv_cid(SCTP_CID_ABORT, asoc) and either pdiscards or runs
sctp_stop_t1_and_abort() to tear the half-open association down.
[High]
In the new helper, can the INIT-ACK + NULL err_chunk case still reach
that path? Looking at sctp_abort_on_init_err():
if (!err_chunk)
return sctp_sf_tabort_8_4_8(net, ep, asoc, SCTP_ST_CHUNK(0),
arg, commands);
...
if (chunk->chunk_hdr->type != SCTP_CID_INIT_ACK) {
...
}
/* SCTP-AUTH, Section 6.3 ... */
if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
return sctp_sf_pdiscard(...);
SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
return sctp_stop_t1_and_abort(net, commands, error, ECONNREFUSED,
asoc, chunk->transport);
[High]
The !err_chunk early return runs before the INIT-ACK type check, so
when sctp_verify_init() returns 0 with *errp left as NULL on an
INIT-ACK, control now goes to sctp_sf_tabort_8_4_8() rather than the
auth-check + sctp_stop_t1_and_abort() path.
[High]
Is the NULL-err_chunk case actually reachable on an INIT-ACK? In
sctp_process_inv_mandatory() and sctp_process_inv_paramlength() in
net/sctp/sm_make_chunk.c, *errp is allocated via sctp_make_op_error_space()
or sctp_make_violation_paramlen() and the function returns 0
unconditionally:
if (!*errp)
*errp = sctp_make_op_error_space(asoc, chunk, 0);
if (*errp)
sctp_init_cause(...);
return 0;
So an allocation failure on a malformed INIT-ACK reaches the helper
with err_chunk == NULL.
[High]
On that path, does the new code skip several actions that the original
performed? sctp_sf_tabort_8_4_8() emits an OOTB ABORT and pdiscards;
it does not stop T1-init, does not call SCTP_CMD_SET_SK_ERR(ECONNREFUSED),
does not issue SCTP_CMD_INIT_FAILED, does not transition the asoc to
SCTP_STATE_CLOSED, and does not increment SCTP_MIB_ABORTEDS.
[High]
As a result, would connect(2) wait for T1-init to exhaust its
retransmissions instead of receiving ECONNREFUSED promptly, and would
SCTP_MIB_ABORTEDS no longer be incremented (with SCTP_MIB_IN_PKT_DISCARDS
being incremented in its place)?
[High]
Does this also bypass the SCTP-AUTH section 6.3 check quoted in the
comment? Previously, when AUTH-ABORT was negotiated and err_chunk
was NULL, sctp_auth_recv_cid() caused sctp_sf_pdiscard() to be returned
and no ABORT was emitted on the wire. In the new path, sctp_sf_tabort_8_4_8()
sends an unauthenticated OOTB ABORT regardless of AUTH negotiation.
[High]
Finally, does the disposition return value also change from
SCTP_DISPOSITION_ABORT to SCTP_DISPOSITION_CONSUME on this path?
[High]
Would moving the chunk->chunk_hdr->type check ahead of the !err_chunk
early return restore the original semantics for INIT-ACK while keeping
the sctp_sf_tabort_8_4_8() fallback for INIT?
[ ... ]
> @@ -6485,6 +6403,61 @@ static void sctp_send_stale_cookie_err(struct net *net,
> }
> }
>
> +static enum sctp_disposition sctp_abort_on_init_err(
> + struct net *net,
> + const struct sctp_endpoint *ep,
> + const struct sctp_association *asoc,
> + const struct sctp_chunk *chunk,
> + void *arg,
> + struct sctp_cmd_seq *commands,
> + struct sctp_chunk *err_chunk)
> +{
> + enum sctp_error error = SCTP_ERROR_NO_RESOURCE;
> + struct sctp_packet *packet;
> + struct sctp_chunkhdr *ch;
> +
> + if (!err_chunk)
> + return sctp_sf_tabort_8_4_8(net, ep, asoc, SCTP_ST_CHUNK(0),
> + arg, commands);
> +
> + ch = err_chunk->chunk_hdr;
> + packet = sctp_abort_pkt_new(net, ep, asoc, arg,
> + (__u8 *)ch + sizeof(*ch),
> + ntohs(ch->length) - sizeof(*ch));
> +
> + sctp_chunk_free(err_chunk);
> +
> + if (packet) {
> + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
> + SCTP_PACKET(packet));
> + SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
> + error = SCTP_ERROR_INV_PARAM;
> + }
> +
> + if (chunk->chunk_hdr->type != SCTP_CID_INIT_ACK) {
> + if (!packet)
> + return SCTP_DISPOSITION_NOMEM;
> + return SCTP_DISPOSITION_CONSUME;
> + }
> + /* SCTP-AUTH, Section 6.3:
> + * It should be noted that if the receiver wants to tear
> + * down an association in an authenticated way only, the
> + * handling of malformed packets should not result in
> + * tearing down the association.
> + *
> + * This means that if we only want to abort associations
> + * in an authenticated way (i.e AUTH+ABORT), then we
> + * can't destroy this association just because the packet
> + * was malformed.
> + */
> + if (sctp_auth_recv_cid(SCTP_CID_ABORT, asoc))
> + return sctp_sf_pdiscard(net, ep, asoc, SCTP_ST_CHUNK(0), arg,
> + commands);
> +
> + SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
> + return sctp_stop_t1_and_abort(net, commands, error, ECONNREFUSED,
> + asoc, chunk->transport);
> +}
^ permalink raw reply
* [PATCH net] eth: fbnic: take netif_addr_lock_bh() around rx mode address programming
From: Daniel Zahka @ 2026-06-17 10:39 UTC (permalink / raw)
To: Alexander Duyck, Jakub Kicinski, kernel-team, Andrew Lunn,
David S. Miller, Eric Dumazet, Paolo Abeni, Sanman Pradhan
Cc: netdev, linux-kernel
When __fbnic_set_rx_mode() is called from contexts other than
.ndo_set_rx_mode_async(), the uc and mc addr lists are accessed
without the addr lock that __hw_addr_sync_dev() and
__hw_addr_unsync_dev() require. Wrap these unprotected accesses with
netif_addr_lock_bh(). fbnic_clear_rx_mode() has similar issues.
Fixes: eb690ef8d1c2 ("eth: fbnic: Add L2 address programming")
Assisted-by: Claude:claude-opus-4-8
Signed-off-by: Daniel Zahka <daniel.zahka@gmail.com>
---
drivers/net/ethernet/meta/fbnic/fbnic_netdev.c | 7 ++++++-
drivers/net/ethernet/meta/fbnic/fbnic_pci.c | 4 ++++
drivers/net/ethernet/meta/fbnic/fbnic_rpc.c | 2 ++
3 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
index 4dea2bb58d2f..44c046b25312 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c
@@ -262,8 +262,11 @@ static int fbnic_set_mac(struct net_device *netdev, void *p)
eth_hw_addr_set(netdev, addr->sa_data);
- if (netif_running(netdev))
+ if (netif_running(netdev)) {
+ netif_addr_lock_bh(netdev);
__fbnic_set_rx_mode(fbn->fbd, &netdev->uc, &netdev->mc);
+ netif_addr_unlock_bh(netdev);
+ }
return 0;
}
@@ -308,8 +311,10 @@ void fbnic_clear_rx_mode(struct fbnic_dev *fbd)
/* Write updates to hardware */
fbnic_write_macda(fbd);
+ netif_addr_lock_bh(netdev);
__dev_uc_unsync(netdev, NULL);
__dev_mc_unsync(netdev, NULL);
+ netif_addr_unlock_bh(netdev);
}
static int fbnic_hwtstamp_get(struct net_device *netdev,
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
index 7e85b480203c..8b9bc9e8ea56 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_pci.c
@@ -135,7 +135,9 @@ void fbnic_up(struct fbnic_net *fbn)
fbnic_rss_reinit_hw(fbn->fbd, fbn);
+ netif_addr_lock_bh(fbn->netdev);
__fbnic_set_rx_mode(fbn->fbd, &fbn->netdev->uc, &fbn->netdev->mc);
+ netif_addr_unlock_bh(fbn->netdev);
/* Enable Tx/Rx processing */
fbnic_napi_enable(fbn);
@@ -180,7 +182,9 @@ static int fbnic_fw_config_after_crash(struct fbnic_dev *fbd)
}
fbnic_rpc_reset_valid_entries(fbd);
+ netif_addr_lock_bh(fbd->netdev);
__fbnic_set_rx_mode(fbd, &fbd->netdev->uc, &fbd->netdev->mc);
+ netif_addr_unlock_bh(fbd->netdev);
return 0;
}
diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c
index fe95b6f69646..bc0f38b6a2b2 100644
--- a/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c
+++ b/drivers/net/ethernet/meta/fbnic/fbnic_rpc.c
@@ -244,7 +244,9 @@ void fbnic_bmc_rpc_check(struct fbnic_dev *fbd)
if (fbd->fw_cap.need_bmc_tcam_reinit) {
fbnic_bmc_rpc_init(fbd);
+ netif_addr_lock_bh(fbd->netdev);
__fbnic_set_rx_mode(fbd, &fbd->netdev->uc, &fbd->netdev->mc);
+ netif_addr_unlock_bh(fbd->netdev);
fbd->fw_cap.need_bmc_tcam_reinit = false;
}
---
base-commit: 406e8a651a7b854c41fecd5117bb282b3a6c2c6b
change-id: 20260616-linux-fbnic-hwaddr-fe83ccc72c13
Best regards,
--
Daniel Zahka <daniel.zahka@gmail.com>
^ permalink raw reply related
* Re: [PATCH net] netpoll: run NAPI poll in softirq context to avoid rq->lock self-deadlock
From: Petr Mladek @ 2026-06-17 10:37 UTC (permalink / raw)
To: Jakub Kicinski
Cc: Peter Zijlstra, Sebastian Andrzej Siewior, John Ogness,
Sergey Senozhatsky, Vlad Poenaru, Thomas Gleixner, netdev,
David S . Miller, Eric Dumazet, Paolo Abeni, Simon Horman,
Breno Leitao, Clark Williams, Steven Rostedt, linux-rt-devel,
linux-kernel, stable, Frederic Weisbecker, Ingo Molnar,
Vincent Guittot, Dietmar Eggemann, K Prateek Nayak
In-Reply-To: <20260616141719.67684bf0@kernel.org>
On Tue 2026-06-16 14:17:19, Jakub Kicinski wrote:
> On Tue, 16 Jun 2026 19:02:57 +0200 Peter Zijlstra wrote:
> > > So this is not an issue since commit 7eab73b18630e ("netconsole: convert
> > > to NBCON console infrastructure"), because from then on writes are
> > > deferred to the nbcon thread. So this is purely about -stable in this case.
> >
> > Hmm, I thought netconsole had some reserved skbs and could do writes
> > 'atomic' like? That said, it was 2.6 era the last time I looked at
> > netconsole.
>
> Yes, that part is fine. The problem is that netconsole tries
> to reap Tx completions if the Tx queue is full. We can't call
> skb destructor in irq context so we put the completed skbs on
> a queue and try to arm softirq to get to them later.
> Arming softirq causes a ksoftirq wake up.
>
> We already skip the completion polling if we detect getting called
> from the same networking driver. It's best effort, anyway.
> Networking-side fix would be to toss another OR condition into
> the skip. But we don't have one that'd work cleanly :S
An alternative solution might be to offload the ksoftirq wake up
to an irq_work. It might make this part safe for the
console->write_atomic() call.
Well, my understanding is that there are more problems.
AFAIK, some drivers do not use IRQ-safe locking, see
https://lore.kernel.org/all/oth5t27z6acp7qxut7u45ekyil7djirg2ny3bnsvnzeqasavxb@nhwdxahvcosh/
Best Regards,
Petr
^ permalink raw reply
* [PATCH v3] vduse: hold vduse_lock across IDR lookup in open path
From: Qihang Tang @ 2026-05-08 9:46 UTC (permalink / raw)
To: mst
Cc: jasowang, w, eperezma, Qihang Tang, kvm, linux-kernel, netdev,
virtualization
In-Reply-To: <20260418211354.3698-1-q.h.hack.winter@gmail.com>
vduse_dev_open() looks up struct vduse_dev through the IDR and then
acquires dev->lock only after vduse_lock has been dropped.
This leaves a window where a concurrent VDUSE_DESTROY_DEV can remove the
same object from the IDR and free it before the open path locks the
device, leading to a use-after-free.
Close this race by keeping vduse_lock held until dev->lock has been
acquired and released in the open path, matching the lock ordering
already used by the destroy path.
Fixes: c8a6153b6c59 ("vduse: Introduce VDUSE - vDPA Device in Userspace")
Signed-off-by: Qihang Tang <q.h.hack.winter@gmail.com>
---
v2 -> v3:
- keep vduse_lock held until after dropping dev->lock
in vduse_dev_open()
- add changelog requested in review
v1 -> v2:
- add Fixes tag
- remove helper and inline the locking in
vduse_dev_open()
drivers/vdpa/vdpa_user/vduse_dev.c | 21 +++++++--------------
1 file changed, 7 insertions(+), 14 deletions(-)
diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 6202f6902fcd..d5c34260ed68 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -1637,26 +1637,18 @@ static int vduse_dev_release(struct inode *inode, struct file *file)
return 0;
}
-static struct vduse_dev *vduse_dev_get_from_minor(int minor)
+static int vduse_dev_open(struct inode *inode, struct file *file)
{
+ int ret = -EBUSY;
struct vduse_dev *dev;
mutex_lock(&vduse_lock);
- dev = idr_find(&vduse_idr, minor);
- mutex_unlock(&vduse_lock);
-
- return dev;
-}
-
-static int vduse_dev_open(struct inode *inode, struct file *file)
-{
- int ret;
- struct vduse_dev *dev = vduse_dev_get_from_minor(iminor(inode));
-
- if (!dev)
+ dev = idr_find(&vduse_idr, iminor(inode));
+ if (!dev) {
+ mutex_unlock(&vduse_lock);
return -ENODEV;
+ }
- ret = -EBUSY;
mutex_lock(&dev->lock);
if (dev->connected)
goto unlock;
@@ -1666,6 +1658,7 @@ static int vduse_dev_open(struct inode *inode, struct file *file)
file->private_data = dev;
unlock:
mutex_unlock(&dev->lock);
+ mutex_unlock(&vduse_lock);
return ret;
}
--
2.39.5 (Apple Git-154)
^ permalink raw reply related
* [PATCH v5] vhost/vdpa: validate virtqueue index in mmap and fault paths
From: Qihang Tang @ 2026-05-08 7:58 UTC (permalink / raw)
To: mst
Cc: jasowang, w, eperezma, Qihang Tang, kvm, linux-kernel, netdev,
virtualization
In-Reply-To: <20260508063745.90506-1-q.h.hack.winter@gmail.com>
vhost_vdpa_mmap() and vhost_vdpa_fault() use vma->vm_pgoff as a
virtqueue index for get_vq_notification(), but they do not validate
that the index is smaller than v->nvqs.
The ioctl path already performs both a bounds check and
array_index_nospec(), but the mmap/fault path only checks that the
index fits in u16. This allows an out-of-range queue index to reach
driver-specific get_vq_notification() callbacks.
Fix this by extracting a unified vhost_vdpa_get_vq_notification()
helper that validates the queue index against v->nvqs and applies
array_index_nospec() before calling the driver callback. Both the
mmap and fault paths use this helper, and the bounds checking is
consolidated into a single location.
From source inspection, the most defensible impact is out-of-bounds
access in the callback path, potentially leading to invalid PFN
remaps and crash/DoS.
Fixes: ddd89d0a059d ("vhost_vdpa: support doorbell mapping via mmap")
Acked-by: Eugenio Pérez <eperezma@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Qihang Tang <q.h.hack.winter@gmail.com>
---
drivers/vhost/vdpa.c | 29 ++++++++++++++++++++++-------
1 file changed, 22 insertions(+), 7 deletions(-)
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 692564b1bcbb..ac55275fa0d0 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -1482,16 +1482,32 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep)
}
#ifdef CONFIG_MMU
-static vm_fault_t vhost_vdpa_fault(struct vm_fault *vmf)
+static int
+vhost_vdpa_get_vq_notification(struct vhost_vdpa *v, unsigned long index,
+ struct vdpa_notification_area *notify)
{
- struct vhost_vdpa *v = vmf->vma->vm_file->private_data;
struct vdpa_device *vdpa = v->vdpa;
const struct vdpa_config_ops *ops = vdpa->config;
+
+ if (index > 65535 || index >= v->nvqs)
+ return -EINVAL;
+
+ index = array_index_nospec(index, v->nvqs);
+
+ *notify = ops->get_vq_notification(vdpa, index);
+
+ return 0;
+}
+
+static vm_fault_t vhost_vdpa_fault(struct vm_fault *vmf)
+{
+ struct vhost_vdpa *v = vmf->vma->vm_file->private_data;
struct vdpa_notification_area notify;
struct vm_area_struct *vma = vmf->vma;
- u16 index = vma->vm_pgoff;
+ unsigned long index = vma->vm_pgoff;
- notify = ops->get_vq_notification(vdpa, index);
+ if (vhost_vdpa_get_vq_notification(v, index, &notify))
+ return VM_FAULT_SIGBUS;
return vmf_insert_pfn(vma, vmf->address & PAGE_MASK, PFN_DOWN(notify.addr));
}
@@ -1514,8 +1530,6 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
if (vma->vm_flags & VM_READ)
return -EINVAL;
- if (index > 65535)
- return -EINVAL;
if (!ops->get_vq_notification)
return -ENOTSUPP;
@@ -1523,7 +1537,8 @@ static int vhost_vdpa_mmap(struct file *file, struct vm_area_struct *vma)
* support the doorbell which sits on the page boundary and
* does not share the page with other registers.
*/
- notify = ops->get_vq_notification(vdpa, index);
+ if (vhost_vdpa_get_vq_notification(v, index, &notify))
+ return -EINVAL;
if (notify.addr & (PAGE_SIZE - 1))
return -EINVAL;
if (vma->vm_end - vma->vm_start != notify.size)
--
2.39.5 (Apple Git-154)
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox