Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH] nfc: s3fwrn5: switch to GPIO descriptor API
From: deep @ 2026-04-14 10:22 UTC (permalink / raw)
  To: Krzysztof Kozlowski
  Cc: Linus Walleij, Bartosz Golaszewski, netdev, linux-nfc,
	linux-kernel, linux-gpio, Kenet Jovan Sokoli

From: Kenet Jovan Sokoli <deep@crimson.net.eu.org>

I am working on cleaning up some legacy GPIO usage in the NFC subsystem.
This patch converts the s3fwrn5 driver to use the modern descriptor based
GPIO API instead of the old integer based one.

Specifically:
- I changed the gpio_en and gpio_fw_wake types to struct gpio_desc.
- Replaced the manual DT parsing with devm_gpiod_get() in the probe
functions.
- Updated the calls in phy_common.c to use gpiod_set_value().

This also allowed me to remove the s3fwrn5_i2c_parse_dt and
s3fwrn82_uart_parse_dt functions as they are no longer needed with the
new API.

Signed-off-by: Kenet Jovan Sokoli <deep@crimson.net.eu.org>
---
I have verified that this patch builds successfully with "make M=drivers/nfc/s3fwrn5"
and passes "scripts/checkpatch.pl" with no errors or warnings.
This is very complicated stuff, but I am learning a lot by doing this cleanup work.
I really hope I can contribute as much as possible to this environment, little by little.
---
 drivers/nfc/s3fwrn5/i2c.c        | 58 ++++++--------------------------
 drivers/nfc/s3fwrn5/phy_common.c | 12 +++----
 drivers/nfc/s3fwrn5/phy_common.h |  5 +--
 drivers/nfc/s3fwrn5/uart.c       | 43 ++++++-----------------
 4 files changed, 31 insertions(+), 87 deletions(-)

diff --git a/drivers/nfc/s3fwrn5/i2c.c b/drivers/nfc/s3fwrn5/i2c.c
index 110d086cfe5b..a629fbcd3237 100644
--- a/drivers/nfc/s3fwrn5/i2c.c
+++ b/drivers/nfc/s3fwrn5/i2c.c
@@ -8,9 +8,8 @@
 
 #include <linux/clk.h>
 #include <linux/i2c.h>
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/delay.h>
-#include <linux/of_gpio.h>
 #include <linux/of_irq.h>
 #include <linux/module.h>
 
@@ -146,37 +145,6 @@ static irqreturn_t s3fwrn5_i2c_irq_thread_fn(int irq, void *phy_id)
 	return IRQ_HANDLED;
 }
 
-static int s3fwrn5_i2c_parse_dt(struct i2c_client *client)
-{
-	struct s3fwrn5_i2c_phy *phy = i2c_get_clientdata(client);
-	struct device_node *np = client->dev.of_node;
-
-	if (!np)
-		return -ENODEV;
-
-	phy->common.gpio_en = of_get_named_gpio(np, "en-gpios", 0);
-	if (!gpio_is_valid(phy->common.gpio_en)) {
-		/* Support also deprecated property */
-		phy->common.gpio_en = of_get_named_gpio(np,
-							"s3fwrn5,en-gpios",
-							0);
-		if (!gpio_is_valid(phy->common.gpio_en))
-			return -ENODEV;
-	}
-
-	phy->common.gpio_fw_wake = of_get_named_gpio(np, "wake-gpios", 0);
-	if (!gpio_is_valid(phy->common.gpio_fw_wake)) {
-		/* Support also deprecated property */
-		phy->common.gpio_fw_wake = of_get_named_gpio(np,
-							     "s3fwrn5,fw-gpios",
-							     0);
-		if (!gpio_is_valid(phy->common.gpio_fw_wake))
-			return -ENODEV;
-	}
-
-	return 0;
-}
-
 static int s3fwrn5_i2c_probe(struct i2c_client *client)
 {
 	struct s3fwrn5_i2c_phy *phy;
@@ -193,21 +161,17 @@ static int s3fwrn5_i2c_probe(struct i2c_client *client)
 	phy->i2c_dev = client;
 	i2c_set_clientdata(client, phy);
 
-	ret = s3fwrn5_i2c_parse_dt(client);
-	if (ret < 0)
-		return ret;
-
-	ret = devm_gpio_request_one(&phy->i2c_dev->dev, phy->common.gpio_en,
-				    GPIOF_OUT_INIT_HIGH, "s3fwrn5_en");
-	if (ret < 0)
-		return ret;
-
-	ret = devm_gpio_request_one(&phy->i2c_dev->dev,
-				    phy->common.gpio_fw_wake,
-				    GPIOF_OUT_INIT_LOW, "s3fwrn5_fw_wake");
-	if (ret < 0)
-		return ret;
+	phy->common.gpio_en = devm_gpiod_get(&client->dev, "en", GPIOD_OUT_HIGH);
+	if (IS_ERR(phy->common.gpio_en)) {
+		return dev_err_probe(&client->dev, PTR_ERR(phy->common.gpio_en),
+				"Failed to get EN gpio\n");
+	}
 
+	phy->common.gpio_fw_wake = devm_gpiod_get(&client->dev, "wake", GPIOD_OUT_LOW);
+	if (IS_ERR(phy->common.gpio_fw_wake)) {
+		return dev_err_probe(&client->dev, PTR_ERR(phy->common.gpio_fw_wake),
+			     "Failed to get WAKE gpio\n");
+	}
 	/*
 	 * S3FWRN5 depends on a clock input ("XI" pin) to function properly.
 	 * Depending on the hardware configuration this could be an always-on
diff --git a/drivers/nfc/s3fwrn5/phy_common.c b/drivers/nfc/s3fwrn5/phy_common.c
index deb2c039f0fd..e802b4e609c8 100644
--- a/drivers/nfc/s3fwrn5/phy_common.c
+++ b/drivers/nfc/s3fwrn5/phy_common.c
@@ -8,7 +8,7 @@
  * Bongsu Jeon <bongsu.jeon@samsung.com>
  */
 
-#include <linux/gpio.h>
+#include <linux/gpio/consumer.h>
 #include <linux/delay.h>
 #include <linux/module.h>
 
@@ -19,7 +19,7 @@ void s3fwrn5_phy_set_wake(void *phy_id, bool wake)
 	struct phy_common *phy = phy_id;
 
 	mutex_lock(&phy->mutex);
-	gpio_set_value(phy->gpio_fw_wake, wake);
+	gpiod_set_value(phy->gpio_fw_wake, wake);
 	if (wake)
 		msleep(S3FWRN5_EN_WAIT_TIME);
 	mutex_unlock(&phy->mutex);
@@ -33,14 +33,14 @@ bool s3fwrn5_phy_power_ctrl(struct phy_common *phy, enum s3fwrn5_mode mode)
 
 	phy->mode = mode;
 
-	gpio_set_value(phy->gpio_en, 1);
-	gpio_set_value(phy->gpio_fw_wake, 0);
+	gpiod_set_value(phy->gpio_en, 1);
+	gpiod_set_value(phy->gpio_fw_wake, 0);
 	if (mode == S3FWRN5_MODE_FW)
-		gpio_set_value(phy->gpio_fw_wake, 1);
+		gpiod_set_value(phy->gpio_fw_wake, 1);
 
 	if (mode != S3FWRN5_MODE_COLD) {
 		msleep(S3FWRN5_EN_WAIT_TIME);
-		gpio_set_value(phy->gpio_en, 0);
+		gpiod_set_value(phy->gpio_en, 0);
 		msleep(S3FWRN5_EN_WAIT_TIME);
 	}
 
diff --git a/drivers/nfc/s3fwrn5/phy_common.h b/drivers/nfc/s3fwrn5/phy_common.h
index 9cef25436bf9..10210a8fd755 100644
--- a/drivers/nfc/s3fwrn5/phy_common.h
+++ b/drivers/nfc/s3fwrn5/phy_common.h
@@ -13,6 +13,7 @@
 
 #include <linux/mutex.h>
 #include <net/nfc/nci_core.h>
+#include <linux/gpio/consumer.h>
 
 #include "s3fwrn5.h"
 
@@ -21,8 +22,8 @@
 struct phy_common {
 	struct nci_dev *ndev;
 
-	int gpio_en;
-	int gpio_fw_wake;
+	struct gpio_desc *gpio_en;
+	struct gpio_desc *gpio_fw_wake;
 
 	struct mutex mutex;
 
diff --git a/drivers/nfc/s3fwrn5/uart.c b/drivers/nfc/s3fwrn5/uart.c
index 4ee481bd7e96..5a7c7741a881 100644
--- a/drivers/nfc/s3fwrn5/uart.c
+++ b/drivers/nfc/s3fwrn5/uart.c
@@ -15,8 +15,7 @@
 #include <linux/netdevice.h>
 #include <linux/of.h>
 #include <linux/serdev.h>
-#include <linux/gpio.h>
-#include <linux/of_gpio.h>
+#include <linux/gpio/consumer.h>
 
 #include "phy_common.h"
 
@@ -92,25 +91,6 @@ static const struct of_device_id s3fwrn82_uart_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, s3fwrn82_uart_of_match);
 
-static int s3fwrn82_uart_parse_dt(struct serdev_device *serdev)
-{
-	struct s3fwrn82_uart_phy *phy = serdev_device_get_drvdata(serdev);
-	struct device_node *np = serdev->dev.of_node;
-
-	if (!np)
-		return -ENODEV;
-
-	phy->common.gpio_en = of_get_named_gpio(np, "en-gpios", 0);
-	if (!gpio_is_valid(phy->common.gpio_en))
-		return -ENODEV;
-
-	phy->common.gpio_fw_wake = of_get_named_gpio(np, "wake-gpios", 0);
-	if (!gpio_is_valid(phy->common.gpio_fw_wake))
-		return -ENODEV;
-
-	return 0;
-}
-
 static int s3fwrn82_uart_probe(struct serdev_device *serdev)
 {
 	struct s3fwrn82_uart_phy *phy;
@@ -144,20 +124,19 @@ static int s3fwrn82_uart_probe(struct serdev_device *serdev)
 
 	serdev_device_set_flow_control(serdev, false);
 
-	ret = s3fwrn82_uart_parse_dt(serdev);
-	if (ret < 0)
-		goto err_serdev;
-
-	ret = devm_gpio_request_one(&phy->ser_dev->dev, phy->common.gpio_en,
-				    GPIOF_OUT_INIT_HIGH, "s3fwrn82_en");
-	if (ret < 0)
+	phy->common.gpio_en = devm_gpiod_get(&serdev->dev, "en", GPIOD_OUT_HIGH);
+	if (IS_ERR(phy->common.gpio_en)) {
+		ret = dev_err_probe(&serdev->dev, PTR_ERR(phy->common.gpio_en),
+			     "failed to get en gpio\n");
 		goto err_serdev;
+	}
 
-	ret = devm_gpio_request_one(&phy->ser_dev->dev,
-				    phy->common.gpio_fw_wake,
-				    GPIOF_OUT_INIT_LOW, "s3fwrn82_fw_wake");
-	if (ret < 0)
+	phy->common.gpio_fw_wake = devm_gpiod_get(&serdev->dev, "wake", GPIOD_OUT_LOW);
+	if (IS_ERR(phy->common.gpio_fw_wake)) {
+		ret = dev_err_probe(&serdev->dev, PTR_ERR(phy->common.gpio_fw_wake),
+			     "failed to get wake gpio\n");
 		goto err_serdev;
+	}
 
 	ret = s3fwrn5_probe(&phy->common.ndev, phy, &phy->ser_dev->dev,
 			    &uart_phy_ops);
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net v2] atm: mpoa: keep mpc->dev referenced across mpoad restart
From: Paolo Abeni @ 2026-04-14 10:25 UTC (permalink / raw)
  To: Shuvam Pandey, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Simon Horman, netdev
  Cc: linux-kernel, syzbot+5ec223ccb83b24ef982f
In-Reply-To: <20260411115958.64827-1-shuvampandey1@gmail.com>



On 4/11/26 1:59 PM, Shuvam Pandey wrote:
> atm: mpoa: keep mpc->dev referenced across mpoad restart
> 
> syzbot reported a netdevice refcount warning:
> 
> refcount_t: decrement hit 0; leaking memory.
> WARNING: lib/refcount.c:31 at refcount_warn_saturate+0x70/0x110
> ...
> dev_put include/linux/netdevice.h:4466 [inline]
> mpoad_close+0x1fc/0x3e0 net/atm/mpc.c:889

The full decoded backtrace is preferred to a small excerpt.

> mpoad_close() drops the reference held in mpc->dev, but the mpoa_client
> itself stays alive and keeps the same device pointer.
> 
> When mpoad is attached again, atm_mpoa_mpoad_attach() reuses the existing
> mpoa_client and its mpc->dev without reacquiring that reference, so the
> next close can hit the netdevice refcount warning.
> 
> This reference is owned by the mpoa_client/LEC association rather than a
> single mpoad open/close cycle. It is acquired when the client gets its
> LEC device and is released later from mpoa_event_listener() on
> NETDEV_UNREGISTER. Fix the imbalance by removing the dev_put() from
> mpoad_close().
> 
> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
> Reported-by: syzbot+5ec223ccb83b24ef982f@syzkaller.appspotmail.com
> Link: https://groups.google.com/g/syzkaller-bugs/c/qhZ5MJfLBOE/m/UnotmgRdAQAJ

Preferred link is to the syzbot console:
https://syzkaller.appspot.com/bug?extid=5ec223ccb83b24ef982f

> Signed-off-by: Shuvam Pandey <shuvampandey1@gmail.com>
> ---
> Changes in v2:
> - drop the atm_mpoa_cleanup() dev_put()/NULL hunk
> - add the syzbot warning excerpt
> - add a Fixes tag
> - clarify that the final dev_put() comes from the notifier path
> 
>  net/atm/mpc.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/net/atm/mpc.c b/net/atm/mpc.c
> index ce8e9780373b9..90ab8f2889734 100644
> --- a/net/atm/mpc.c
> +++ b/net/atm/mpc.c
> @@ -886,7 +886,6 @@ static void mpoad_close(struct atm_vcc *vcc)
>  		struct lec_priv *priv = netdev_priv(mpc->dev);
>  		priv->lane2_ops->associate_indicator = NULL;
>  		stop_mpc(mpc);
> -		dev_put(mpc->dev);

Sashiko noted a possible regression introduced by this change:

Since this patch removes the dev_put(mpc->dev) here to defer the
netdevice reference release to the NETDEV_UNREGISTER event, does this
introduce a leak of the netdevice reference on module unload?
If the atm_mpoa module is unloaded while a lec device is still active,
atm_mpoa_cleanup() unregisters the netdevice notifier and frees all
mpoa_client structures without releasing their mpc->dev references:
net/atm/mpc.c:atm_mpoa_cleanup() {
    ...
	unregister_netdevice_notifier(&mpoa_notifier);
    ...
	while (mpc != NULL) {
		tmp = mpc->next;
		if (mpc->dev != NULL) {
			stop_mpc(mpc);
            ...
		}
        ...
		kfree(mpc->mps_macs);
		kfree(mpc);
		mpc = tmp;
	}
}
Because the notifier is unregistered, NETDEV_UNREGISTER will never be
delivered to clean up the references, which would permanently leak the
netdevice reference and prevent the interface from ever being unregistered.
Should a dev_put() be added in the module exit function atm_mpoa_cleanup()?

/P


^ permalink raw reply

* Re: [net,PATCH v2] net: ks8851: Reinstate disabling of BHs around IRQ handler
From: Marek Vasut @ 2026-04-14 10:26 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: Jakub Kicinski, netdev, stable, David S. Miller, Andrew Lunn,
	Eric Dumazet, Nicolai Buchwitz, Paolo Abeni, Ronald Wahl,
	Yicong Hui, linux-kernel, Thomas Gleixner
In-Reply-To: <20260414085556.SJSDwbpW@linutronix.de>

On 4/14/26 10:55 AM, Sebastian Andrzej Siewior wrote:
> On 2026-04-13 18:03:38 [+0200], To Marek Vasut wrote:
>> On 2026-04-13 17:31:34 [+0200], Marek Vasut wrote:
>>>> I don't see why it needs to disable interrupts.
>>>
>>> Because when the lock is held, the PAR code shouldn't be interrupted by an
>>> interrupt, otherwise it would completely mess up the state of the KS8851
>>> MAC. The spinlock does not protect only the IRQ handler, it protects also
>>> ks8851_start_xmit_par() and ks8851_write_mac_addr() and
>>> ks8851_read_mac_addr() and ks8851_net_open() and ks8851_net_stop() and other
>>> sites which call ks8851_lock()/ks8851_unlock() which cannot be executed
>>> concurrently, but where BHs can be enabled.
>>
>> I need to check this once my brain is at full power again. But which
>> interrupt? Your interrupt is threaded. So that should be okay.
> 
> I don't understand. There is no point in using spin_lock_irqsave() in
> ks8851_lock_par(). You don't protect against interrupts because none of
> the users actually run in an interrupt. As far as I can see, the
> interrupt is threaded and the mdio phy link checks should come from the
> workqueue.

Ha, now that the IRQ handler is indeed only threaded, I can use
spin_lock_bh(). I will send a V3 like that.

> What is wrong is that the ndo_start_xmit callback can be invoked from a
> softirq and as such you must disable BHs while acquiring a lock which can
> be accessed from both contexts. Therefore spin_lock() is not sufficient,
> it needs the _bh() and _irq() brings no additional value here.

^ permalink raw reply

* [syzbot] [lvs?] BUG: sleeping function called from invalid context in ip_vs_conn_expire
From: syzbot @ 2026-04-14 10:30 UTC (permalink / raw)
  To: coreteam, davem, edumazet, fw, horms, ja, kuba, linux-kernel,
	lvs-devel, netdev, netfilter-devel, pabeni, pablo, phil,
	syzkaller-bugs

Hello,

syzbot found the following issue on:

HEAD commit:    1c7cc4904160 Add linux-next specific files for 20260413
git tree:       linux-next
console output: https://syzkaller.appspot.com/x/log.txt?x=10327cd2580000
kernel config:  https://syzkaller.appspot.com/x/.config?x=56c2b36de3316f1b
dashboard link: https://syzkaller.appspot.com/bug?extid=504e778ddaecd36fdd17
compiler:       Debian clang version 21.1.8 (++20251221033036+2078da43e25a-1~exp1~20251221153213.50), Debian LLD 21.1.8

Unfortunately, I don't have any reproducer for this issue yet.

Downloadable assets:
disk image: https://storage.googleapis.com/syzbot-assets/91a765b703da/disk-1c7cc490.raw.xz
vmlinux: https://storage.googleapis.com/syzbot-assets/da75a3061146/vmlinux-1c7cc490.xz
kernel image: https://storage.googleapis.com/syzbot-assets/d55367ced048/bzImage-1c7cc490.xz

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+504e778ddaecd36fdd17@syzkaller.appspotmail.com

BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 16, name: ktimers/0
preempt_count: 2, expected: 0
RCU nest depth: 3, expected: 3
8 locks held by ktimers/0/16:
 #0: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163
 #1: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163
 #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:45 [inline]
 #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: timer_base_lock_expiry kernel/time/timer.c:1502 [inline]
 #2: ffff8880b8826360 (&base->expiry_lock){+...}-{3:3}, at: __run_timer_base+0x120/0x9f0 kernel/time/timer.c:2384
 #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline]
 #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline]
 #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: __rt_spin_lock kernel/locking/spinlock_rt.c:50 [inline]
 #3: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rt_spin_lock+0x1e0/0x400 kernel/locking/spinlock_rt.c:57
 #4: ffffc90000157a80 ((&cp->timer)){+...}-{0:0}, at: call_timer_fn+0xd4/0x5e0 kernel/time/timer.c:1745
 #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:300 [inline]
 #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:838 [inline]
 #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:315 [inline]
 #5: ffffffff8dfc80c0 (rcu_read_lock){....}-{1:3}, at: ip_vs_conn_expire+0x257/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260
 #6: ffffffff8de5f260 (local_bh){.+.+}-{1:3}, at: __local_bh_disable_ip+0x3c/0x420 kernel/softirq.c:163
 #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: spin_lock include/linux/spinlock_rt.h:45 [inline]
 #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:324 [inline]
 #7: ffff888068d4c3f0 (&cp->lock#2){+...}-{3:3}, at: ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260
Preemption disabled at:
[<ffffffff898a6358>] bit_spin_lock include/linux/bit_spinlock.h:38 [inline]
[<ffffffff898a6358>] hlist_bl_lock+0x18/0x110 include/linux/list_bl.h:149
CPU: 0 UID: 0 PID: 16 Comm: ktimers/0 Tainted: G        W    L      syzkaller #0 PREEMPT_{RT,(full)} 
Tainted: [W]=WARN, [L]=SOFTLOCKUP
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/18/2026
Call Trace:
 <TASK>
 dump_stack_lvl+0xe8/0x150 lib/dump_stack.c:120
 __might_resched+0x329/0x480 kernel/sched/core.c:9162
 __rt_spin_lock kernel/locking/spinlock_rt.c:48 [inline]
 rt_spin_lock+0xc2/0x400 kernel/locking/spinlock_rt.c:57
 spin_lock include/linux/spinlock_rt.h:45 [inline]
 ip_vs_conn_unlink net/netfilter/ipvs/ip_vs_conn.c:324 [inline]
 ip_vs_conn_expire+0xd4a/0x2390 net/netfilter/ipvs/ip_vs_conn.c:1260
 call_timer_fn+0x192/0x5e0 kernel/time/timer.c:1748
 expire_timers kernel/time/timer.c:1799 [inline]
 __run_timers kernel/time/timer.c:2374 [inline]
 __run_timer_base+0x6a3/0x9f0 kernel/time/timer.c:2386
 run_timer_base kernel/time/timer.c:2395 [inline]
 run_timer_softirq+0xb7/0x170 kernel/time/timer.c:2405
 handle_softirqs+0x1de/0x6d0 kernel/softirq.c:622
 __do_softirq kernel/softirq.c:656 [inline]
 run_ktimerd+0x69/0x100 kernel/softirq.c:1151
 smpboot_thread_fn+0x541/0xa50 kernel/smpboot.c:160
 kthread+0x388/0x470 kernel/kthread.c:436
 ret_from_fork+0x514/0xb70 arch/x86/kernel/process.c:158
 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245
 </TASK>


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkaller@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.

If the report is already addressed, let syzbot know by replying with:
#syz fix: exact-commit-title

If you want to overwrite report's subsystems, reply with:
#syz set subsystems: new-subsystem
(See the list of subsystem names on the web dashboard)

If the report is a duplicate of another one, reply with:
#syz dup: exact-subject-of-another-report

If you want to undo deduplication, reply with:
#syz undup

^ permalink raw reply

* [PATCH] net: mdio: MDIO_PIC64HPSC should depend on ARCH_MICROCHIP
From: Geert Uytterhoeven @ 2026-04-14 10:30 UTC (permalink / raw)
  To: Charles Perry, Conor Dooley, Jakub Kicinski, Maxime Chevallier,
	Andrew Lunn, Heiner Kallweit, Russell King, David S . Miller,
	Eric Dumazet, Paolo Abeni
  Cc: netdev, linux-kernel, Geert Uytterhoeven

The PIC64-HPSC/HX MDIO interface is only present on Microchip
PIC64-HPSC/HX SoCs.  Hence add a dependency on ARCH_MICROCHIP, to
prevent asking the user about this driver when configuring a kernel
without Microchip SoC support.

Fixes: f76aef980206e7c6 ("net: mdio: add a driver for PIC64-HPSC/HX MDIO controller")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
---
 drivers/net/mdio/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/net/mdio/Kconfig b/drivers/net/mdio/Kconfig
index 516b0d05e16ebba3..c71132f33f8472e6 100644
--- a/drivers/net/mdio/Kconfig
+++ b/drivers/net/mdio/Kconfig
@@ -147,6 +147,7 @@ config MDIO_OCTEON
 
 config MDIO_PIC64HPSC
 	tristate "PIC64-HPSC/HX MDIO interface support"
+	depends on ARCH_MICROCHIP || COMPILE_TEST
 	depends on HAS_IOMEM && OF_MDIO
 	help
 	  This driver supports the MDIO interface found on the PIC64-HPSC/HX
-- 
2.43.0


^ permalink raw reply related

* Re: [RFC] net: openvswitch: Introduce a light-weight socket map concept.
From: Minxi Hou @ 2026-04-14 10:32 UTC (permalink / raw)
  To: netdev; +Cc: Minxi Hou, ovs-dev, mhou
In-Reply-To: <20250627210054.114417-1-aconole@redhat.com>

Hi Aaron,

I tested two stages of this work on x86_64, single-node setup:

1. Early RFC (this patch, June 2025): basic socket action plumbing,
   ODP parsing, and socket_lookup configuration.

2. Current development branch (sockmap_2026_feb, built as a scratch
   kernel based on RHEL 9.8 / 5.14.0-611.el9_7), which extends this
   RFC with: rcuify list, sockmap get/del commands, action list rework,
   packet cmd fix, sockmap fixes, tuple detail improvements, and
   flow-socket association. Paired with the OVS userspace sockmap
   branch (sockmap_cmds).

Tests on the current development branch:

- ODP action round-trip parsing (valid and invalid): pass
- Socket action generation via ofproto/trace for TCP (IPv4/IPv6): pass
- Non-TCP exclusion (ICMP, UDP): pass
- socket_lookup enable/disable per port: pass
- socket_lookup with group recirculation: pass
- OpenFlow regression with socket_lookup: pass
- Conntrack regression with socket_lookup: pass
- 2-namespace TCP performance: pass

During 1000-namespace scale testing (2000 veth pairs, socket_lookup
enabled on all ports), the following WARNING burst was observed in the
kernel console log:

[20652.730148] WARNING: CPU: 118 PID: 304284 at
               net/core/skbuff.c:1000 skb_release_head_state+0x95/0xa0
(185 occurrences within 79ms, followed by BUG: scheduling while
 atomic in OVS upcall handler thread handler2052)

This is triggered by ARP table overflow under 2000 veth pairs, which
floods the OVS netlink upcall path. The underlying issue is that skbs
with netlink_skb_destructor are passed to consume_skb() without first
calling skb_orphan(). The WARNING is reproducible on a stock RHEL 9.8
kernel + stock OVS 3.7 without any of these patches, confirming it is
a pre-existing kernel issue unrelated to this work.

Tested-by: Minxi Hou <houminxi@gmail.com>

^ permalink raw reply

* [net,PATCH v3 1/2] net: ks8851: Reinstate disabling of BHs around IRQ handler
From: Marek Vasut @ 2026-04-14 10:32 UTC (permalink / raw)
  To: netdev
  Cc: Marek Vasut, stable, David S. Miller, Andrew Lunn, Eric Dumazet,
	Jakub Kicinski, Nicolai Buchwitz, Paolo Abeni, Ronald Wahl,
	Sebastian Andrzej Siewior, Yicong Hui, linux-kernel

If CONFIG_PREEMPT_RT=y is set AND the driver executes ks8851_irq() AND
KSZ_ISR register bit IRQ_RXI is set AND ks8851_rx_pkts() detects that
there are packets in the RX FIFO, then netdev_alloc_skb_ip_align() is
called to allocate SKBs. If netdev_alloc_skb_ip_align() is called with
BH enabled, local_bh_enable() at the end of netdev_alloc_skb_ip_align()
will call __local_bh_enable_ip(), which will call __do_softirq(), which
may trigger net_tx_action() softirq, which may ultimately call the xmit
callback ks8851_start_xmit_par(). The ks8851_start_xmit_par() will try
to lock struct ks8851_net_par .lock spinlock, which is already locked
by ks8851_irq() from which ks8851_start_xmit_par() was called. This
leads to a deadlock, which is reported by the kernel, including a trace
listed below.

Fix the problem by disabling BH around critical sections, including the
IRQ handler, thus preventing the net_tx_action() softirq from triggering
during these critical sections. The net_tx_action() softirq is triggered
at the end of the IRQ handler, once all the other IRQ handler actions have
been completed.

 __schedule from schedule_rtlock+0x1c/0x34
 schedule_rtlock from rtlock_slowlock_locked+0x548/0x904
 rtlock_slowlock_locked from rt_spin_lock+0x60/0x9c
 rt_spin_lock from ks8851_start_xmit_par+0x74/0x1a8
 ks8851_start_xmit_par from netdev_start_xmit+0x20/0x44
 netdev_start_xmit from dev_hard_start_xmit+0xd0/0x188
 dev_hard_start_xmit from sch_direct_xmit+0xb8/0x25c
 sch_direct_xmit from __qdisc_run+0x1f8/0x4ec
 __qdisc_run from qdisc_run+0x1c/0x28
 qdisc_run from net_tx_action+0x1f0/0x268
 net_tx_action from handle_softirqs+0x1a4/0x270
 handle_softirqs from __local_bh_enable_ip+0xcc/0xe0
 __local_bh_enable_ip from __alloc_skb+0xd8/0x128
 __alloc_skb from __netdev_alloc_skb+0x3c/0x19c
 __netdev_alloc_skb from ks8851_irq+0x388/0x4d4
 ks8851_irq from irq_thread_fn+0x24/0x64
 irq_thread_fn from irq_thread+0x178/0x28c
 irq_thread from kthread+0x12c/0x138
 kthread from ret_from_fork+0x14/0x28

Fixes: e0863634bf9f ("net: ks8851: Queue RX packets in IRQ handler instead of disabling BHs")
Cc: stable@vger.kernel.org
Signed-off-by: Marek Vasut <marex@nabladev.com>
---
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andrew Lunn <andrew+netdev@lunn.ch>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Nicolai Buchwitz <nb@tipi-net.de>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Ronald Wahl <ronald.wahl@raritan.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Yicong Hui <yiconghui@gmail.com>
Cc: linux-kernel@vger.kernel.org
Cc: netdev@vger.kernel.org
---
V2: Register dedicated IRQ handler wrapper which disables BH for the
    parallel variant of the MAC, the variant which uses spinlocks as
    the locking primitive. Use stock IRQ handler with BH unchanged
    for the SPI variant, which uses mutexes as locking primitive.
V3: Switch to spin_lock_bh(), update commit message
---
 drivers/net/ethernet/micrel/ks8851.h        |  6 +-
 drivers/net/ethernet/micrel/ks8851_common.c | 64 +++++++++------------
 drivers/net/ethernet/micrel/ks8851_par.c    | 15 ++---
 drivers/net/ethernet/micrel/ks8851_spi.c    | 11 ++--
 4 files changed, 38 insertions(+), 58 deletions(-)

diff --git a/drivers/net/ethernet/micrel/ks8851.h b/drivers/net/ethernet/micrel/ks8851.h
index 31f75b4a67fd7..b795a3a605711 100644
--- a/drivers/net/ethernet/micrel/ks8851.h
+++ b/drivers/net/ethernet/micrel/ks8851.h
@@ -408,10 +408,8 @@ struct ks8851_net {
 	struct gpio_desc	*gpio;
 	struct mii_bus		*mii_bus;
 
-	void			(*lock)(struct ks8851_net *ks,
-					unsigned long *flags);
-	void			(*unlock)(struct ks8851_net *ks,
-					  unsigned long *flags);
+	void			(*lock)(struct ks8851_net *ks);
+	void			(*unlock)(struct ks8851_net *ks);
 	unsigned int		(*rdreg16)(struct ks8851_net *ks,
 					   unsigned int reg);
 	void			(*wrreg16)(struct ks8851_net *ks,
diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c
index 8048770958d60..6c375647b24de 100644
--- a/drivers/net/ethernet/micrel/ks8851_common.c
+++ b/drivers/net/ethernet/micrel/ks8851_common.c
@@ -28,25 +28,23 @@
 /**
  * ks8851_lock - register access lock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Claim chip register access lock
  */
-static void ks8851_lock(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_lock(struct ks8851_net *ks)
 {
-	ks->lock(ks, flags);
+	ks->lock(ks);
 }
 
 /**
  * ks8851_unlock - register access unlock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Release chip register access lock
  */
-static void ks8851_unlock(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_unlock(struct ks8851_net *ks)
 {
-	ks->unlock(ks, flags);
+	ks->unlock(ks);
 }
 
 /**
@@ -129,11 +127,10 @@ static void ks8851_set_powermode(struct ks8851_net *ks, unsigned pwrmode)
 static int ks8851_write_mac_addr(struct net_device *dev)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 	u16 val;
 	int i;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	/*
 	 * Wake up chip in case it was powered off when stopped; otherwise,
@@ -149,7 +146,7 @@ static int ks8851_write_mac_addr(struct net_device *dev)
 	if (!netif_running(dev))
 		ks8851_set_powermode(ks, PMECR_PM_SOFTDOWN);
 
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	return 0;
 }
@@ -163,12 +160,11 @@ static int ks8851_write_mac_addr(struct net_device *dev)
 static void ks8851_read_mac_addr(struct net_device *dev)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 	u8 addr[ETH_ALEN];
 	u16 reg;
 	int i;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	for (i = 0; i < ETH_ALEN; i += 2) {
 		reg = ks8851_rdreg16(ks, KS_MAR(i));
@@ -177,7 +173,7 @@ static void ks8851_read_mac_addr(struct net_device *dev)
 	}
 	eth_hw_addr_set(dev, addr);
 
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 }
 
 /**
@@ -312,11 +308,10 @@ static irqreturn_t ks8851_irq(int irq, void *_ks)
 {
 	struct ks8851_net *ks = _ks;
 	struct sk_buff_head rxq;
-	unsigned long flags;
 	unsigned int status;
 	struct sk_buff *skb;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	status = ks8851_rdreg16(ks, KS_ISR);
 	ks8851_wrreg16(ks, KS_ISR, status);
@@ -373,7 +368,7 @@ static irqreturn_t ks8851_irq(int irq, void *_ks)
 		ks8851_wrreg16(ks, KS_RXCR1, rxc->rxcr1);
 	}
 
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	if (status & IRQ_LCI)
 		mii_check_link(&ks->mii);
@@ -405,7 +400,6 @@ static void ks8851_flush_tx_work(struct ks8851_net *ks)
 static int ks8851_net_open(struct net_device *dev)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 	int ret;
 
 	ret = request_threaded_irq(dev->irq, NULL, ks8851_irq,
@@ -418,7 +412,7 @@ static int ks8851_net_open(struct net_device *dev)
 
 	/* lock the card, even if we may not actually be doing anything
 	 * else at the moment */
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	netif_dbg(ks, ifup, ks->netdev, "opening\n");
 
@@ -471,7 +465,7 @@ static int ks8851_net_open(struct net_device *dev)
 
 	netif_dbg(ks, ifup, ks->netdev, "network device up\n");
 
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 	mii_check_link(&ks->mii);
 	return 0;
 }
@@ -487,23 +481,22 @@ static int ks8851_net_open(struct net_device *dev)
 static int ks8851_net_stop(struct net_device *dev)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 
 	netif_info(ks, ifdown, dev, "shutting down\n");
 
 	netif_stop_queue(dev);
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 	/* turn off the IRQs and ack any outstanding */
 	ks8851_wrreg16(ks, KS_IER, 0x0000);
 	ks8851_wrreg16(ks, KS_ISR, 0xffff);
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	/* stop any outstanding work */
 	ks8851_flush_tx_work(ks);
 	flush_work(&ks->rxctrl_work);
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 	/* shutdown RX process */
 	ks8851_wrreg16(ks, KS_RXCR1, 0x0000);
 
@@ -512,7 +505,7 @@ static int ks8851_net_stop(struct net_device *dev)
 
 	/* set powermode to soft power down to save power */
 	ks8851_set_powermode(ks, PMECR_PM_SOFTDOWN);
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	/* ensure any queued tx buffers are dumped */
 	while (!skb_queue_empty(&ks->txq)) {
@@ -566,14 +559,13 @@ static netdev_tx_t ks8851_start_xmit(struct sk_buff *skb,
 static void ks8851_rxctrl_work(struct work_struct *work)
 {
 	struct ks8851_net *ks = container_of(work, struct ks8851_net, rxctrl_work);
-	unsigned long flags;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	/* need to shutdown RXQ before modifying filter parameters */
 	ks8851_wrreg16(ks, KS_RXCR1, 0x00);
 
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 }
 
 static void ks8851_set_rx_mode(struct net_device *dev)
@@ -780,7 +772,6 @@ static int ks8851_set_eeprom(struct net_device *dev,
 {
 	struct ks8851_net *ks = netdev_priv(dev);
 	int offset = ee->offset;
-	unsigned long flags;
 	int len = ee->len;
 	u16 tmp;
 
@@ -794,7 +785,7 @@ static int ks8851_set_eeprom(struct net_device *dev,
 	if (!(ks->rc_ccr & CCR_EEPROM))
 		return -ENOENT;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	ks8851_eeprom_claim(ks);
 
@@ -817,7 +808,7 @@ static int ks8851_set_eeprom(struct net_device *dev,
 	eeprom_93cx6_wren(&ks->eeprom, false);
 
 	ks8851_eeprom_release(ks);
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	return 0;
 }
@@ -827,7 +818,6 @@ static int ks8851_get_eeprom(struct net_device *dev,
 {
 	struct ks8851_net *ks = netdev_priv(dev);
 	int offset = ee->offset;
-	unsigned long flags;
 	int len = ee->len;
 
 	/* must be 2 byte aligned */
@@ -837,7 +827,7 @@ static int ks8851_get_eeprom(struct net_device *dev,
 	if (!(ks->rc_ccr & CCR_EEPROM))
 		return -ENOENT;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 
 	ks8851_eeprom_claim(ks);
 
@@ -845,7 +835,7 @@ static int ks8851_get_eeprom(struct net_device *dev,
 
 	eeprom_93cx6_multiread(&ks->eeprom, offset/2, (__le16 *)data, len/2);
 	ks8851_eeprom_release(ks);
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	return 0;
 }
@@ -904,7 +894,6 @@ static int ks8851_phy_reg(int reg)
 static int ks8851_phy_read_common(struct net_device *dev, int phy_addr, int reg)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 	int result;
 	int ksreg;
 
@@ -912,9 +901,9 @@ static int ks8851_phy_read_common(struct net_device *dev, int phy_addr, int reg)
 	if (ksreg < 0)
 		return ksreg;
 
-	ks8851_lock(ks, &flags);
+	ks8851_lock(ks);
 	result = ks8851_rdreg16(ks, ksreg);
-	ks8851_unlock(ks, &flags);
+	ks8851_unlock(ks);
 
 	return result;
 }
@@ -949,14 +938,13 @@ static void ks8851_phy_write(struct net_device *dev,
 			     int phy, int reg, int value)
 {
 	struct ks8851_net *ks = netdev_priv(dev);
-	unsigned long flags;
 	int ksreg;
 
 	ksreg = ks8851_phy_reg(reg);
 	if (ksreg >= 0) {
-		ks8851_lock(ks, &flags);
+		ks8851_lock(ks);
 		ks8851_wrreg16(ks, ksreg, value);
-		ks8851_unlock(ks, &flags);
+		ks8851_unlock(ks);
 	}
 }
 
diff --git a/drivers/net/ethernet/micrel/ks8851_par.c b/drivers/net/ethernet/micrel/ks8851_par.c
index 78695be2570bf..9f1c33f6ddec0 100644
--- a/drivers/net/ethernet/micrel/ks8851_par.c
+++ b/drivers/net/ethernet/micrel/ks8851_par.c
@@ -55,29 +55,27 @@ struct ks8851_net_par {
 /**
  * ks8851_lock_par - register access lock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Claim chip register access lock
  */
-static void ks8851_lock_par(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_lock_par(struct ks8851_net *ks)
 {
 	struct ks8851_net_par *ksp = to_ks8851_par(ks);
 
-	spin_lock_irqsave(&ksp->lock, *flags);
+	spin_lock_bh(&ksp->lock);
 }
 
 /**
  * ks8851_unlock_par - register access unlock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Release chip register access lock
  */
-static void ks8851_unlock_par(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_unlock_par(struct ks8851_net *ks)
 {
 	struct ks8851_net_par *ksp = to_ks8851_par(ks);
 
-	spin_unlock_irqrestore(&ksp->lock, *flags);
+	spin_unlock_bh(&ksp->lock);
 }
 
 /**
@@ -233,7 +231,6 @@ static netdev_tx_t ks8851_start_xmit_par(struct sk_buff *skb,
 {
 	struct ks8851_net *ks = netdev_priv(dev);
 	netdev_tx_t ret = NETDEV_TX_OK;
-	unsigned long flags;
 	unsigned int txqcr;
 	u16 txmir;
 	int err;
@@ -241,7 +238,7 @@ static netdev_tx_t ks8851_start_xmit_par(struct sk_buff *skb,
 	netif_dbg(ks, tx_queued, ks->netdev,
 		  "%s: skb %p, %d@%p\n", __func__, skb, skb->len, skb->data);
 
-	ks8851_lock_par(ks, &flags);
+	ks8851_lock_par(ks);
 
 	txmir = ks8851_rdreg16_par(ks, KS_TXMIR) & 0x1fff;
 
@@ -262,7 +259,7 @@ static netdev_tx_t ks8851_start_xmit_par(struct sk_buff *skb,
 		ret = NETDEV_TX_BUSY;
 	}
 
-	ks8851_unlock_par(ks, &flags);
+	ks8851_unlock_par(ks);
 
 	return ret;
 }
diff --git a/drivers/net/ethernet/micrel/ks8851_spi.c b/drivers/net/ethernet/micrel/ks8851_spi.c
index a161ae45743ab..b9e68520278d0 100644
--- a/drivers/net/ethernet/micrel/ks8851_spi.c
+++ b/drivers/net/ethernet/micrel/ks8851_spi.c
@@ -71,11 +71,10 @@ struct ks8851_net_spi {
 /**
  * ks8851_lock_spi - register access lock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Claim chip register access lock
  */
-static void ks8851_lock_spi(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_lock_spi(struct ks8851_net *ks)
 {
 	struct ks8851_net_spi *kss = to_ks8851_spi(ks);
 
@@ -85,11 +84,10 @@ static void ks8851_lock_spi(struct ks8851_net *ks, unsigned long *flags)
 /**
  * ks8851_unlock_spi - register access unlock
  * @ks: The chip state
- * @flags: Spinlock flags
  *
  * Release chip register access lock
  */
-static void ks8851_unlock_spi(struct ks8851_net *ks, unsigned long *flags)
+static void ks8851_unlock_spi(struct ks8851_net *ks)
 {
 	struct ks8851_net_spi *kss = to_ks8851_spi(ks);
 
@@ -309,7 +307,6 @@ static void ks8851_tx_work(struct work_struct *work)
 	struct ks8851_net_spi *kss;
 	unsigned short tx_space;
 	struct ks8851_net *ks;
-	unsigned long flags;
 	struct sk_buff *txb;
 	bool last;
 
@@ -317,7 +314,7 @@ static void ks8851_tx_work(struct work_struct *work)
 	ks = &kss->ks8851;
 	last = skb_queue_empty(&ks->txq);
 
-	ks8851_lock_spi(ks, &flags);
+	ks8851_lock_spi(ks);
 
 	while (!last) {
 		txb = skb_dequeue(&ks->txq);
@@ -343,7 +340,7 @@ static void ks8851_tx_work(struct work_struct *work)
 	ks->tx_space = tx_space;
 	spin_unlock_bh(&ks->statelock);
 
-	ks8851_unlock_spi(ks, &flags);
+	ks8851_unlock_spi(ks);
 }
 
 /**
-- 
2.53.0


^ permalink raw reply related

* [net,PATCH v3 2/2] net: ks8851: Avoid excess softirq scheduling
From: Marek Vasut @ 2026-04-14 10:32 UTC (permalink / raw)
  To: netdev
  Cc: Marek Vasut, stable, David S. Miller, Andrew Lunn, Eric Dumazet,
	Jakub Kicinski, Nicolai Buchwitz, Paolo Abeni, Ronald Wahl,
	Sebastian Andrzej Siewior, Yicong Hui, linux-kernel
In-Reply-To: <20260414103327.113500-1-marex@nabladev.com>

The code injects packets into netif_rx() one at a time; each call adds
the packet to its internal NAPI, schedules a softirq, and processes it.
It is more efficient to queue multiple packets and process them all at
local_bh_enable() time.

Fixes: e0863634bf9f ("net: ks8851: Queue RX packets in IRQ handler instead of disabling BHs")
Cc: stable@vger.kernel.org
Signed-off-by: Marek Vasut <marex@nabladev.com>
---
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Andrew Lunn <andrew+netdev@lunn.ch>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Nicolai Buchwitz <nb@tipi-net.de>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Ronald Wahl <ronald.wahl@raritan.com>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Yicong Hui <yiconghui@gmail.com>
Cc: linux-kernel@vger.kernel.org
Cc: netdev@vger.kernel.org
---
V3: New patch
---
 drivers/net/ethernet/micrel/ks8851_common.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/micrel/ks8851_common.c b/drivers/net/ethernet/micrel/ks8851_common.c
index 6c375647b24de..4afbb40bc0e4a 100644
--- a/drivers/net/ethernet/micrel/ks8851_common.c
+++ b/drivers/net/ethernet/micrel/ks8851_common.c
@@ -373,9 +373,12 @@ static irqreturn_t ks8851_irq(int irq, void *_ks)
 	if (status & IRQ_LCI)
 		mii_check_link(&ks->mii);
 
-	if (status & IRQ_RXI)
+	if (status & IRQ_RXI) {
+		local_bh_disable();
 		while ((skb = __skb_dequeue(&rxq)))
 			netif_rx(skb);
+		local_bh_enable();
+	}
 
 	return IRQ_HANDLED;
 }
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH] nfc: s3fwrn5: switch to GPIO descriptor API
From: Krzysztof Kozlowski @ 2026-04-14 10:33 UTC (permalink / raw)
  To: deep
  Cc: Linus Walleij, Bartosz Golaszewski, netdev, linux-nfc,
	linux-kernel, linux-gpio
In-Reply-To: <20260414102257.365421-1-deep@crimson.net.eu.org>

On 14/04/2026 12:22, deep@crimson.net.eu.org wrote:
> From: Kenet Jovan Sokoli <deep@crimson.net.eu.org>
> 
> I am working on cleaning up some legacy GPIO usage in the NFC subsystem.
> This patch converts the s3fwrn5 driver to use the modern descriptor based
> GPIO API instead of the old integer based one.
> 
> Specifically:
> - I changed the gpio_en and gpio_fw_wake types to struct gpio_desc.
> - Replaced the manual DT parsing with devm_gpiod_get() in the probe
> functions.
> - Updated the calls in phy_common.c to use gpiod_set_value().
> 
> This also allowed me to remove the s3fwrn5_i2c_parse_dt and
> s3fwrn82_uart_parse_dt functions as they are no longer needed with the
> new API.
> 
> Signed-off-by: Kenet Jovan Sokoli <deep@crimson.net.eu.org>
> ---
> I have verified that this patch builds successfully with "make M=drivers/nfc/s3fwrn5"
> "scripts/checkpatch.pl" with no errors or warnings.
> This is very complicated stuff, but I am learning a lot by doing these cleanup works.
> I really hope I can contribute as much as possible to this environment, little by little.
> ---


Please do not duplicate existing work needlessly.

https://lore.kernel.org/all/?q=dfn%3Adrivers%2Fnfc%2Fs3fwrn5%2Fi2c.c

Best regards,
Krzysztof

^ permalink raw reply

* Re: [RFC] Proposal: Add sysfs interface for PCIe TPH Steering Tag retrieval and configuration
From: Leon Romanovsky @ 2026-04-14 10:35 UTC (permalink / raw)
  To: fengchengwen
  Cc: Jason Gunthorpe, Bjorn Helgaas, linux-rdma, linux-pci, netdev,
	dri-devel, Keith Busch, Yochai Cohen, Yishai Hadas, Zhiping Zhang
In-Reply-To: <84bf119e-fa8c-4c97-9197-3377b7e2b250@huawei.com>

On Tue, Apr 14, 2026 at 05:30:09PM +0800, fengchengwen wrote:
> On 4/14/2026 4:57 PM, Leon Romanovsky wrote:
> > On Tue, Apr 14, 2026 at 09:07:23AM +0800, fengchengwen wrote:
> >> On 4/14/2026 3:19 AM, Leon Romanovsky wrote:
> >>> On Mon, Apr 13, 2026 at 08:04:10PM +0800, fengchengwen wrote:
> >>>> On 4/13/2026 6:01 PM, Leon Romanovsky wrote:
> >>>>> On Fri, Apr 10, 2026 at 10:30:52PM +0800, fengchengwen wrote:
> >>>>>> Hi all,
> >>>>>>
> >>>>>> I'm writing to propose adding a sysfs interface to expose and configure the
> >>>>>> PCIe TPH
> >>>>>> Steering Tag for PCIe devices, which is retrieved inside the kernel.
> >>>>>>
> >>>>>>
> >>>>>> Background: The TPH Steering Tag is tightly coupled with both a PCIe device
> >>>>>> (identified
> >>>>>> by its BDF) and a CPU core. It can only be obtained in kernel mode. To allow
> >>>>>> user-space
> >>>>>> applications to fetch and set this value securely and conveniently, we need
> >>>>>> a standard
> >>>>>> kernel-to-user interface.
> >>>>>>
> >>>>>>
> >>>>>> Proposed Solution: Add several sysfs attributes under each PCIe device's
> >>>>>> sysfs directory:
> >>>>>> 1. /sys/bus/pci/devices/<BDF>/tph_mode to query the TPH mode (interrupt or
> >>>>>> device specific)
> >>>>>> 2. /sys/bus/pci/devices/<BDF>/tph_enable to control the TPH feature
> >>>>>> 3. /sys/bus/pci/devices/<BDF>/tph_st to support both read and write
> >>>>>> operations, e.g.:
> >>>>>>    Read operation:
> >>>>>>      echo "cpu=3" > /sys/bus/pci/devices/0000:01:00.0/tph_st
> >>>>>>      cat /sys/bus/pci/devices/0000:01:00.0/tph_st
> >>>>>>    Write operation:
> >>>>>>      echo "index=10 st=123" > /sys/bus/pci/devices/0000:01:00.0/tph_st
> >>>>>>
> >>>>>>
> >>>>>> The design strictly follows PCI subsystem sysfs standards and has the
> >>>>>> following key properties:
> >>>>>>
> >>>>>> 1. Dynamic Visibility: The sysfs attributes will only be present for PCIe
> >>>>>> devices that
> >>>>>>    support TPH Steering Tag. Devices without TPH capability will not show
> >>>>>> these nodes,
> >>>>>>    avoiding unnecessary user confusion.
> >>>>>>
> >>>>>> 2. Permission Control: The attributes will use 0600 file permissions,
> >>>>>> ensuring only
> >>>>>>    privileged root users can read or write them, which satisfies security
> >>>>>> requirements
> >>>>>>    for hardware configuration interfaces.
> >>>>>>
> >>>>>> 3. Standard Implementation Location: The interface will be implemented in
> >>>>>>    drivers/pci/pci-sysfs.c, the canonical location for all PCI device sysfs
> >>>>>> attributes,
> >>>>>>    ensuring consistency and maintainability within the PCI subsystem.
> >>>>>>
> >>>>>>
> >>>>>> Why sysfs instead of alternatives like VFIO-PCI ioctl:
> >>>>>>
> >>>>>> - Universality: sysfs does not require binding the device to a special
> >>>>>> driver such as
> >>>>>>   vfio-pci. It is available to any privileged user-space component,
> >>>>>> including system
> >>>>>>   utilities, daemons, and monitoring tools.
> >>>>>>
> >>>>>> - Simplicity: Both user-space usage (cat/echo) and kernel implementation are
> >>>>>>   straightforward, reducing code complexity and long-term maintenance cost.
> >>>>>>
> >>>>>> - Design Alignment: TPH Steering Tag is a generic PCIe device feature, not
> >>>>>> specific to
> >>>>>>   user-space drivers like DPDK or VFIO. Exposing it via sysfs matches the
> >>>>>> kernel's
> >>>>>>   standard pattern for hardware capabilities.
> >>>>>>
> >>>>>>
> >>>>>> I look forward to your comments about this design before submitting the
> >>>>>> final patch.
> >>>>>
> >>>>> You need to explain more clearly why this write functionality is useful
> >>>>> and necessary outside the VFIO/RDMA context:
> >>>>> https://lore.kernel.org/all/20260324234615.3731237-1-zhipingz@meta.com/
> >>>>>
> >>>>> AFAIK, for non-VFIO TPH callers, kernel has enough knowledge to set
> >>>>> right ST values.
> >>>>>
> >>>>> There are several comments regarding the implementation, but those can wait
> >>>>> until the rationale behind the proposal is fully clarified.
> >>>>
> >>>> Thanks for your review and comments.
> >>>>
> >>>> Let me clarify the rationale behind this user-space sysfs interface:
> >>>>
> >>>> 1. VFIO is just one of the user-space device access frameworks.
> >>>>    There are many other in-kernel frameworks that expose devices
> >>>>    to user space, such as UIO, UACCE, etc., which may also require
> >>>>    TPH Steering Tag support.
> >>>>
> >>>> 2. The kernel can automatically program Steering Tags only when
> >>>>    the device provides a standard ST table in MSI-X or config space.
> >>>>    However, many devices implement vendor-specific or platform-specific
> >>>>    Steering Tag programming methods that cannot be fully handled
> >>>>    by the generic kernel code.
> >>>>
> >>>> 3. For such devices, user-space applications or framework drivers
> >>>>    need to retrieve and configure TPH Steering Tags directly.
> >>>>    A unified sysfs interface allows all user-space frameworks
> >>>>    (not just VFIO) to use a common, standard way to manage
> >>>>    TPH Steering Tags, rather than implementing duplicated logic
> >>>>    in each subsystem.
> >>>>
> >>>> This interface provides a uniform method for any user-space
> >>>> device access solution to work with TPH, which is why I believe
> >>>> it is useful and necessary beyond the VFIO/RDMA case.
> >>>
> >>> I understand the rationale for providing a read interface, for example for
> >>> debugging, but I do not see any justification for a write interface.
> >>
> >> Thank you for the comment!
> >>
> >> As I explained, read interface is not only for debugging. It was used to
> >> such device who don't declare ST location in MSI-X or config-space, the following
> >> is Intel X710 NIC device's lspci output (only TPH part):
> >>
> >> 	Capabilities: [1a0 v1] Transaction Processing Hints
> >> 		Device specific mode supported
> >> 		No steering table available
> >>
> >> So we could not config the ST for device on kernel because it's vendor specific.
> >> But we could configure ST by it's vendor user-space driver, in this case, we
> >> should get ST from kernel to user-space.
> > 
> > Vendor-specific, in the context of the PCI specification, does not mean the
> > kernel cannot configure it. It simply means that the ST values are not
> > stored in the ST table.
> 
> Thank you for the clarification!
> 
> I agree with your interpretation of "vendor-specific" in PCI spec terms—it
> does not prevent the kernel from handling TPH in principle. However, the
> real problem is that the kernel has no standardized way to know where or
> how to program those vendor-specific ST values.

No one here is opposed to you implementing the appropriate callbacks or
extending the existing in-kernel API to support a device‑specific mode.

> 
> When a device  reports "No steering table available" and operates in
> device-specific mode, the method used to set ST values is entirely
> device-specific and not covered by the PCI specification. If the device
> is taken over to user-space by UIO framework (e.g. VFIO or IGB_UIO), the
> generic kernel cannot infer the proper programming sequence or registers
> for each vendor-specific implementation.
> 
> In these cases, the configuration must be done by the vendor’s
> user-space driver, which is aware of the device’s private programming
> model. But such a user-space driver still needs to obtain valid,
> platform-provided ST values (from ACPI _DSM), which it cannot do
> without a kernel interface.

The objection applies to this point. The PCI device exists in kernel space,
and the kernel is responsible for managing its internal state.

> 
> This is why a read-only interface to retrieve ST values is still
> needed: the kernel holds the valid platform tags, while the user-space
> driver handles the device-specific programming.
> 
> Thanks
> 
> > 
> > Thanks
> 

^ permalink raw reply

* [PATCH net v3] net/sched: taprio: fix NULL pointer dereference in class dump
From: Weiming Shi @ 2026-04-14 10:43 UTC (permalink / raw)
  To: Vinicius Costa Gomes, Jamal Hadi Salim, Jiri Pirko,
	David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, netdev, Xiang Mei, Weiming Shi

When a TAPRIO child qdisc is deleted via RTM_DELQDISC, taprio_graft()
is called with new == NULL and stores NULL into q->qdiscs[cl - 1].
Subsequent RTM_GETTCLASS dump operations walk all classes via
taprio_walk() and call taprio_dump_class(), which calls taprio_leaf()
returning the NULL pointer, then dereferences it to read child->handle,
causing a kernel NULL pointer dereference.

The bug is reachable with namespace-scoped CAP_NET_ADMIN on any kernel
with CONFIG_NET_SCH_TAPRIO enabled. On systems with unprivileged user
namespaces enabled, an unprivileged local user can trigger a kernel
panic by creating a taprio qdisc inside a new network namespace,
grafting an explicit child qdisc, deleting it, and requesting a class
dump. The RTM_GETTCLASS dump itself requires no capability.

 Oops: general protection fault, probably for non-canonical address 0xdffffc0000000007: 0000 [#1] SMP KASAN NOPTI
 KASAN: null-ptr-deref in range [0x0000000000000038-0x000000000000003f]
 RIP: 0010:taprio_dump_class (net/sched/sch_taprio.c:2475)
 Call Trace:
  <TASK>
  tc_fill_tclass (net/sched/sch_api.c:1966)
  qdisc_class_dump (net/sched/sch_api.c:2329)
  taprio_walk (net/sched/sch_taprio.c:2510)
  tc_dump_tclass_qdisc (net/sched/sch_api.c:2353)
  tc_dump_tclass_root (net/sched/sch_api.c:2370)
  tc_dump_tclass (net/sched/sch_api.c:2431)
  rtnl_dumpit (net/core/rtnetlink.c:6827)
  netlink_dump (net/netlink/af_netlink.c:2325)
  rtnetlink_rcv_msg (net/core/rtnetlink.c:6927)
  netlink_rcv_skb (net/netlink/af_netlink.c:2550)
  </TASK>

Fix this by substituting &noop_qdisc when new is NULL in
taprio_graft(), following the same pattern used by multiq_graft() and
prio_graft(). This ensures q->qdiscs[] slots are never NULL, making
control-plane dump paths safe without requiring individual NULL checks.

Since the data-plane paths (taprio_enqueue and taprio_dequeue_from_txq)
previously had explicit NULL guards that would drop/skip the packet
cleanly, update those checks to test for &noop_qdisc instead. Without
this, packets would reach taprio_enqueue_one() which increments the root
qdisc's qlen and backlog before calling the child's enqueue; noop_qdisc
drops the packet but those counters are never rolled back, permanently
inflating the root qdisc's statistics.

Fixes: 665338b2a7a0 ("net/sched: taprio: dump class stats for the actual q->qdiscs[]")
Reported-by: Xiang Mei <xmei5@asu.edu>
Signed-off-by: Weiming Shi <bestswngs@gmail.com>
---
v3: fix broken patch
v2: Also update NULL guards in taprio_enqueue() and
    taprio_dequeue_from_txq() to avoid qlen/backlog inflation (Paolo).
---
 net/sched/sch_taprio.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index f721c03514f60..07723b156c5b3 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -634,7 +634,7 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	queue = skb_get_queue_mapping(skb);

 	child = q->qdiscs[queue];
-	if (unlikely(!child))
+	if (unlikely(child == &noop_qdisc))
 		return qdisc_drop(skb, sch, to_free);

 	if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) {
@@ -717,7 +717,7 @@ static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq,
 	int len;
 	u8 tc;

-	if (unlikely(!child))
+	if (unlikely(child == &noop_qdisc))
 		return NULL;

 	if (TXTIME_ASSIST_IS_ENABLED(q->flags))
@@ -2183,6 +2183,9 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl,
 	if (!dev_queue)
 		return -EINVAL;

+	if (!new)
+		new = &noop_qdisc;
+
 	if (dev->flags & IFF_UP)
 		dev_deactivate(dev);

@@ -2196,14 +2199,14 @@ static int taprio_graft(struct Qdisc *sch, unsigned long cl,
 	*old = q->qdiscs[cl - 1];
 	if (FULL_OFFLOAD_IS_ENABLED(q->flags)) {
 		WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old);
-		if (new)
+		if (new != &noop_qdisc)
 			qdisc_refcount_inc(new);
 		if (*old)
 			qdisc_put(*old);
 	}

 	q->qdiscs[cl - 1] = new;
-	if (new)
+	if (new != &noop_qdisc)
 		new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;

 	if (dev->flags & IFF_UP)
-- 
2.43.0

^ permalink raw reply related

* Re: [net,PATCH v2] net: ks8851: Reinstate disabling of BHs around IRQ handler
From: Sebastian Andrzej Siewior @ 2026-04-14 10:48 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Marek Vasut, netdev, stable, David S. Miller, Andrew Lunn,
	Eric Dumazet, Nicolai Buchwitz, Paolo Abeni, Ronald Wahl,
	Yicong Hui, linux-kernel, Thomas Gleixner
In-Reply-To: <20260412105125.48f0c58f@kernel.org>

On 2026-04-12 10:51:25 [-0700], Jakub Kicinski wrote:
> > 
> >    rt_spin_lock from ks8851_start_xmit_par+0x68/0x1a0
> >    ks8851_start_xmit_par from netdev_start_xmit+0x1c/0x40 <---- this 
> > tries to grab the same PAR spinlock, and deadlocks
> >    netdev_start_xmit from dev_hard_start_xmit+0xec/0x1b0
> >    dev_hard_start_xmit from sch_direct_xmit+0xb8/0x25c
> >    sch_direct_xmit from __qdisc_run+0x20c/0x4fc
> >    __qdisc_run from qdisc_run+0x1c/0x28
> >    qdisc_run from net_tx_action+0x1f4/0x244
> >    net_tx_action from handle_softirqs+0x1c0/0x29c
> >    handle_softirqs from __local_bh_enable_ip+0xdc/0xf4
> >    __local_bh_enable_ip from __netdev_alloc_skb+0x140/0x194
> >    __netdev_alloc_skb from ks8851_irq+0x348/0x4d8 <---- this is called 
> > from ks8851_rx_pkts() via netdev_alloc_skb_ip_align()
> >    ks8851_irq from irq_thread_fn+0x24/0x64 <-------- this here runs with 
> > the PAR spinlock held
> > 
> > > The patch looks way to "advanced" for a driver. Something is going
> > > very wrong here. Or the commit message must be updated to explain
> > > it better to people like me. Or both.  
> > 
> > Does the backtrace make the problem clearer, with the annotation above ?
> 
> Sebastian, do you have any recommendation here? tl;dr is that the driver does
> 
> 	spin_lock_irqsave()
> 	__netdev_alloc_skb()
> 	spin_unlock_irqrestore()

So that is what happens in the backtrace. But not as of v7.0 if I look
at ks8851_irq():

|         if (status & IRQ_TXI) {
|                 unsigned short tx_space = ks8851_rdreg16(ks, KS_TXMIR);
|
|                 netif_dbg(ks, intr, ks->netdev,
|                           "%s: txspace %d\n", __func__, tx_space);
|
|                 spin_lock_bh(&ks->statelock);
disables bh

|                 ks->tx_space = tx_space;
|                 if (netif_queue_stopped(ks->netdev))
|                         netif_wake_queue(ks->netdev);
wakes queue, raise softirq, net-tx which does the qdisc_run() as seen in
the backtrace

|                 spin_unlock_bh(&ks->statelock);
enables bh and runs it
|         }

So this I understand and it would lead to a similar backtrace.
However this shouldn't occur from __netdev_alloc_skb(). 

> And __netdev_alloc_skb() does:
> 
> 	if (in_hardirq() || irqs_disabled()) {
> 		nc = this_cpu_ptr(&netdev_alloc_cache);
> 		data = page_frag_alloc(nc, len, gfp_mask);
> 		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
> 	} else {
> 		local_bh_disable();
> 		local_lock_nested_bh(&napi_alloc_cache.bh_lock);
> 
> 		nc = this_cpu_ptr(&napi_alloc_cache.page);
> 		data = page_frag_alloc(nc, len, gfp_mask);
> 		pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
> 
> 		local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
> 		local_bh_enable();
> 	}
> 
> the local_bh_enable() seems to kick in BH processing inline,
> and BH processing takes the same spin lock the driver is already
> holding.

Yes, it does. But there is nothing between local_bh_disable() and
local_bh_enable() that raises the softirq. Looking at v6.9 there is the
following instead:

|                 spin_lock(&ks->statelock);
|                 ks->tx_space = tx_space;
|                 if (netif_queue_stopped(ks->netdev))
|                         netif_wake_queue(ks->netdev);
|                 spin_unlock(&ks->statelock);

So no _bh() here. So here netif_wake_queue() woke ksoftirqd to
handle it. _Later_ there is this alloc_skb which does
local_bh_disable()/ enable() and the latter will look at pending
softirqs. They are still set from before because ksoftirqd had no chance
to process them. And now you see the deadlock from within
__netdev_alloc_skb().

I *think* lockdep will yell here on RT.
Looking at current kernel from !RT perspective, this isn't good either.
We have:

| ks8851_irq
| {
|    ks8851_lock()
|       -> spin_lock_irqsave()
irqs are off

|    if (status & IRQ_TXI) {
|       spin_lock_bh(&ks->statelock);
|       if (netif_queue_stopped(ks->netdev))
|          netif_wake_queue(ks->netdev);
raise softirq
|       spin_unlock_bh(&ks->statelock);
bh enable with disabled interrupts. And __local_bh_enable_ip() has this
gem:

|void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
| {
|         WARN_ON_ONCE(in_hardirq());
|         lockdep_assert_irqs_enabled();
| #ifdef CONFIG_TRACE_IRQFLAGS
|         local_irq_disable();
| #endif

so lockdep will yell if interrupts are disabled. And handle_softirqs()
will enable interrupts before handling softirqs and restore them later
on. But CONFIG_TRACE_IRQFLAGS will keep them enabled. Since the lock is
not acquired in hardirq, it has no other deadlock problem.

What I don't understand is why this is limited to PREEMPT_RT. !RT is
also affected by this:
- ks8851_irq() acquires the lock, disables interrupts
- netif_wake_queue() raises the softirq
- spin_unlock_bh(&ks->statelock) enables BH and handles softirqs, and
  goes to ks8851_start_xmit()

This is only possible in newer kernels due to 0913ec336a6c0 ("net:
ks8851: Fix deadlock with the SPI chip variant") because of the
irqs_disabled() check in skb allocation.

So. Using _bh instead of _irq remains my recommendation. Lockdep should
already yell on !RT here. 

Sebastian

^ permalink raw reply

* [PATCH v2] netfilter: nfnetlink_osf: fix null-ptr-deref in nf_osf_ttl
From: Kito Xu (veritas501) @ 2026-04-14 10:49 UTC (permalink / raw)
  To: pablo
  Cc: coreteam, davem, edumazet, ffmancera, fw, horms, hxzene, kuba,
	linux-kernel, netdev, netfilter-devel, pabeni, phil
In-Reply-To: <20260414074556.2512750-1-hxzene@gmail.com>

nf_osf_ttl() calls __in_dev_get_rcu(skb->dev) and passes the result
to in_dev_for_each_ifa_rcu() without checking for NULL. When the
receiving device has no IPv4 configuration (ip_ptr is NULL),
__in_dev_get_rcu() returns NULL and in_dev_for_each_ifa_rcu()
dereferences it unconditionally, causing a kernel crash.

This can happen when a packet arrives on a device that has had its
IPv4 configuration removed (e.g., MTU set below IPV4_MIN_MTU causing
inetdev_destroy) or on a device that was never assigned an IPv4
address, while an xt_osf or nft_osf rule with TTL_LESS mode is
active and the packet TTL exceeds the fingerprint TTL.

Add a NULL check for in_dev before using it. When in_dev is NULL,
return 0 (no match) since source-address locality cannot be
determined without IPv4 addresses on the device.

KASAN: null-ptr-deref in range [0x0000000000000010-0x0000000000000017]
RIP: 0010:nf_osf_match_one+0x204/0xa70
Call Trace:
 <IRQ>
 nf_osf_match+0x2f8/0x780
 xt_osf_match_packet+0x11c/0x1f0
 ipt_do_table+0x7fe/0x12b0
 nf_hook_slow+0xac/0x1e0
 ip_rcv+0x123/0x370
 __netif_receive_skb_one_core+0x166/0x1b0
 process_backlog+0x197/0x590
 __napi_poll+0xa1/0x540
 net_rx_action+0x401/0xd80
 handle_softirqs+0x19f/0x610
 </IRQ>

Fixes: a218dc82f0b5 ("netfilter: nft_osf: Add ttl option support")
Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Kito Xu (veritas501) <hxzene@gmail.com>
---
 net/netfilter/nfnetlink_osf.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
index d64ce21c7b55..dd2cbbd449e7 100644
--- a/net/netfilter/nfnetlink_osf.c
+++ b/net/netfilter/nfnetlink_osf.c
@@ -36,6 +36,9 @@ static inline int nf_osf_ttl(const struct sk_buff *skb,
 	const struct in_ifaddr *ifa;
 	int ret = 0;

+	if (!in_dev)
+		return 0;
+
 	if (ttl_check == NF_OSF_TTL_TRUE)
 		return ip->ttl == f_ttl;
 	if (ttl_check == NF_OSF_TTL_NOCHECK)
-- 
2.43.0

^ permalink raw reply related

* [PATCH bpf] bpf,tcp: avoid infinite recursion in BPF_SOCK_OPS_HDR_OPT_LEN_CB
From: Jiayuan Chen @ 2026-04-14 10:57 UTC (permalink / raw)
  To: bpf
  Cc: Jiayuan Chen, Quan Sun, Yinhao Hu, Kaiyan Mei, Dongliang Mu,
	Eric Dumazet, Neal Cardwell, Kuniyuki Iwashima, David S. Miller,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Jonathan Corbet,
	Shuah Khan, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
	Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
	John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
	David Ahern, netdev, linux-doc, linux-kernel

A BPF_PROG_TYPE_SOCK_OPS program can set BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG
to inject custom TCP header options. When the kernel builds a TCP packet,
it calls tcp_established_options() to calculate the header size, which
invokes bpf_skops_hdr_opt_len() to trigger the BPF_SOCK_OPS_HDR_OPT_LEN_CB
callback.

If the BPF program calls bpf_setsockopt(TCP_NODELAY) inside this callback,
__tcp_sock_set_nodelay() will call tcp_push_pending_frames(), which calls
tcp_current_mss(), which calls tcp_established_options() again,
re-triggering the same BPF callback. This creates an infinite recursion
that exhausts the kernel stack and causes a panic.

BPF_SOCK_OPS_HDR_OPT_LEN_CB
  -> bpf_setsockopt(TCP_NODELAY)
	-> tcp_push_pending_frames()
	  -> tcp_current_mss()
		-> tcp_established_options()
		  -> bpf_skops_hdr_opt_len()
                           /* infinite recursion */
			-> BPF_SOCK_OPS_HDR_OPT_LEN_CB

A similar reentrancy issue exists for TCP congestion control, which is
guarded by tp->bpf_chg_cc_inprogress. Adopt the same approach: introduce
tp->bpf_hdr_opt_len_cb_inprogress, set it before invoking the callback in
bpf_skops_hdr_opt_len(), and check it in sol_tcp_sockopt() to reject
bpf_setsockopt(TCP_NODELAY) and bpf_setsockopt(TCP_CORK) calls that would
trigger tcp_push_pending_frames() and cause the recursion.

Reported-by: Quan Sun <2022090917019@std.uestc.edu.cn>
Reported-by: Yinhao Hu <dddddd@hust.edu.cn>
Reported-by: Kaiyan Mei <M202472210@hust.edu.cn>
Reported-by: Dongliang Mu <dzm91@hust.edu.cn>
Closes: https://lore.kernel.org/bpf/d1d523c9-6901-4454-a183-94462b8f3e4e@std.uestc.edu.cn/
Fixes: 0813a841566f ("bpf: tcp: Allow bpf prog to write and parse TCP header option")
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
---
 Documentation/networking/net_cachelines/tcp_sock.rst |  1 +
 include/linux/tcp.h                                  | 11 ++++++++++-
 net/core/filter.c                                    |  4 ++++
 net/ipv4/tcp_minisocks.c                             |  1 +
 net/ipv4/tcp_output.c                                |  3 +++
 5 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst
index 563daea10d6c..07d3226d90cc 100644
--- a/Documentation/networking/net_cachelines/tcp_sock.rst
+++ b/Documentation/networking/net_cachelines/tcp_sock.rst
@@ -152,6 +152,7 @@ unsigned_int                  keepalive_intvl
 int                           linger2
 u8                            bpf_sock_ops_cb_flags
 u8:1                          bpf_chg_cc_inprogress
+u8:1                          bpf_hdr_opt_len_cb_inprogress
 u16                           timeout_rehash
 u32                           rcv_ooopack
 u32                           rcv_rtt_last_tsecr
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f72eef31fa23..2bfb73cf922e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -475,12 +475,21 @@ struct tcp_sock {
 	u8	bpf_sock_ops_cb_flags;  /* Control calling BPF programs
 					 * values defined in uapi/linux/tcp.h
 					 */
-	u8	bpf_chg_cc_inprogress:1; /* In the middle of
+	u8	bpf_chg_cc_inprogress:1, /* In the middle of
 					  * bpf_setsockopt(TCP_CONGESTION),
 					  * it is to avoid the bpf_tcp_cc->init()
 					  * to recur itself by calling
 					  * bpf_setsockopt(TCP_CONGESTION, "itself").
 					  */
+		bpf_hdr_opt_len_cb_inprogress:1; /* It is set before invoking the
+						  * callback so that a nested
+						  * bpf_setsockopt(TCP_NODELAY) or
+						  * bpf_setsockopt(TCP_CORK) cannot
+						  * trigger tcp_push_pending_frames(),
+						  * which would call tcp_current_mss()
+						  * -> bpf_skops_hdr_opt_len(), causing
+						  * infinite recursion.
+						  */
 #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
 #else
 #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
diff --git a/net/core/filter.c b/net/core/filter.c
index 78b548158fb0..518699429a7a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5483,6 +5483,10 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
 	if (sk->sk_protocol != IPPROTO_TCP)
 		return -EINVAL;
 
+	if ((optname == TCP_NODELAY || optname == TCP_CORK) &&
+	    tcp_sk(sk)->bpf_hdr_opt_len_cb_inprogress)
+		return -EBUSY;
+
 	switch (optname) {
 	case TCP_NODELAY:
 	case TCP_MAXSEG:
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index dafb63b923d0..fb06c464ac16 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -663,6 +663,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
 
 	newtp->bpf_chg_cc_inprogress = 0;
+	newtp->bpf_hdr_opt_len_cb_inprogress = 0;
 	tcp_bpf_clone(sk, newsk);
 
 	__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 326b58ff1118..c9654e690e1a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -475,6 +475,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
 				  unsigned int *remaining)
 {
 	struct bpf_sock_ops_kern sock_ops;
+	struct tcp_sock *tp = tcp_sk(sk);
 	int err;
 
 	if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
@@ -519,7 +520,9 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
 	if (skb)
 		bpf_skops_init_skb(&sock_ops, skb, 0);
 
+	tp->bpf_hdr_opt_len_cb_inprogress = 1;
 	err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+	tp->bpf_hdr_opt_len_cb_inprogress = 0;
 
 	if (err || sock_ops.remaining_opt_len == *remaining)
 		return;
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH iwl-net v1] i40e: fix napi_enable/disable skipping ringless q_vectors
From: Maciej Fijalkowski @ 2026-04-14 10:57 UTC (permalink / raw)
  To: Aleksandr Loktionov
  Cc: intel-wired-lan, anthony.l.nguyen, netdev, Jakub Kicinski
In-Reply-To: <20260324130922.562714-1-aleksandr.loktionov@intel.com>

On Tue, Mar 24, 2026 at 02:09:22PM +0100, Aleksandr Loktionov wrote:
> After ethtool -L reduces the queue count, i40e_napi_disable_all() sets
> NAPI_STATE_SCHED on all q_vectors, then i40e_vsi_map_rings_to_vectors()
> clears ring pointers on the excess ones.  i40e_napi_enable_all() skips
> those with:
> 
> 	if (q_vector->rx.ring || q_vector->tx.ring)
> 		napi_enable(&q_vector->napi);
> 
> leaving them on dev->napi_list with NAPI_STATE_SCHED permanently set.
> 
> Writing to /sys/class/net/<iface>/threaded calls napi_stop_kthread()
> on every entry in dev->napi_list.  The function loops on msleep(20)
> waiting for NAPI_STATE_SCHED to clear -- which never happens for the
> stale q_vectors.  The task hangs in D state forever; a concurrent write
> deadlocks on dev->lock held by the first.
> 
> Commit 13a8cd191a2b added the guard to prevent a divide-by-zero in
> i40e_napi_poll() when epoll busy-poll iterated all device NAPIs (4.x
> era).  Since 7adc3d57fe2b ("net: Introduce preferred busy-polling",
> v5.11) napi_busy_loop() polls by napi_id keyed to the socket, so
> ringless q_vectors are never selected.  i40e_msix_clean_rings() also
> independently avoids scheduling NAPI for them.  The guard is safe to
> remove.
> 
> Add an early return in i40e_napi_poll() for num_ringpairs == 0 so the
> function is self-defending against a NULL tx.ring dereference at the
> WB_ON_ITR check, should the NAPI ever fire through an unexpected path.
> 
> Reported-by: Jakub Kicinski <kuba@kernel.org>
> Closes: https://lore.kernel.org/intel-wired-lan/20260316133100.6054a11f@kernel.org/
> Fixes: 13a8cd191a2b ("i40e: Do not enable NAPI on q_vectors that have no rings")
> Cc: stable@vger.kernel.org
> Signed-off-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>

The problem comes from the fact that napi instances are kept after
rebuilding the VSI with a lower queue count. Instead of duct-taping the
driver and adding conditions in the hot path (!) we should fix the issue
at its core.

I'm gonna send a fix, please drop this one.

pw-bot: cr

> ---
> Test configuration:
>   Kernel   : Linux 6.19.0-rc8+
>   NIC      : Intel Ethernet Controller XXV710 for 25GbE SFP28 [8086:158b]
>   Driver   : i40e (in-tree)
>   Firmware : 9.40 0x8000ed12 1.3429.0
>   CPU      : 2 x Intel Xeon Gold 6238M (88 logical CPUs, x86_64)
>   RAM      : 64 GiB
> 
> Reproduction steps (FAIL before fix):
>   # 1. Reduce queues so excess q_vectors lose their ring pointers
>   ethtool -L <iface> combined 1
> 
>   # 2. Enable threaded NAPI (completes fast in 6.19, no hang on enable path)
>   echo 1 > /sys/class/net/<iface>/threaded
> 
>   # 3. Two concurrent writes to disable -- fires the msleep deadlock
>   echo 0 > /sys/class/net/<iface>/threaded &
>   echo 0 > /sys/class/net/<iface>/threaded &
> 
>   Both background tasks enter uninterruptible sleep (D state) immediately
>   and never return.
> 
>   Observed kernel stack (W1, holds dev->lock):
>     msleep+0x2d/0x50
>     napi_set_threaded+0x10b/0x110
>     netif_set_threaded+0xe1/0x140
>     threaded_store+0xd2/0x100
>     kernfs_fop_write_iter+0x138/0x1d0
> 
>   Kernel hung_task message (~120 s after trigger):
>     INFO: task bash blocked for more than 122 seconds.
>     INFO: task bash is blocked on a mutex likely owned by task bash.
> 
> Validation (PASS with fix):
>   Both background tasks exit within 1 second.
>   D-state process count: 0.
>   Busy-poll (net.core.busy_poll=50) + 50000-packet UDP flood with
>   1 active queue: no NULL dereference, no crash.
> 
>  drivers/net/ethernet/intel/i40e/i40e_main.c | 28 ++++++++++++---------
>  drivers/net/ethernet/intel/i40e/i40e_txrx.c | 10 ++++++++
>  2 files changed, 26 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index 926d001..5042f8c 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -5182,6 +5182,14 @@ static void i40e_clear_interrupt_scheme(struct i40e_pf *pf)
>  /**
>   * i40e_napi_enable_all - Enable NAPI for all q_vectors in the VSI
>   * @vsi: the VSI being configured
> + *
> + * Enable NAPI on every q_vector that is registered with the netdev,
> + * regardless of whether it currently has rings assigned.  After a queue-
> + * count reduction (e.g. ethtool -L combined 1) the excess q_vectors lose
> + * their ring pointers inside i40e_vsi_map_rings_to_vectors but remain on
> + * dev->napi_list.  Leaving them in the napi_disable()-ed state
> + * (NAPI_STATE_SCHED set) causes napi_set_threaded() to spin forever on
> + * msleep(20) waiting for that bit to clear.
>   **/
>  static void i40e_napi_enable_all(struct i40e_vsi *vsi)
>  {
> @@ -5190,17 +5198,17 @@ static void i40e_napi_enable_all(struct i40e_vsi *vsi)
>  	if (!vsi->netdev)
>  		return;
>  
> -	for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++) {
> -		struct i40e_q_vector *q_vector = vsi->q_vectors[q_idx];
> -
> -		if (q_vector->rx.ring || q_vector->tx.ring)
> -			napi_enable(&q_vector->napi);
> -	}
> +	for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++)
> +		napi_enable(&vsi->q_vectors[q_idx]->napi);
>  }
>  
>  /**
>   * i40e_napi_disable_all - Disable NAPI for all q_vectors in the VSI
>   * @vsi: the VSI being configured
> + *
> + * Mirror of i40e_napi_enable_all: operate on every registered q_vector so
> + * enable/disable calls are always balanced, even when some q_vectors carry
> + * no rings (as happens after a queue-count reduction).
>   **/
>  static void i40e_napi_disable_all(struct i40e_vsi *vsi)
>  {
> @@ -5209,12 +5217,8 @@ static void i40e_napi_disable_all(struct i40e_vsi *vsi)
>  	if (!vsi->netdev)
>  		return;
>  
> -	for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++) {
> -		struct i40e_q_vector *q_vector = vsi->q_vectors[q_idx];
> -
> -		if (q_vector->rx.ring || q_vector->tx.ring)
> -			napi_disable(&q_vector->napi);
> -	}
> +	for (q_idx = 0; q_idx < vsi->num_q_vectors; q_idx++)
> +		napi_disable(&vsi->q_vectors[q_idx]->napi);
>  }
>  
>  /**
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> index 894f2d0..3123459 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
> @@ -2760,6 +2760,16 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
>  		return 0;
>  	}
>  
> +	/* A q_vector can have its ring pointers cleared after a queue-count
> +	 * reduction (ethtool -L combined N) while napi_enable() was already
> +	 * called on it.  Complete immediately so the poll loop exits cleanly
> +	 * and we never dereference the NULL ring pointer below.
> +	 */
> +	if (unlikely(!q_vector->num_ringpairs)) {
> +		napi_complete_done(napi, 0);
> +		return 0;
> +	}
> +
>  	/* Since the actual Tx work is minimal, we can give the Tx a larger
>  	 * budget and be more aggressive about cleaning up the Tx descriptors.
>  	 */
> -- 
> 2.52.0
> 
> 

^ permalink raw reply

* [PATCH net v3 0/5] Fix i40e/ice/iavf VF bonding after netdev lock changes
From: Jose Ignacio Tornos Martinez @ 2026-04-14 11:00 UTC (permalink / raw)
  To: netdev
  Cc: intel-wired-lan, jesse.brandeburg, anthony.l.nguyen, davem,
	edumazet, kuba, pabeni, Jose Ignacio Tornos Martinez

This series fixes VF bonding failures introduced by commit ad7c7b2172c3
("net: hold netdev instance lock during sysfs operations").

When adding VFs to a bond immediately after setting trust mode, MAC
address changes fail with -EAGAIN, preventing bonding setup. This
affects both i40e (700-series) and ice (800-series) Intel NICs.

The core issue is lock contention: iavf_set_mac() is now called with the
netdev lock held and waits for MAC change completion while holding it.
However, both the watchdog task that sends the request and the adminq_task
that processes PF responses also need this lock, creating a deadlock where
neither can run, causing timeouts.

Additionally, setting VF trust triggers an unnecessary ~10 second VF reset
in the i40e driver that delays bonding setup, even though filter
synchronization happens naturally during normal VF operation. In the ice
driver the delay is smaller, but the reset is equally unnecessary.

This series:
1. Adds safety guard to prevent MAC changes during reset or early
   initialization (before VF is ready)
2. Eliminates unnecessary VF reset when setting trust in i40e
3. Fixes lock contention by polling admin queue synchronously
4. Eliminates unnecessary VF reset when setting trust in ice
5. Refactors virtchnl polling to unify init-time and runtime code paths

The key fix (patch 3/5) implements a synchronous MAC change operation
similar to the approach used for ndo_change_mtu deadlock fix:
https://lore.kernel.org/intel-wired-lan/20260211191855.1532226-1-poros@redhat.com/ 
Instead of scheduling work and waiting, it:

- Sends the virtchnl message directly (not via watchdog)
- Polls the admin queue hardware directly for responses
- Processes all messages inline (including non-MAC messages)
- Returns when complete or times out

This allows the operation to complete synchronously while holding
netdev_lock, without relying on watchdog or adminq_task. A new generic
iavf_poll_virtchnl_response() function was introduced for this.

Patch 5 refactors the polling implementation based on Przemek Kitszel's
feedback: the init-time polling (iavf_poll_virtchnl_msg()) and the runtime
polling added in patch 3 (iavf_poll_virtchnl_response()) are unified into
the original polling function (iavf_poll_virtchnl_msg()), which now
supports both behaviors.
I kept the refactoring as a separate patch for the sake of clarity, and I
would prefer to include it in the net series because it is tightly coupled
with patch 3.

The function can sleep for up to 2.5 seconds polling hardware, but this
is acceptable since netdev_lock is per-device and only serializes
operations on the same interface.

Testing shows VF bonding now works reliably in ~5 seconds vs 15+ seconds
before (i40e), without timeouts or errors (i40e and ice).

Tested on Intel 700-series (i40e) and 800-series (ice) dual-port NICs
with iavf driver.

Thanks to Jan Tluka <jtluka@redhat.com> and Yuying Ma <yuma@redhat.com> for
reporting the issues.

Jose Ignacio Tornos Martinez (5):
  iavf: return EBUSY if reset in progress or not ready during MAC change
  i40e: skip unnecessary VF reset when setting trust
  iavf: send MAC change request synchronously
  ice: skip unnecessary VF reset when setting trust
  iavf: refactor virtchnl polling to unify init and runtime paths

---
v3:
  - Complete patch 3 with the comments from Przemek Kitszel
  - Added patch 5: Refactor to unify polling into iavf_poll_virtchnl_msg()
    function (Przemek Kitszel suggestion). It processes messages through
    iavf_virtchnl_completion() when appropriate (runtime operations with
    timeout; init-time operations continue to return raw messages without
    completion processing).
  - No changes to patch 1,2 and 4 from v2
v2: https://lore.kernel.org/netdev/20260407165206.1121317-1-jtornosm@redhat.com/

 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |   7 ++++++-
 drivers/net/ethernet/intel/iavf/iavf.h             |   6 +++++-
 drivers/net/ethernet/intel/iavf/iavf_main.c        |  69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
 drivers/net/ethernet/intel/iavf/iavf_virtchnl.c    | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------
 drivers/net/ethernet/intel/ice/ice_sriov.c         |  13 +++++++++----
 5 files changed, 193 insertions(+), 64 deletions(-)
--
2.43.0

^ permalink raw reply

* [PATCH net v3 1/5] iavf: return EBUSY if reset in progress or not ready during MAC change
From: Jose Ignacio Tornos Martinez @ 2026-04-14 11:00 UTC (permalink / raw)
  To: netdev
  Cc: intel-wired-lan, jesse.brandeburg, anthony.l.nguyen, davem,
	edumazet, kuba, pabeni, Jose Ignacio Tornos Martinez
In-Reply-To: <20260414110006.124286-1-jtornosm@redhat.com>

When a MAC address change is requested while the VF is resetting or still
initializing, return -EBUSY immediately instead of attempting the
operation.

Additionally, during early initialization states (before __IAVF_DOWN),
the PF may be slow to respond to MAC change requests, causing long
delays. Only allow MAC changes once the VF reaches __IAVF_DOWN state or
later, when the watchdog is running and the VF is ready for operations.

After commit ad7c7b2172c3 ("net: hold netdev instance lock
during sysfs operations"), MAC changes are called with the netdev lock
held, so we should not wait with the lock held during reset or
initialization. This allows the caller to retry or handle the busy state
appropriately without blocking other operations.

Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
---
 drivers/net/ethernet/intel/iavf/iavf_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index dad001abc908..67aa14350b1b 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -1060,6 +1060,9 @@ static int iavf_set_mac(struct net_device *netdev, void *p)
 	struct sockaddr *addr = p;
 	int ret;

+	if (iavf_is_reset_in_progress(adapter) || adapter->state < __IAVF_DOWN)
+		return -EBUSY;
+
 	if (!is_valid_ether_addr(addr->sa_data))
 		return -EADDRNOTAVAIL;

-- 
2.53.0

^ permalink raw reply related

* [PATCH net v3 2/5] i40e: skip unnecessary VF reset when setting trust
From: Jose Ignacio Tornos Martinez @ 2026-04-14 11:00 UTC (permalink / raw)
  To: netdev
  Cc: intel-wired-lan, jesse.brandeburg, anthony.l.nguyen, davem,
	edumazet, kuba, pabeni, Jose Ignacio Tornos Martinez
In-Reply-To: <20260414110006.124286-1-jtornosm@redhat.com>

When VF trust is changed, i40e_ndo_set_vf_trust() always calls
i40e_vc_reset_vf() to sync MAC/VLAN filters. However, this reset is
only necessary when trust is removed from a VF that has ADQ (Application
Device Queues) cloud filters, which need to be deleted.

In all other cases, the reset causes a ~10 second delay during which:
- VF must reinitialize completely
- Any in-progress operations (like bonding enslave) fail with timeouts
- VF is unavailable

The MAC/VLAN filter sync will happen naturally through the normal VF
operations and doesn't require a forced reset.

Fix by only resetting when actually needed: when removing trust from a
VF that has ADQ cloud filters. For all other trust changes, just update
the trust flag and let normal operation continue.

Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
---
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
index a26c3d47ec15..fea267af7afe 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c
@@ -4987,16 +4987,21 @@ int i40e_ndo_set_vf_trust(struct net_device *netdev, int vf_id, bool setting)
 	set_bit(__I40E_MACVLAN_SYNC_PENDING, pf->state);
 	pf->vsi[vf->lan_vsi_idx]->flags |= I40E_VSI_FLAG_FILTER_CHANGED;
 
-	i40e_vc_reset_vf(vf, true);
 	dev_info(&pf->pdev->dev, "VF %u is now %strusted\n",
 		 vf_id, setting ? "" : "un");
 
+	/* Only reset VF if we're removing trust and it has ADQ cloud filters.
+	 * Cloud filters can only be added when trusted, so they must be
+	 * removed when trust is revoked. Other trust changes don't require
+	 * reset - MAC/VLAN filter sync happens through normal operation.
+	 */
 	if (vf->adq_enabled) {
 		if (!vf->trusted) {
 			dev_info(&pf->pdev->dev,
 				 "VF %u no longer Trusted, deleting all cloud filters\n",
 				 vf_id);
 			i40e_del_all_cloud_filters(vf);
+			i40e_vc_reset_vf(vf, true);
 		}
 	}
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH net v3 3/5] iavf: send MAC change request synchronously
From: Jose Ignacio Tornos Martinez @ 2026-04-14 11:00 UTC (permalink / raw)
  To: netdev
  Cc: intel-wired-lan, jesse.brandeburg, anthony.l.nguyen, davem,
	edumazet, kuba, pabeni, Jose Ignacio Tornos Martinez, stable
In-Reply-To: <20260414110006.124286-1-jtornosm@redhat.com>

After commit ad7c7b2172c3 ("net: hold netdev instance lock during sysfs
operations"), iavf_set_mac() is called with the netdev instance lock
already held.

The function queues a MAC address change request via
iavf_replace_primary_mac() and then waits for completion. However, in
the current flow, the actual virtchnl message is sent by the watchdog
task, which also needs to acquire the netdev lock to run. Additionally,
the adminq_task which processes virtchnl responses also needs the netdev
lock.

This creates a deadlock scenario:
1. iavf_set_mac() holds netdev lock and waits for MAC change
2. Watchdog needs netdev lock to send the request -> blocked
3. Even if request is sent, adminq_task needs netdev lock to process
   PF response -> blocked
4. MAC change times out after 2.5 seconds
5. iavf_set_mac() returns -EAGAIN

This particularly affects VFs during bonding setup when multiple VFs are
enslaved in quick succession.

Fix by implementing a synchronous MAC change operation similar to the
approach used in commit fdadbf6e84c4 ("iavf: fix incorrect reset handling
in callbacks").

The solution:
1. Send the virtchnl ADD_ETH_ADDR message directly (not via watchdog)
2. Poll the admin queue hardware directly for responses
3. Process all received messages (including non-MAC messages)
4. Return when MAC change completes or times out

A new generic function iavf_poll_virtchnl_response() is introduced that
can be reused for any future synchronous virtchnl operations. It takes a
callback to check completion, allowing flexible condition checking.

This allows the operation to complete synchronously while holding
netdev_lock, without relying on watchdog or adminq_task. The function
can sleep for up to 2.5 seconds polling hardware, but this is acceptable
since netdev_lock is per-device and only serializes operations on the
same interface.

To support this, change iavf_add_ether_addrs() to return an error code
instead of void, allowing callers to detect failures.

Fixes: ad7c7b2172c3 ("net: hold netdev instance lock during sysfs operations")
cc: stable@vger.kernel.org
Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
---
v3: Address Przemek Kitszel's comments:
    - Moved iavf_poll_virtchnl_response() to iavf_virtchnl.c for reusability
    - Changed kdoc to use "Return:" instead of "Returns"
    - Changed to do-while loop structure
    - Added pending parameter to skip sleep when more messages queued
    - Reduced sleep time to 50-75 usec (from 1000-2000, per commit 9e3f23f44f32)
    - Added v_opcode parameter for standard completion checking
    - Callback parameter takes priority over opcode check
    - Made cond_data parameter const
    - Final condition check after timeout before returning -EAGAIN
v2: https://lore.kernel.org/netdev/20260407165206.1121317-4-jtornosm@redhat.com/

 drivers/net/ethernet/intel/iavf/iavf.h        |   7 +-
 drivers/net/ethernet/intel/iavf/iavf_main.c   |  57 ++++++---
 .../net/ethernet/intel/iavf/iavf_virtchnl.c   | 111 +++++++++++++++++-
 3 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h
index e9fb0a0919e3..b012a91b0252 100644
--- a/drivers/net/ethernet/intel/iavf/iavf.h
+++ b/drivers/net/ethernet/intel/iavf/iavf.h
@@ -589,7 +589,7 @@ void iavf_configure_queues(struct iavf_adapter *adapter);
 void iavf_enable_queues(struct iavf_adapter *adapter);
 void iavf_disable_queues(struct iavf_adapter *adapter);
 void iavf_map_queues(struct iavf_adapter *adapter);
-void iavf_add_ether_addrs(struct iavf_adapter *adapter);
+int iavf_add_ether_addrs(struct iavf_adapter *adapter);
 void iavf_del_ether_addrs(struct iavf_adapter *adapter);
 void iavf_add_vlans(struct iavf_adapter *adapter);
 void iavf_del_vlans(struct iavf_adapter *adapter);
@@ -607,6 +607,11 @@ void iavf_disable_vlan_stripping(struct iavf_adapter *adapter);
 void iavf_virtchnl_completion(struct iavf_adapter *adapter,
 			      enum virtchnl_ops v_opcode,
 			      enum iavf_status v_retval, u8 *msg, u16 msglen);
+int iavf_poll_virtchnl_response(struct iavf_adapter *adapter,
+				bool (*condition)(struct iavf_adapter *, const void *),
+				const void *cond_data,
+				enum virtchnl_ops v_opcode,
+				unsigned int timeout_ms);
 int iavf_config_rss(struct iavf_adapter *adapter);
 void iavf_cfg_queues_bw(struct iavf_adapter *adapter);
 void iavf_cfg_queues_quanta_size(struct iavf_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 67aa14350b1b..80277d495a8d 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -1047,6 +1047,46 @@ static bool iavf_is_mac_set_handled(struct net_device *netdev,
 	return ret;
 }
 
+/**
+ * iavf_mac_change_done - Check if MAC change completed
+ * @adapter: board private structure
+ * @data: MAC address being checked (as const void *)
+ *
+ * Callback for iavf_poll_virtchnl_response() to check if MAC change completed.
+ *
+ * Returns true if MAC change completed, false otherwise
+ */
+static bool iavf_mac_change_done(struct iavf_adapter *adapter, const void *data)
+{
+	const u8 *addr = data;
+
+	return iavf_is_mac_set_handled(adapter->netdev, addr);
+}
+
+/**
+ * iavf_set_mac_sync - Synchronously change MAC address
+ * @adapter: board private structure
+ * @addr: MAC address to set
+ *
+ * Sends MAC change request to PF and polls admin queue for response.
+ * Caller must hold netdev_lock. This can sleep for up to 2.5 seconds.
+ *
+ * Returns 0 on success, negative on failure
+ */
+static int iavf_set_mac_sync(struct iavf_adapter *adapter, const u8 *addr)
+{
+	int ret;
+
+	netdev_assert_locked(adapter->netdev);
+
+	ret = iavf_add_ether_addrs(adapter);
+	if (ret)
+		return ret;
+
+	return iavf_poll_virtchnl_response(adapter, iavf_mac_change_done, addr,
+					   VIRTCHNL_OP_UNKNOWN, 2500);
+}
+
 /**
  * iavf_set_mac - NDO callback to set port MAC address
  * @netdev: network interface device structure
@@ -1067,26 +1107,13 @@ static int iavf_set_mac(struct net_device *netdev, void *p)
 		return -EADDRNOTAVAIL;
 
 	ret = iavf_replace_primary_mac(adapter, addr->sa_data);
-
 	if (ret)
 		return ret;
 
-	ret = wait_event_interruptible_timeout(adapter->vc_waitqueue,
-					       iavf_is_mac_set_handled(netdev, addr->sa_data),
-					       msecs_to_jiffies(2500));
-
-	/* If ret < 0 then it means wait was interrupted.
-	 * If ret == 0 then it means we got a timeout.
-	 * else it means we got response for set MAC from PF,
-	 * check if netdev MAC was updated to requested MAC,
-	 * if yes then set MAC succeeded otherwise it failed return -EACCES
-	 */
-	if (ret < 0)
+	ret = iavf_set_mac_sync(adapter, addr->sa_data);
+	if (ret)
 		return ret;
 
-	if (!ret)
-		return -EAGAIN;
-
 	if (!ether_addr_equal(netdev->dev_addr, addr->sa_data))
 		return -EACCES;
 
diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
index a52c100dcbc5..df124f840ddb 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
@@ -2,6 +2,7 @@
 /* Copyright(c) 2013 - 2018 Intel Corporation. */
 
 #include <linux/net/intel/libie/rx.h>
+#include <net/netdev_lock.h>
 
 #include "iavf.h"
 #include "iavf_ptp.h"
@@ -555,8 +556,10 @@ iavf_set_mac_addr_type(struct virtchnl_ether_addr *virtchnl_ether_addr,
  * @adapter: adapter structure
  *
  * Request that the PF add one or more addresses to our filters.
+ *
+ * Return: 0 on success, negative on failure
  **/
-void iavf_add_ether_addrs(struct iavf_adapter *adapter)
+int iavf_add_ether_addrs(struct iavf_adapter *adapter)
 {
 	struct virtchnl_ether_addr_list *veal;
 	struct iavf_mac_filter *f;
@@ -568,7 +571,7 @@ void iavf_add_ether_addrs(struct iavf_adapter *adapter)
 		/* bail because we already have a command pending */
 		dev_err(&adapter->pdev->dev, "Cannot add filters, command %d pending\n",
 			adapter->current_op);
-		return;
+		return -EBUSY;
 	}
 
 	spin_lock_bh(&adapter->mac_vlan_list_lock);
@@ -580,7 +583,7 @@ void iavf_add_ether_addrs(struct iavf_adapter *adapter)
 	if (!count) {
 		adapter->aq_required &= ~IAVF_FLAG_AQ_ADD_MAC_FILTER;
 		spin_unlock_bh(&adapter->mac_vlan_list_lock);
-		return;
+		return 0;
 	}
 	adapter->current_op = VIRTCHNL_OP_ADD_ETH_ADDR;
 
@@ -595,7 +598,7 @@ void iavf_add_ether_addrs(struct iavf_adapter *adapter)
 	veal = kzalloc(len, GFP_ATOMIC);
 	if (!veal) {
 		spin_unlock_bh(&adapter->mac_vlan_list_lock);
-		return;
+		return -ENOMEM;
 	}
 
 	veal->vsi_id = adapter->vsi_res->vsi_id;
@@ -617,6 +620,7 @@ void iavf_add_ether_addrs(struct iavf_adapter *adapter)
 
 	iavf_send_pf_msg(adapter, VIRTCHNL_OP_ADD_ETH_ADDR, (u8 *)veal, len);
 	kfree(veal);
+	return 0;
 }
 
 /**
@@ -2956,3 +2960,102 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter,
 	} /* switch v_opcode */
 	adapter->current_op = VIRTCHNL_OP_UNKNOWN;
 }
+
+/**
+ * iavf_virtchnl_done - Check if virtchnl operation completed
+ * @adapter: board private structure
+ * @condition: optional callback for custom completion check
+ *   (takes priority)
+ * @cond_data: context data for callback
+ * @v_opcode: virtchnl opcode value we're waiting for if no condition
+ *   configured (typically VIRTCHNL_OP_UNKNOWN), if condition not used
+ *
+ * Checks completion status. Callback takes priority if provided. Otherwise
+ * waits for current_op to reach v_opcode (typically VIRTCHNL_OP_UNKNOWN
+ * after completion).
+ *
+ * Return: true if operation completed
+ */
+static inline bool iavf_virtchnl_done(struct iavf_adapter *adapter,
+				      bool (*condition)(struct iavf_adapter *, const void *),
+				      const void *cond_data,
+				      enum virtchnl_ops v_opcode)
+{
+	if (condition)
+		return condition(adapter, cond_data);
+
+	return adapter->current_op == v_opcode;
+}
+
+/**
+ * iavf_poll_virtchnl_response - Poll admin queue for virtchnl response
+ * @adapter: board private structure
+ * @condition: optional callback to check if desired response received
+ *   (takes priority)
+ * @cond_data: context data passed to condition callback
+ * @v_opcode: virtchnl opcode value to wait for if no condition configured
+ *   (typically VIRTCHNL_OP_UNKNOWN), if condition, not used
+ * @timeout_ms: maximum time to wait in milliseconds
+ *
+ * Polls admin queue and processes all messages until condition returns true
+ * or timeout expires. If condition is NULL, waits for current_op to become
+ * v_opcode (typically VIRTCHNL_OP_UNKNOWN after operation completes).
+ * Caller must hold netdev_lock. This can sleep for up to timeout_ms while
+ * polling hardware.
+ *
+ * Return: 0 on success (condition met), -EAGAIN on timeout or error
+ */
+int iavf_poll_virtchnl_response(struct iavf_adapter *adapter,
+				bool (*condition)(struct iavf_adapter *, const void *),
+				const void *cond_data,
+				enum virtchnl_ops v_opcode,
+				unsigned int timeout_ms)
+{
+	struct iavf_hw *hw = &adapter->hw;
+	struct iavf_arq_event_info event;
+	enum virtchnl_ops v_op;
+	enum iavf_status v_ret;
+	unsigned long timeout;
+	u16 pending;
+	int ret;
+
+	netdev_assert_locked(adapter->netdev);
+
+	event.buf_len = IAVF_MAX_AQ_BUF_SIZE;
+	event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL);
+	if (!event.msg_buf)
+		return -ENOMEM;
+
+	timeout = jiffies + msecs_to_jiffies(timeout_ms);
+	do {
+		if (iavf_virtchnl_done(adapter, condition, cond_data, v_opcode)) {
+			ret = 0;
+			goto out;
+		}
+
+		ret = iavf_clean_arq_element(hw, &event, &pending);
+		if (!ret) {
+			v_op = (enum virtchnl_ops)le32_to_cpu(event.desc.cookie_high);
+			v_ret = (enum iavf_status)le32_to_cpu(event.desc.cookie_low);
+
+			iavf_virtchnl_completion(adapter, v_op, v_ret,
+						 event.msg_buf, event.msg_len);
+
+			memset(event.msg_buf, 0, IAVF_MAX_AQ_BUF_SIZE);
+
+			if (pending)
+				continue;
+		}
+
+		usleep_range(50, 75);
+	} while (time_before(jiffies, timeout));
+
+	if (iavf_virtchnl_done(adapter, condition, cond_data, v_opcode))
+		ret = 0;
+	else
+		ret = -EAGAIN;
+
+out:
+	kfree(event.msg_buf);
+	return ret;
+}
-- 
2.53.0


^ permalink raw reply related

* [PATCH net v3 4/5] ice: skip unnecessary VF reset when setting trust
From: Jose Ignacio Tornos Martinez @ 2026-04-14 11:00 UTC (permalink / raw)
  To: netdev
  Cc: intel-wired-lan, jesse.brandeburg, anthony.l.nguyen, davem,
	edumazet, kuba, pabeni, Jose Ignacio Tornos Martinez
In-Reply-To: <20260414110006.124286-1-jtornosm@redhat.com>

Similar to the i40e fix, ice_set_vf_trust() unconditionally calls
ice_reset_vf() when the trust setting changes.

The ice driver already has logic to clean up MAC LLDP filters when
removing trust, which is the only operation that requires filter
synchronization. After this cleanup, the VF reset is only necessary if
there were actually filters to remove.

For all other trust state changes (setting trust, or removing trust
when no filters exist), the reset is unnecessary as filter
synchronization happens naturally through normal VF operations.

Fix by only triggering the VF reset when removing trust AND filters
were actually cleaned up (num_mac_lldp was non-zero).

This saves time and avoids unnecessary service disruption when changing
VF trust settings.

Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
---
 drivers/net/ethernet/intel/ice/ice_sriov.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/ice/ice_sriov.c b/drivers/net/ethernet/intel/ice/ice_sriov.c
index 7e00e091756d..23f692b1e86c 100644
--- a/drivers/net/ethernet/intel/ice/ice_sriov.c
+++ b/drivers/net/ethernet/intel/ice/ice_sriov.c
@@ -1399,14 +1399,19 @@ int ice_set_vf_trust(struct net_device *netdev, int vf_id, bool trusted)

 	mutex_lock(&vf->cfg_lock);

-	while (!trusted && vf->num_mac_lldp)
-		ice_vf_update_mac_lldp_num(vf, ice_get_vf_vsi(vf), false);
-
 	vf->trusted = trusted;
-	ice_reset_vf(vf, ICE_VF_RESET_NOTIFY);
 	dev_info(ice_pf_to_dev(pf), "VF %u is now %strusted\n",
 		 vf_id, trusted ? "" : "un");

+	/* Only reset VF if removing trust and there are MAC LLDP filters
+	 * to clean up. Reset is needed to ensure filter removal completes.
+	 */
+	if (!trusted && vf->num_mac_lldp) {
+		while (vf->num_mac_lldp)
+			ice_vf_update_mac_lldp_num(vf, ice_get_vf_vsi(vf), false);
+		ice_reset_vf(vf, ICE_VF_RESET_NOTIFY);
+	}
+
 	mutex_unlock(&vf->cfg_lock);

 out_put_vf:
-- 
2.53.0

^ permalink raw reply related

* [PATCH net v3 5/5] iavf: refactor virtchnl polling into single function
From: Jose Ignacio Tornos Martinez @ 2026-04-14 11:00 UTC (permalink / raw)
  To: netdev
  Cc: intel-wired-lan, jesse.brandeburg, anthony.l.nguyen, davem,
	edumazet, kuba, pabeni, Jose Ignacio Tornos Martinez,
	Przemek Kitszel
In-Reply-To: <20260414110006.124286-1-jtornosm@redhat.com>

At this moment, the driver has two separate functions for polling virtchnl
messages from the admin queue:
- iavf_poll_virtchnl_msg() for init-time (no timeout, no completion
  handler)
- iavf_poll_virtchnl_response() for runtime (with timeout, calls
  completion)

Refactor by enhancing iavf_poll_virtchnl_msg() to handle both use cases:
1. Init-time mode (timeout_ms=0):
  - Polls until matching opcode found or queue empty
  - Returns raw message data without processing through completion handler
  - Exits immediately on empty queue (no sleep/retry)
2. Runtime mode (timeout_ms>0):
  - Polls with timeout using condition callback or opcode check
  - Processes all messages through iavf_virtchnl_completion()
  - Supports custom completion callback (takes priority) or falls back
    to checking adapter->current_op against expected opcode
  - Uses pending parameter to skip sleep when more messages queued
  - Uses 50-75 usec sleep (due to commit 9e3f23f44f32 ("i40e: reduce wait
    time for adminq command completion"))

By unifying message handling, both init-time and runtime messages can be
processed through the completion handler when appropriate, ensuring
consistent state updates and maintaining backward compatibility with all
existing call sites.

Suggested-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
Signed-off-by: Jose Ignacio Tornos Martinez <jtornosm@redhat.com>
---
 drivers/net/ethernet/intel/iavf/iavf.h        |   9 +-
 drivers/net/ethernet/intel/iavf/iavf_main.c   |  13 +-
 .../net/ethernet/intel/iavf/iavf_virtchnl.c   | 247 ++++++++----------
 3 files changed, 125 insertions(+), 144 deletions(-)

diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h
index b012a91b0252..9b25c5a65d2a 100644
--- a/drivers/net/ethernet/intel/iavf/iavf.h
+++ b/drivers/net/ethernet/intel/iavf/iavf.h
@@ -607,11 +607,10 @@ void iavf_disable_vlan_stripping(struct iavf_adapter *adapter);
 void iavf_virtchnl_completion(struct iavf_adapter *adapter,
 			      enum virtchnl_ops v_opcode,
 			      enum iavf_status v_retval, u8 *msg, u16 msglen);
-int iavf_poll_virtchnl_response(struct iavf_adapter *adapter,
-				bool (*condition)(struct iavf_adapter *, const void *),
-				const void *cond_data,
-				enum virtchnl_ops v_opcode,
-				unsigned int timeout_ms);
+int iavf_poll_virtchnl_msg(struct iavf_hw *hw, struct iavf_arq_event_info *event,
+			   enum virtchnl_ops op_to_poll, unsigned int timeout_ms,
+			   bool (*condition)(struct iavf_adapter *, const void *),
+			   const void *cond_data);
 int iavf_config_rss(struct iavf_adapter *adapter);
 void iavf_cfg_queues_bw(struct iavf_adapter *adapter);
 void iavf_cfg_queues_quanta_size(struct iavf_adapter *adapter);
diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c
index 80277d495a8d..b0db15fd8ddb 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_main.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_main.c
@@ -1075,6 +1075,7 @@ static bool iavf_mac_change_done(struct iavf_adapter *adapter, const void *data)
  */
 static int iavf_set_mac_sync(struct iavf_adapter *adapter, const u8 *addr)
 {
+	struct iavf_arq_event_info event;
 	int ret;
 
 	netdev_assert_locked(adapter->netdev);
@@ -1083,8 +1084,16 @@ static int iavf_set_mac_sync(struct iavf_adapter *adapter, const u8 *addr)
 	if (ret)
 		return ret;
 
-	return iavf_poll_virtchnl_response(adapter, iavf_mac_change_done, addr,
-					   VIRTCHNL_OP_UNKNOWN, 2500);
+	event.buf_len = IAVF_MAX_AQ_BUF_SIZE;
+	event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL);
+	if (!event.msg_buf)
+		return -ENOMEM;
+
+	ret = iavf_poll_virtchnl_msg(&adapter->hw, &event, VIRTCHNL_OP_UNKNOWN,
+				     2500, iavf_mac_change_done, addr);
+
+	kfree(event.msg_buf);
+	return ret;
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
index df124f840ddb..ef9a251060d9 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
@@ -54,55 +54,121 @@ int iavf_send_api_ver(struct iavf_adapter *adapter)
 }
 
 /**
- * iavf_poll_virtchnl_msg
+ * iavf_virtchnl_completion_done - Check if virtchnl operation completed
+ * @adapter: adapter structure
+ * @condition: optional callback for custom completion check
+ * @cond_data: context data for callback
+ * @op_to_poll: opcode to check against current_op (if no callback)
+ *
+ * Checks if operation is complete. Callback takes priority if provided,
+ * otherwise checks if current_op matches op_to_poll.
+ *
+ * Return: true if operation completed
+ */
+static inline bool
+iavf_virtchnl_completion_done(struct iavf_adapter *adapter,
+			      bool (*condition)(struct iavf_adapter *, const void *),
+			      const void *cond_data,
+			      enum virtchnl_ops op_to_poll)
+{
+	if (condition)
+		return condition(adapter, cond_data);
+
+	return adapter->current_op == op_to_poll;
+}
+
+/**
+ * iavf_poll_virtchnl_msg - Poll admin queue for virtchnl message
  * @hw: HW configuration structure
  * @event: event to populate on success
- * @op_to_poll: requested virtchnl op to poll for
+ * @op_to_poll: virtchnl opcode to poll for (used for init-time and runtime
+ *              without callback)
+ * @timeout_ms: timeout in milliseconds (0 = no timeout, exit on empty queue)
+ * @condition: optional callback to check custom completion (runtime use,
+ *             takes priority over op_to_poll check)
+ * @cond_data: context data for condition callback
+ *
+ * Enhanced polling function that handles both init-time and runtime use cases:
+ * - Init-time: Set op_to_poll, timeout_ms=0, condition=NULL
+ *   Polls until matching opcode found or queue empty
+ * - Runtime with callback: Set timeout_ms>0, condition callback, cond_data
+ *   Polls with timeout until condition returns true (op_to_poll not used)
+ * - Runtime without callback: Set op_to_poll, timeout_ms>0, condition=NULL
+ *   Polls with timeout until adapter->current_op == op_to_poll
+ *
+ * Runtime messages are processed through iavf_virtchnl_completion().
+ * For init-time use, returns 0 with raw message data in event buffer.
+ * For runtime use, returns 0 when completion condition is met.
  *
- * Initialize poll for virtchnl msg matching the requested_op. Returns 0
- * if a message of the correct opcode is in the queue or an error code
- * if no message matching the op code is waiting and other failures.
+ * Return: 0 on success, -EAGAIN on timeout, or error code
  */
-static int
-iavf_poll_virtchnl_msg(struct iavf_hw *hw, struct iavf_arq_event_info *event,
-		       enum virtchnl_ops op_to_poll)
+int iavf_poll_virtchnl_msg(struct iavf_hw *hw, struct iavf_arq_event_info *event,
+			   enum virtchnl_ops op_to_poll, unsigned int timeout_ms,
+			   bool (*condition)(struct iavf_adapter *, const void *),
+			   const void *cond_data)
 {
+	struct iavf_adapter *adapter = hw->back;
+	unsigned long timeout = timeout_ms ? jiffies + msecs_to_jiffies(timeout_ms) : 0;
 	enum virtchnl_ops received_op;
 	enum iavf_status status;
-	u32 v_retval;
+	u32 v_retval = 0;
+	u16 pending;
 
-	while (1) {
-		/* When the AQ is empty, iavf_clean_arq_element will return
-		 * nonzero and this loop will terminate.
-		 */
-		status = iavf_clean_arq_element(hw, event, NULL);
-		if (status != IAVF_SUCCESS)
-			return iavf_status_to_errno(status);
-		received_op =
-		    (enum virtchnl_ops)le32_to_cpu(event->desc.cookie_high);
+	do {
+		if (timeout_ms && iavf_virtchnl_completion_done(adapter, condition,
+								cond_data, op_to_poll))
+			return 0;
 
-		if (received_op == VIRTCHNL_OP_EVENT) {
-			struct iavf_adapter *adapter = hw->back;
-			struct virtchnl_pf_event *vpe =
-				(struct virtchnl_pf_event *)event->msg_buf;
+		status = iavf_clean_arq_element(hw, event, &pending);
+		if (status == IAVF_SUCCESS) {
+			received_op = (enum virtchnl_ops)le32_to_cpu(event->desc.cookie_high);
 
-			if (vpe->event != VIRTCHNL_EVENT_RESET_IMPENDING)
-				continue;
+			/* Handle reset events specially */
+			if (received_op == VIRTCHNL_OP_EVENT) {
+				struct virtchnl_pf_event *vpe =
+					(struct virtchnl_pf_event *)event->msg_buf;
 
-			dev_info(&adapter->pdev->dev, "Reset indication received from the PF\n");
-			if (!(adapter->flags & IAVF_FLAG_RESET_PENDING))
-				iavf_schedule_reset(adapter,
-						    IAVF_FLAG_RESET_PENDING);
+				if (vpe->event != VIRTCHNL_EVENT_RESET_IMPENDING)
+					continue;
+
+				dev_info(&adapter->pdev->dev,
+					 "Reset indication received from the PF\n");
+				if (!(adapter->flags & IAVF_FLAG_RESET_PENDING))
+					iavf_schedule_reset(adapter,
+							    IAVF_FLAG_RESET_PENDING);
+
+				return -EIO;
+			}
+
+			v_retval = le32_to_cpu(event->desc.cookie_low);
+
+			if (!timeout_ms) {
+				if (received_op == op_to_poll)
+					return virtchnl_status_to_errno((enum virtchnl_status_code)
+							v_retval);
+			} else {
+				iavf_virtchnl_completion(adapter, received_op,
+							 (enum iavf_status)v_retval,
+							 event->msg_buf, event->msg_len);
+			}
+
+			if (pending)
+				continue;
+		} else if (!timeout_ms) {
+			return iavf_status_to_errno(status);
+		}
 
-			return -EIO;
+		if (timeout_ms) {
+			memset(event->msg_buf, 0, IAVF_MAX_AQ_BUF_SIZE);
+			usleep_range(50, 75);
 		}
 
-		if (op_to_poll == received_op)
-			break;
-	}
+	} while (!timeout_ms || time_before(jiffies, timeout));
+
+	if (iavf_virtchnl_completion_done(adapter, condition, cond_data, op_to_poll))
+		return 0;
 
-	v_retval = le32_to_cpu(event->desc.cookie_low);
-	return virtchnl_status_to_errno((enum virtchnl_status_code)v_retval);
+	return -EAGAIN;
 }
 
 /**
@@ -124,7 +190,8 @@ int iavf_verify_api_ver(struct iavf_adapter *adapter)
 	if (!event.msg_buf)
 		return -ENOMEM;
 
-	err = iavf_poll_virtchnl_msg(&adapter->hw, &event, VIRTCHNL_OP_VERSION);
+	err = iavf_poll_virtchnl_msg(&adapter->hw, &event, VIRTCHNL_OP_VERSION,
+				     0, NULL, NULL);
 	if (!err) {
 		struct virtchnl_version_info *pf_vvi =
 			(struct virtchnl_version_info *)event.msg_buf;
@@ -294,7 +361,8 @@ int iavf_get_vf_config(struct iavf_adapter *adapter)
 	if (!event.msg_buf)
 		return -ENOMEM;
 
-	err = iavf_poll_virtchnl_msg(hw, &event, VIRTCHNL_OP_GET_VF_RESOURCES);
+	err = iavf_poll_virtchnl_msg(hw, &event, VIRTCHNL_OP_GET_VF_RESOURCES,
+				     0, NULL, NULL);
 	memcpy(adapter->vf_res, event.msg_buf, min(event.msg_len, len));
 
 	/* some PFs send more queues than we should have so validate that
@@ -322,7 +390,8 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter)
 		return -ENOMEM;
 
 	err = iavf_poll_virtchnl_msg(&adapter->hw, &event,
-				     VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS);
+				     VIRTCHNL_OP_GET_OFFLOAD_VLAN_V2_CAPS,
+				     0, NULL, NULL);
 	if (!err)
 		memcpy(&adapter->vlan_v2_caps, event.msg_buf,
 		       min(event.msg_len, len));
@@ -342,7 +411,8 @@ int iavf_get_vf_supported_rxdids(struct iavf_adapter *adapter)
 	event.buf_len = sizeof(rxdids);
 
 	err = iavf_poll_virtchnl_msg(&adapter->hw, &event,
-				     VIRTCHNL_OP_GET_SUPPORTED_RXDIDS);
+				     VIRTCHNL_OP_GET_SUPPORTED_RXDIDS,
+				     0, NULL, NULL);
 	if (!err)
 		adapter->supp_rxdids = rxdids;
 
@@ -359,7 +429,8 @@ int iavf_get_vf_ptp_caps(struct iavf_adapter *adapter)
 	event.buf_len = sizeof(caps);
 
 	err = iavf_poll_virtchnl_msg(&adapter->hw, &event,
-				     VIRTCHNL_OP_1588_PTP_GET_CAPS);
+				     VIRTCHNL_OP_1588_PTP_GET_CAPS,
+				     0, NULL, NULL);
 	if (!err)
 		adapter->ptp.hw_caps = caps;
 
@@ -2961,101 +3032,3 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter,
 	adapter->current_op = VIRTCHNL_OP_UNKNOWN;
 }
 
-/**
- * iavf_virtchnl_done - Check if virtchnl operation completed
- * @adapter: board private structure
- * @condition: optional callback for custom completion check
- *   (takes priority)
- * @cond_data: context data for callback
- * @v_opcode: virtchnl opcode value we're waiting for if no condition
- *   configured (typically VIRTCHNL_OP_UNKNOWN), if condition not used
- *
- * Checks completion status. Callback takes priority if provided. Otherwise
- * waits for current_op to reach v_opcode (typically VIRTCHNL_OP_UNKNOWN
- * after completion).
- *
- * Return: true if operation completed
- */
-static inline bool iavf_virtchnl_done(struct iavf_adapter *adapter,
-				      bool (*condition)(struct iavf_adapter *, const void *),
-				      const void *cond_data,
-				      enum virtchnl_ops v_opcode)
-{
-	if (condition)
-		return condition(adapter, cond_data);
-
-	return adapter->current_op == v_opcode;
-}
-
-/**
- * iavf_poll_virtchnl_response - Poll admin queue for virtchnl response
- * @adapter: board private structure
- * @condition: optional callback to check if desired response received
- *   (takes priority)
- * @cond_data: context data passed to condition callback
- * @v_opcode: virtchnl opcode value to wait for if no condition configured
- *   (typically VIRTCHNL_OP_UNKNOWN), if condition, not used
- * @timeout_ms: maximum time to wait in milliseconds
- *
- * Polls admin queue and processes all messages until condition returns true
- * or timeout expires. If condition is NULL, waits for current_op to become
- * v_opcode (typically VIRTCHNL_OP_UNKNOWN after operation completes).
- * Caller must hold netdev_lock. This can sleep for up to timeout_ms while
- * polling hardware.
- *
- * Return: 0 on success (condition met), -EAGAIN on timeout or error
- */
-int iavf_poll_virtchnl_response(struct iavf_adapter *adapter,
-				bool (*condition)(struct iavf_adapter *, const void *),
-				const void *cond_data,
-				enum virtchnl_ops v_opcode,
-				unsigned int timeout_ms)
-{
-	struct iavf_hw *hw = &adapter->hw;
-	struct iavf_arq_event_info event;
-	enum virtchnl_ops v_op;
-	enum iavf_status v_ret;
-	unsigned long timeout;
-	u16 pending;
-	int ret;
-
-	netdev_assert_locked(adapter->netdev);
-
-	event.buf_len = IAVF_MAX_AQ_BUF_SIZE;
-	event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL);
-	if (!event.msg_buf)
-		return -ENOMEM;
-
-	timeout = jiffies + msecs_to_jiffies(timeout_ms);
-	do {
-		if (iavf_virtchnl_done(adapter, condition, cond_data, v_opcode)) {
-			ret = 0;
-			goto out;
-		}
-
-		ret = iavf_clean_arq_element(hw, &event, &pending);
-		if (!ret) {
-			v_op = (enum virtchnl_ops)le32_to_cpu(event.desc.cookie_high);
-			v_ret = (enum iavf_status)le32_to_cpu(event.desc.cookie_low);
-
-			iavf_virtchnl_completion(adapter, v_op, v_ret,
-						 event.msg_buf, event.msg_len);
-
-			memset(event.msg_buf, 0, IAVF_MAX_AQ_BUF_SIZE);
-
-			if (pending)
-				continue;
-		}
-
-		usleep_range(50, 75);
-	} while (time_before(jiffies, timeout));
-
-	if (iavf_virtchnl_done(adapter, condition, cond_data, v_opcode))
-		ret = 0;
-	else
-		ret = -EAGAIN;
-
-out:
-	kfree(event.msg_buf);
-	return ret;
-}
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v3 1/3] net: dsa: microchip: implement KSZ87xx Module 3 low-loss cable errata
From: Marek Vasut @ 2026-04-14 11:05 UTC (permalink / raw)
  To: Fidelio Lawson, Woojung Huh, UNGLinuxDriver, Andrew Lunn,
	Vladimir Oltean, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Marek Vasut, Maxime Chevallier, Simon Horman,
	Heiner Kallweit, Russell King
  Cc: netdev, linux-kernel, Fidelio Lawson
In-Reply-To: <20260414-ksz87xx_errata_low_loss_connections-v3-1-0e3838ca98c9@exotec.com>

On 4/14/26 11:12 AM, Fidelio Lawson wrote:
> Implement the "Module 3: Equalizer fix for short cables" erratum from
> Microchip document DS80000687C for KSZ87xx switches.
> 
> The issue affects short or low-loss cable links (e.g. CAT5e/CAT6),
> where the PHY receiver equalizer may amplify high-amplitude signals
> excessively, resulting in internal distortion and link establishment
> failures.
> 
> KSZ87xx devices require a workaround for the Module 3 low-loss cable
> condition, controlled through the switch TABLE_LINK_MD_V indirect
> registers.
> 
> The affected registers are part of the switch address space and are not
> directly accessible from the PHY driver. To keep the PHY-facing API
> clean and avoid leaking switch-specific details, model this errata
> control as vendor-specific Clause 22 PHY registers.
> 
> A vendor-specific Clause 22 PHY register is introduced as a mode
> selector in PHY_REG_LOW_LOSS_CTRL, and ksz8_r_phy() / ksz8_w_phy()
> translate accesses to these bits into the appropriate indirect
> TABLE_LINK_MD_V accesses.
> 
> The control register defines the following modes:
> 0: disabled (default behavior)
> 1: EQ training workaround
> 2: LPF 90 MHz
> 3: LPF 62 MHz
> 4: LPF 55 MHz
> 5: LPF 44 MHz
I may not fully understand this, but aren't the EQ and LPF settings
orthogonal?

^ permalink raw reply

* RE: [PATCH iwl-next v2 1/2] idpf: remove conditonal MBX deinit from idpf_vc_core_deinit()
From: Loktionov, Aleksandr @ 2026-04-14 11:07 UTC (permalink / raw)
  To: Tantilov, Emil S, intel-wired-lan@lists.osuosl.org
  Cc: netdev@vger.kernel.org, Kitszel, Przemyslaw, Bhat, Jay,
	Barrera, Ivan D, Zaremba, Larysa, Nguyen, Anthony L,
	andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, Lobakin, Aleksander,
	linux-pci@vger.kernel.org, Chittim, Madhu, decot@google.com,
	willemb@google.com, sheenamo@google.com, lukas@wunner.de
In-Reply-To: <20260414031631.2107-2-emil.s.tantilov@intel.com>



> -----Original Message-----
> From: Tantilov, Emil S <emil.s.tantilov@intel.com>
> Sent: Tuesday, April 14, 2026 5:17 AM
> To: intel-wired-lan@lists.osuosl.org
> Cc: netdev@vger.kernel.org; Kitszel, Przemyslaw
> <przemyslaw.kitszel@intel.com>; Bhat, Jay <jay.bhat@intel.com>;
> Barrera, Ivan D <ivan.d.barrera@intel.com>; Loktionov, Aleksandr
> <aleksandr.loktionov@intel.com>; Zaremba, Larysa
> <larysa.zaremba@intel.com>; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>; andrew+netdev@lunn.ch;
> davem@davemloft.net; edumazet@google.com; kuba@kernel.org;
> pabeni@redhat.com; Lobakin, Aleksander <aleksander.lobakin@intel.com>;
> linux-pci@vger.kernel.org; Chittim, Madhu <madhu.chittim@intel.com>;
> decot@google.com; willemb@google.com; sheenamo@google.com;
> lukas@wunner.de
> Subject: [PATCH iwl-next v2 1/2] idpf: remove conditonal MBX deinit
> from idpf_vc_core_deinit()
"conditonal" -> "conditional"

Everything else looks fine
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>

> 
> Previously it was assumed that idpf_vc_core_deinit() is always being
> called during reset handling, with remove being an exception. Ideally
> the driver needs to communicate the changes to FW in all instances
> where the MBX is not already disabled. Remove the remove_in_prog check
> from
> idpf_vc_core_deinit() as the MBX was already disabled while handling
> the reset via libie_ctlq_xn_shutdown() by the service task. This is
> also needed by the following patch, introducing PCI callbacks support.
> 
> Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
> Reviewed-by: Jay Bhat <jay.bhat@intel.com>
> Reviewed-by: Madhu Chittim <madhu.chittim@intel.com>
> ---
>  drivers/net/ethernet/intel/idpf/idpf_virtchnl.c | 11 +----------
>  1 file changed, 1 insertion(+), 10 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
> b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
> index 129c8f6b0faa..fceaf3ec1cd4 100644
> --- a/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
> +++ b/drivers/net/ethernet/intel/idpf/idpf_virtchnl.c
> @@ -3229,24 +3229,15 @@ int idpf_vc_core_init(struct idpf_adapter
> *adapter)
>   */
>  void idpf_vc_core_deinit(struct idpf_adapter *adapter)  {
> -	bool remove_in_prog;
> -
>  	if (!test_bit(IDPF_VC_CORE_INIT, adapter->flags))
>  		return;
> 
> -	/* Avoid transaction timeouts when called during reset */
> -	remove_in_prog = test_bit(IDPF_REMOVE_IN_PROG, adapter->flags);
> -	if (!remove_in_prog)
> -		idpf_deinit_dflt_mbx(adapter);
> -
>  	idpf_ptp_release(adapter);
>  	idpf_deinit_task(adapter);
>  	idpf_idc_deinit_core_aux_device(adapter);
>  	idpf_rel_rx_pt_lkup(adapter);
>  	idpf_intr_rel(adapter);
> -
> -	if (remove_in_prog)
> -		idpf_deinit_dflt_mbx(adapter);
> +	idpf_deinit_dflt_mbx(adapter);
> 
>  	cancel_delayed_work_sync(&adapter->serv_task);
> 
> --
> 2.37.3


^ permalink raw reply

* RE: [PATCH iwl-next v2 2/2] idpf: implement pci error handlers
From: Loktionov, Aleksandr @ 2026-04-14 11:09 UTC (permalink / raw)
  To: Tantilov, Emil S, intel-wired-lan@lists.osuosl.org
  Cc: netdev@vger.kernel.org, Kitszel, Przemyslaw, Bhat, Jay,
	Barrera, Ivan D, Zaremba, Larysa, Nguyen, Anthony L,
	andrew+netdev@lunn.ch, davem@davemloft.net, edumazet@google.com,
	kuba@kernel.org, pabeni@redhat.com, Lobakin, Aleksander,
	linux-pci@vger.kernel.org, Chittim, Madhu, decot@google.com,
	willemb@google.com, sheenamo@google.com, lukas@wunner.de
In-Reply-To: <20260414031631.2107-3-emil.s.tantilov@intel.com>



> -----Original Message-----
> From: Tantilov, Emil S <emil.s.tantilov@intel.com>
> Sent: Tuesday, April 14, 2026 5:17 AM
> To: intel-wired-lan@lists.osuosl.org
> Cc: netdev@vger.kernel.org; Kitszel, Przemyslaw
> <przemyslaw.kitszel@intel.com>; Bhat, Jay <jay.bhat@intel.com>;
> Barrera, Ivan D <ivan.d.barrera@intel.com>; Loktionov, Aleksandr
> <aleksandr.loktionov@intel.com>; Zaremba, Larysa
> <larysa.zaremba@intel.com>; Nguyen, Anthony L
> <anthony.l.nguyen@intel.com>; andrew+netdev@lunn.ch;
> davem@davemloft.net; edumazet@google.com; kuba@kernel.org;
> pabeni@redhat.com; Lobakin, Aleksander <aleksander.lobakin@intel.com>;
> linux-pci@vger.kernel.org; Chittim, Madhu <madhu.chittim@intel.com>;
> decot@google.com; willemb@google.com; sheenamo@google.com;
> lukas@wunner.de
> Subject: [PATCH iwl-next v2 2/2] idpf: implement pci error handlers
> 
> Add callbacks to handle PCI errors and FLR reset. When preparing to
> handle reset on the bus, the driver must stop all operations that can
> lead to MMIO access in order to prevent HW errors. To accomplish this
> introduce helper
> idpf_reset_prepare() that gets called prior to FLR or when PCI error
> is detected. Upon resume the recovery is done through the existing
> reset path by starting the event task.
> 
> The following callbacks are implemented:
> .reset_prepare runs the first portion of the generic reset path
> leading up to the part where we wait for the reset to complete.
> .reset_done/resume runs the recovery part of the reset handling.
> .error_detected is the callback dealing with PCI errors, similar to
> the prepare call, we stop all operations, prior to attempting a
> recovery.
> .slot_reset is the callback attempting to restore the device, provided
> a PCI reset was initiated by the AER driver.
> 
> Whereas previously the init logic guaranteed netdevs during reset, the
> addition of idpf_detach_and_close() to the PCI callbacks flow makes it
> possible for the function to be called without netdevs. Add check to
> avoid NULL pointer dereference in that case.
> 
> Co-developed-by: Alan Brady <alan.brady@intel.com>
> Signed-off-by: Alan Brady <alan.brady@intel.com>
> Signed-off-by: Emil Tantilov <emil.s.tantilov@intel.com>
> Reviewed-by: Jay Bhat <jay.bhat@intel.com>
> Reviewed-by: Madhu Chittim <madhu.chittim@intel.com>
> ---
>  drivers/net/ethernet/intel/idpf/idpf.h      |   3 +
>  drivers/net/ethernet/intel/idpf/idpf_lib.c  |  13 ++-
> drivers/net/ethernet/intel/idpf/idpf_main.c | 112 ++++++++++++++++++++
>  3 files changed, 126 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/intel/idpf/idpf.h
> b/drivers/net/ethernet/intel/idpf/idpf.h
> index 1d0e32e47e87..164d2f3e233a 100644
> --- a/drivers/net/ethernet/intel/idpf/idpf.h
> +++ b/drivers/net/ethernet/intel/idpf/idpf.h
> @@ -88,6 +88,7 @@ enum idpf_state {
>   * @IDPF_REMOVE_IN_PROG: Driver remove in progress
>   * @IDPF_MB_INTR_MODE: Mailbox in interrupt mode
>   * @IDPF_VC_CORE_INIT: virtchnl core has been init
> + * @IDPF_PCI_CB_RESET: Reset via the PCI callbacks
>   * @IDPF_FLAGS_NBITS: Must be last
>   */
>  enum idpf_flags {
> @@ -97,6 +98,7 @@ enum idpf_flags {
>  	IDPF_REMOVE_IN_PROG,
>  	IDPF_MB_INTR_MODE,
>  	IDPF_VC_CORE_INIT,

...

> +/**
> + * idpf_pci_err_resume - Resume operations after PCI error recovery
> + * @pdev: PCI device struct
> + */
> +static void idpf_pci_err_resume(struct pci_dev *pdev) {
> +	struct idpf_adapter *adapter = pci_get_drvdata(pdev);
> +
> +	/* Force a PFR when resuming from PCI error. */
> +	if (test_and_set_bit(IDPF_PCI_CB_RESET, adapter->flags))
> +		adapter->dev_ops.reg_ops.trigger_reset(adapter,
> IDPF_HR_FUNC_RESET);
You say "Force a PFR", but PFR is only triggered on the AER path, not on the FLR path.

Everything else looks fine
Reviewed-by: Aleksandr Loktionov <aleksandr.loktionov@intel.com>

> +
> +	queue_delayed_work(adapter->vc_event_wq,
> +			   &adapter->vc_event_task,
> +			   msecs_to_jiffies(300));
> +}

...

>  };
>  module_pci_driver(idpf_driver);
> --
> 2.37.3


^ permalink raw reply

* Re: [PATCH 1/1] net: strparser: fix skb_head leak in strp_abort_strp()
From: patchwork-bot+netdevbpf @ 2026-04-14 11:10 UTC (permalink / raw)
  To: Ren Wei
  Cc: netdev, davem, edumazet, kuba, pabeni, horms, nate.karstens, sd,
	linux, Julia.Lawall, tom, yifanwucs, tomapufckgml, yuantan098,
	bird, rakukuip
In-Reply-To: <ade3857a9404999ce9a1c27ec523efc896072678.1775482694.git.rakukuip@gmail.com>

Hello:

This patch was applied to netdev/net.git (main)
by Paolo Abeni <pabeni@redhat.com>:

On Sat, 11 Apr 2026 23:10:10 +0800 you wrote:
> From: Luxiao Xu <rakukuip@gmail.com>
> 
> When the stream parser is aborted, for example after a message assembly timeout,
> it can still hold a reference to a partially assembled message in
> strp->skb_head.
> 
> That skb is not released in strp_abort_strp(), which leaks the partially
> assembled message and can be triggered repeatedly to exhaust memory.
> 
> [...]

Here is the summary with links:
  - [1/1] net: strparser: fix skb_head leak in strp_abort_strp()
    https://git.kernel.org/netdev/net/c/fe72340daaf1

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox