All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net] virtio-net: fix a rtnl_lock() deadlock during probing
@ 2025-06-30  9:51 Zigit Zo
  2025-06-30 10:00 ` Markus Elfring
  2025-06-30 14:50 ` Michael S. Tsirkin
  0 siblings, 2 replies; 7+ messages in thread
From: Zigit Zo @ 2025-06-30  9:51 UTC (permalink / raw)
  To: mst, jasowang, xuanzhuo, eperezma
  Cc: zuozhijie, virtualization, netdev, linux-kernel

This bug happens if the VMM sends a VIRTIO_NET_S_ANNOUNCE request while
the virtio-net driver is still probing with rtnl_lock() hold, this will
cause a recursive mutex in netdev_notify_peers().

Fix it by skip acking the annouce in virtnet_config_changed_work() when
probing. The annouce will still get done when ndo_open() enables the
virtio_config_driver_enable().

We've observed a softlockup with Ubuntu 24.04, and can be reproduced with
QEMU sending the announce_self rapidly while booting.

[  494.167473] INFO: task swapper/0:1 blocked for more than 368 seconds.
[  494.167667]       Not tainted 6.8.0-57-generic #59-Ubuntu
[  494.167810] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  494.168015] task:swapper/0       state:D stack:0     pid:1     tgid:1     ppid:0      flags:0x00004000
[  494.168260] Call Trace:
[  494.168329]  <TASK>
[  494.168389]  __schedule+0x27c/0x6b0
[  494.168495]  schedule+0x33/0x110
[  494.168585]  schedule_preempt_disabled+0x15/0x30
[  494.168709]  __mutex_lock.constprop.0+0x42f/0x740
[  494.168835]  __mutex_lock_slowpath+0x13/0x20
[  494.168949]  mutex_lock+0x3c/0x50
[  494.169039]  rtnl_lock+0x15/0x20
[  494.169128]  netdev_notify_peers+0x12/0x30
[  494.169240]  virtnet_config_changed_work+0x152/0x1a0
[  494.169377]  virtnet_probe+0xa48/0xe00
[  494.169484]  ? vp_get+0x4d/0x100
[  494.169574]  virtio_dev_probe+0x1e9/0x310
[  494.169682]  really_probe+0x1c7/0x410
[  494.169783]  __driver_probe_device+0x8c/0x180
[  494.169901]  driver_probe_device+0x24/0xd0
[  494.170011]  __driver_attach+0x10b/0x210
[  494.170117]  ? __pfx___driver_attach+0x10/0x10
[  494.170237]  bus_for_each_dev+0x8d/0xf0
[  494.170341]  driver_attach+0x1e/0x30
[  494.170440]  bus_add_driver+0x14e/0x290
[  494.170548]  driver_register+0x5e/0x130
[  494.170651]  ? __pfx_virtio_net_driver_init+0x10/0x10
[  494.170788]  register_virtio_driver+0x20/0x40
[  494.170905]  virtio_net_driver_init+0x97/0xb0
[  494.171022]  do_one_initcall+0x5e/0x340
[  494.171128]  do_initcalls+0x107/0x230
[  494.171228]  ? __pfx_kernel_init+0x10/0x10
[  494.171340]  kernel_init_freeable+0x134/0x210
[  494.171462]  kernel_init+0x1b/0x200
[  494.171560]  ret_from_fork+0x47/0x70
[  494.171659]  ? __pfx_kernel_init+0x10/0x10
[  494.171769]  ret_from_fork_asm+0x1b/0x30
[  494.171875]  </TASK>

Fixes: df28de7b0050 ("virtio-net: synchronize operstate with admin state on up/down")
Signed-off-by: Zigit Zo <zuozhijie@bytedance.com>
---
 drivers/net/virtio_net.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index e53ba600605a..0290d289ebee 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -6211,7 +6211,8 @@ static const struct net_device_ops virtnet_netdev = {
 	.ndo_tx_timeout		= virtnet_tx_timeout,
 };
 
-static void virtnet_config_changed_work(struct work_struct *work)
+static void __virtnet_config_changed_work(struct work_struct *work,
+					  bool check_announce)
 {
 	struct virtnet_info *vi =
 		container_of(work, struct virtnet_info, config_work);
@@ -6221,7 +6222,7 @@ static void virtnet_config_changed_work(struct work_struct *work)
 				 struct virtio_net_config, status, &v) < 0)
 		return;
 
-	if (v & VIRTIO_NET_S_ANNOUNCE) {
+	if (check_announce && (v & VIRTIO_NET_S_ANNOUNCE)) {
 		netdev_notify_peers(vi->dev);
 		virtnet_ack_link_announce(vi);
 	}
@@ -6244,6 +6245,11 @@ static void virtnet_config_changed_work(struct work_struct *work)
 	}
 }
 
+static void virtnet_config_changed_work(struct work_struct *work)
+{
+	__virtnet_config_changed_work(work, true);
+}
+
 static void virtnet_config_changed(struct virtio_device *vdev)
 {
 	struct virtnet_info *vi = vdev->priv;
@@ -7030,7 +7036,10 @@ static int virtnet_probe(struct virtio_device *vdev)
 	   otherwise get link status from config. */
 	netif_carrier_off(dev);
 	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
-		virtnet_config_changed_work(&vi->config_work);
+		/* The check_annouce work will get scheduled when ndo_open()
+		 * doing the virtio_config_driver_enable().
+		 */
+		__virtnet_config_changed_work(&vi->config_work, false);
 	} else {
 		vi->status = VIRTIO_NET_S_LINK_UP;
 		virtnet_update_settings(vi);

base-commit: 2def09ead4ad5907988b655d1e1454003aaf8297
-- 
2.49.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH net] virtio-net: fix a rtnl_lock() deadlock during probing
  2025-06-30  9:51 [PATCH net] virtio-net: fix a rtnl_lock() deadlock during probing Zigit Zo
@ 2025-06-30 10:00 ` Markus Elfring
  2025-06-30 14:50 ` Michael S. Tsirkin
  1 sibling, 0 replies; 7+ messages in thread
From: Markus Elfring @ 2025-06-30 10:00 UTC (permalink / raw)
  To: Zigit Zo, netdev, virtualization
  Cc: LKML, Eugenio Pérez, Jason Wang, Michael S. Tsirkin,
	Xuan Zhuo

…
> Fix it by skip acking the annouce in virtnet_config_changed_work() when
…

                            announce?

Regards,
Markus

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net] virtio-net: fix a rtnl_lock() deadlock during probing
  2025-06-30  9:51 [PATCH net] virtio-net: fix a rtnl_lock() deadlock during probing Zigit Zo
  2025-06-30 10:00 ` Markus Elfring
@ 2025-06-30 14:50 ` Michael S. Tsirkin
  2025-06-30 14:54   ` Michael S. Tsirkin
  1 sibling, 1 reply; 7+ messages in thread
From: Michael S. Tsirkin @ 2025-06-30 14:50 UTC (permalink / raw)
  To: Zigit Zo; +Cc: jasowang, xuanzhuo, eperezma, virtualization, netdev,
	linux-kernel

On Mon, Jun 30, 2025 at 05:51:09PM +0800, Zigit Zo wrote:
> This bug happens if the VMM sends a VIRTIO_NET_S_ANNOUNCE request while
> the virtio-net driver is still probing with rtnl_lock() hold, this will
> cause a recursive mutex in netdev_notify_peers().
> 
> Fix it by skip acking the annouce in virtnet_config_changed_work() when
> probing. The annouce will still get done when ndo_open() enables the
> virtio_config_driver_enable().

I am not so sure it will be - while driver is not loaded, device does
not have to send interrupts, and there's no rule I'm aware of that says
we'll get one after DRIVER_OK.

How about, we instead just schedule the work to do it later?


Also, there is another bug here.
If ndo_open did not run, we actually should not send any announcements.

Do we care if carrier on is set on probe or on open?
If not, let's just defer this to ndo_open?


> We've observed a softlockup with Ubuntu 24.04, and can be reproduced with
> QEMU sending the announce_self rapidly while booting.
> 
> [  494.167473] INFO: task swapper/0:1 blocked for more than 368 seconds.
> [  494.167667]       Not tainted 6.8.0-57-generic #59-Ubuntu
> [  494.167810] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> [  494.168015] task:swapper/0       state:D stack:0     pid:1     tgid:1     ppid:0      flags:0x00004000
> [  494.168260] Call Trace:
> [  494.168329]  <TASK>
> [  494.168389]  __schedule+0x27c/0x6b0
> [  494.168495]  schedule+0x33/0x110
> [  494.168585]  schedule_preempt_disabled+0x15/0x30
> [  494.168709]  __mutex_lock.constprop.0+0x42f/0x740
> [  494.168835]  __mutex_lock_slowpath+0x13/0x20
> [  494.168949]  mutex_lock+0x3c/0x50
> [  494.169039]  rtnl_lock+0x15/0x20
> [  494.169128]  netdev_notify_peers+0x12/0x30
> [  494.169240]  virtnet_config_changed_work+0x152/0x1a0
> [  494.169377]  virtnet_probe+0xa48/0xe00
> [  494.169484]  ? vp_get+0x4d/0x100
> [  494.169574]  virtio_dev_probe+0x1e9/0x310
> [  494.169682]  really_probe+0x1c7/0x410
> [  494.169783]  __driver_probe_device+0x8c/0x180
> [  494.169901]  driver_probe_device+0x24/0xd0
> [  494.170011]  __driver_attach+0x10b/0x210
> [  494.170117]  ? __pfx___driver_attach+0x10/0x10
> [  494.170237]  bus_for_each_dev+0x8d/0xf0
> [  494.170341]  driver_attach+0x1e/0x30
> [  494.170440]  bus_add_driver+0x14e/0x290
> [  494.170548]  driver_register+0x5e/0x130
> [  494.170651]  ? __pfx_virtio_net_driver_init+0x10/0x10
> [  494.170788]  register_virtio_driver+0x20/0x40
> [  494.170905]  virtio_net_driver_init+0x97/0xb0
> [  494.171022]  do_one_initcall+0x5e/0x340
> [  494.171128]  do_initcalls+0x107/0x230
> [  494.171228]  ? __pfx_kernel_init+0x10/0x10
> [  494.171340]  kernel_init_freeable+0x134/0x210
> [  494.171462]  kernel_init+0x1b/0x200
> [  494.171560]  ret_from_fork+0x47/0x70
> [  494.171659]  ? __pfx_kernel_init+0x10/0x10
> [  494.171769]  ret_from_fork_asm+0x1b/0x30
> [  494.171875]  </TASK>
> 
> Fixes: df28de7b0050 ("virtio-net: synchronize operstate with admin state on up/down")
> Signed-off-by: Zigit Zo <zuozhijie@bytedance.com>
> ---
>  drivers/net/virtio_net.c | 15 ++++++++++++---
>  1 file changed, 12 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index e53ba600605a..0290d289ebee 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -6211,7 +6211,8 @@ static const struct net_device_ops virtnet_netdev = {
>  	.ndo_tx_timeout		= virtnet_tx_timeout,
>  };
>  
> -static void virtnet_config_changed_work(struct work_struct *work)
> +static void __virtnet_config_changed_work(struct work_struct *work,
> +					  bool check_announce)
>  {
>  	struct virtnet_info *vi =
>  		container_of(work, struct virtnet_info, config_work);

So this will be schedule_announce instead of check_announce?




> @@ -6221,7 +6222,7 @@ static void virtnet_config_changed_work(struct work_struct *work)
>  				 struct virtio_net_config, status, &v) < 0)
>  		return;
>  
> -	if (v & VIRTIO_NET_S_ANNOUNCE) {
> +	if (check_announce && (v & VIRTIO_NET_S_ANNOUNCE)) {
>  		netdev_notify_peers(vi->dev);
>  		virtnet_ack_link_announce(vi);
>  	}
> @@ -6244,6 +6245,11 @@ static void virtnet_config_changed_work(struct work_struct *work)
>  	}
>  }
>  
> +static void virtnet_config_changed_work(struct work_struct *work)
> +{
> +	__virtnet_config_changed_work(work, true);
> +}
> +
>  static void virtnet_config_changed(struct virtio_device *vdev)
>  {
>  	struct virtnet_info *vi = vdev->priv;
> @@ -7030,7 +7036,10 @@ static int virtnet_probe(struct virtio_device *vdev)
>  	   otherwise get link status from config. */
>  	netif_carrier_off(dev);
>  	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> -		virtnet_config_changed_work(&vi->config_work);
> +		/* The check_annouce work will get scheduled when ndo_open()
> +		 * doing the virtio_config_driver_enable().
> +		 */
> +		__virtnet_config_changed_work(&vi->config_work, false);
>  	} else {
>  		vi->status = VIRTIO_NET_S_LINK_UP;
>  		virtnet_update_settings(vi);
> 
> base-commit: 2def09ead4ad5907988b655d1e1454003aaf8297
> -- 
> 2.49.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net] virtio-net: fix a rtnl_lock() deadlock during probing
  2025-06-30 14:50 ` Michael S. Tsirkin
@ 2025-06-30 14:54   ` Michael S. Tsirkin
  2025-07-01  7:48     ` [External] " Zigit Zo
  0 siblings, 1 reply; 7+ messages in thread
From: Michael S. Tsirkin @ 2025-06-30 14:54 UTC (permalink / raw)
  To: Zigit Zo; +Cc: jasowang, xuanzhuo, eperezma, virtualization, netdev,
	linux-kernel

On Mon, Jun 30, 2025 at 10:50:55AM -0400, Michael S. Tsirkin wrote:
> On Mon, Jun 30, 2025 at 05:51:09PM +0800, Zigit Zo wrote:
> > This bug happens if the VMM sends a VIRTIO_NET_S_ANNOUNCE request while
> > the virtio-net driver is still probing with rtnl_lock() hold, this will
> > cause a recursive mutex in netdev_notify_peers().
> > 
> > Fix it by skip acking the annouce in virtnet_config_changed_work() when
> > probing. The annouce will still get done when ndo_open() enables the
> > virtio_config_driver_enable().
> 
> I am not so sure it will be - while driver is not loaded, device does
> not have to send interrupts, and there's no rule I'm aware of that says
> we'll get one after DRIVER_OK.
> 
> How about, we instead just schedule the work to do it later?
> 
> 
> Also, there is another bug here.
> If ndo_open did not run, we actually should not send any announcements.
> 
> Do we care if carrier on is set on probe or on open?
> If not, let's just defer this to ndo_open?

Hmm yes I think we do, device is visible to userspace is it not?

Hmm.  We can keep the announce bit set in vi->status and on open, check
it and then schedule a work to do the announcement.


> 
> > We've observed a softlockup with Ubuntu 24.04, and can be reproduced with
> > QEMU sending the announce_self rapidly while booting.
> > 
> > [  494.167473] INFO: task swapper/0:1 blocked for more than 368 seconds.
> > [  494.167667]       Not tainted 6.8.0-57-generic #59-Ubuntu
> > [  494.167810] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> > [  494.168015] task:swapper/0       state:D stack:0     pid:1     tgid:1     ppid:0      flags:0x00004000
> > [  494.168260] Call Trace:
> > [  494.168329]  <TASK>
> > [  494.168389]  __schedule+0x27c/0x6b0
> > [  494.168495]  schedule+0x33/0x110
> > [  494.168585]  schedule_preempt_disabled+0x15/0x30
> > [  494.168709]  __mutex_lock.constprop.0+0x42f/0x740
> > [  494.168835]  __mutex_lock_slowpath+0x13/0x20
> > [  494.168949]  mutex_lock+0x3c/0x50
> > [  494.169039]  rtnl_lock+0x15/0x20
> > [  494.169128]  netdev_notify_peers+0x12/0x30
> > [  494.169240]  virtnet_config_changed_work+0x152/0x1a0
> > [  494.169377]  virtnet_probe+0xa48/0xe00
> > [  494.169484]  ? vp_get+0x4d/0x100
> > [  494.169574]  virtio_dev_probe+0x1e9/0x310
> > [  494.169682]  really_probe+0x1c7/0x410
> > [  494.169783]  __driver_probe_device+0x8c/0x180
> > [  494.169901]  driver_probe_device+0x24/0xd0
> > [  494.170011]  __driver_attach+0x10b/0x210
> > [  494.170117]  ? __pfx___driver_attach+0x10/0x10
> > [  494.170237]  bus_for_each_dev+0x8d/0xf0
> > [  494.170341]  driver_attach+0x1e/0x30
> > [  494.170440]  bus_add_driver+0x14e/0x290
> > [  494.170548]  driver_register+0x5e/0x130
> > [  494.170651]  ? __pfx_virtio_net_driver_init+0x10/0x10
> > [  494.170788]  register_virtio_driver+0x20/0x40
> > [  494.170905]  virtio_net_driver_init+0x97/0xb0
> > [  494.171022]  do_one_initcall+0x5e/0x340
> > [  494.171128]  do_initcalls+0x107/0x230
> > [  494.171228]  ? __pfx_kernel_init+0x10/0x10
> > [  494.171340]  kernel_init_freeable+0x134/0x210
> > [  494.171462]  kernel_init+0x1b/0x200
> > [  494.171560]  ret_from_fork+0x47/0x70
> > [  494.171659]  ? __pfx_kernel_init+0x10/0x10
> > [  494.171769]  ret_from_fork_asm+0x1b/0x30
> > [  494.171875]  </TASK>
> > 
> > Fixes: df28de7b0050 ("virtio-net: synchronize operstate with admin state on up/down")
> > Signed-off-by: Zigit Zo <zuozhijie@bytedance.com>
> > ---
> >  drivers/net/virtio_net.c | 15 ++++++++++++---
> >  1 file changed, 12 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index e53ba600605a..0290d289ebee 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -6211,7 +6211,8 @@ static const struct net_device_ops virtnet_netdev = {
> >  	.ndo_tx_timeout		= virtnet_tx_timeout,
> >  };
> >  
> > -static void virtnet_config_changed_work(struct work_struct *work)
> > +static void __virtnet_config_changed_work(struct work_struct *work,
> > +					  bool check_announce)
> >  {
> >  	struct virtnet_info *vi =
> >  		container_of(work, struct virtnet_info, config_work);
> 
> So this will be schedule_announce instead of check_announce?
> 
> 
> 
> 
> > @@ -6221,7 +6222,7 @@ static void virtnet_config_changed_work(struct work_struct *work)
> >  				 struct virtio_net_config, status, &v) < 0)
> >  		return;
> >  
> > -	if (v & VIRTIO_NET_S_ANNOUNCE) {
> > +	if (check_announce && (v & VIRTIO_NET_S_ANNOUNCE)) {
> >  		netdev_notify_peers(vi->dev);
> >  		virtnet_ack_link_announce(vi);
> >  	}
> > @@ -6244,6 +6245,11 @@ static void virtnet_config_changed_work(struct work_struct *work)
> >  	}
> >  }
> >  
> > +static void virtnet_config_changed_work(struct work_struct *work)
> > +{
> > +	__virtnet_config_changed_work(work, true);
> > +}
> > +
> >  static void virtnet_config_changed(struct virtio_device *vdev)
> >  {
> >  	struct virtnet_info *vi = vdev->priv;
> > @@ -7030,7 +7036,10 @@ static int virtnet_probe(struct virtio_device *vdev)
> >  	   otherwise get link status from config. */
> >  	netif_carrier_off(dev);
> >  	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> > -		virtnet_config_changed_work(&vi->config_work);
> > +		/* The check_annouce work will get scheduled when ndo_open()
> > +		 * doing the virtio_config_driver_enable().
> > +		 */
> > +		__virtnet_config_changed_work(&vi->config_work, false);
> >  	} else {
> >  		vi->status = VIRTIO_NET_S_LINK_UP;
> >  		virtnet_update_settings(vi);
> > 
> > base-commit: 2def09ead4ad5907988b655d1e1454003aaf8297
> > -- 
> > 2.49.0


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [External] Re: [PATCH net] virtio-net: fix a rtnl_lock() deadlock during probing
  2025-06-30 14:54   ` Michael S. Tsirkin
@ 2025-07-01  7:48     ` Zigit Zo
  2025-07-01  7:52       ` Michael S. Tsirkin
  0 siblings, 1 reply; 7+ messages in thread
From: Zigit Zo @ 2025-07-01  7:48 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: jasowang, xuanzhuo, eperezma, virtualization, netdev,
	linux-kernel

On 6/30/25 10:54 PM, Michael S. Tsirkin wrote:
> On Mon, Jun 30, 2025 at 10:50:55AM -0400, Michael S. Tsirkin wrote:
>> On Mon, Jun 30, 2025 at 05:51:09PM +0800, Zigit Zo wrote:
>>> This bug happens if the VMM sends a VIRTIO_NET_S_ANNOUNCE request while
>>> the virtio-net driver is still probing with rtnl_lock() hold, this will
>>> cause a recursive mutex in netdev_notify_peers().
>>>
>>> Fix it by skip acking the annouce in virtnet_config_changed_work() when
>>> probing. The annouce will still get done when ndo_open() enables the
>>> virtio_config_driver_enable().
>>
>> I am not so sure it will be - while driver is not loaded, device does
>> not have to send interrupts, and there's no rule I'm aware of that says
>> we'll get one after DRIVER_OK.

Yep, at first we're thinking that when the VIRTIO_NET_S_ANNOUNCE flag set,
we can always assure an interrupt has fired by VMM, to notify the driver
to do the announcement.

But later we realized that the S_ANNOUNCE flag can be sent before the
driver's probing, and for QEMU seems to set the status flag regardless of
whether driver is ready, so the problem you're talking still may happens.
>> How about, we instead just schedule the work to do it later?I'm not sure if scheduling the work later will break df28de7b0050, the work
was being scheduled before that commit, and we have no much idea of why that
commit removes the schedule_work, we just keep it for safe...

Then, for plan A, we change the check_announce to schedule_announce, and if
that's true, we do another schedule_work to call virtnet_config_changed_work
again to finish the announcement, like

	if (v & VIRTIO_NET_S_ANNOUNCE) {
		if (unlikely(schedule_announce))
			schedule_work(&vi->config_work);
		else {
			netdev_notify_peers(vi->dev);
			virtnet_ack_link_announce(vi);
		}
	}

>>
>> Also, there is another bug here.
>> If ndo_open did not run, we actually should not send any announcements.
>>
>> Do we care if carrier on is set on probe or on open?
>> If not, let's just defer this to ndo_open?
> 
> Hmm yes I think we do, device is visible to userspace is it not?
> 
> Hmm.  We can keep the announce bit set in vi->status and on open, check
> it and then schedule a work to do the announcement.

Okay, so there's a plan B, we save the bit and re-check it in ndo_open, like

	/* __virtnet_config_changed_work() */
	if (v & VIRTIO_NET_S_ANNOUNCE) {
		vi->status |= VIRTIO_NET_S_ANNOUNCE;
		if (unlikely(!check_announce))
			goto check_link;

		netdev_notify_peers(vi->dev);
		virtnet_ack_link_announce(vi);
		vi->status &= ~VIRTIO_NET_S_ANNOUNCE;
	}

	/* virtnet_open() */
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
		if (vi->status & VIRTIO_NET_S_LINK_UP)
			netif_carrier_on(vi->dev);
		if
		if (vi->status & VIRTIO_NET_S_ANNOUNCE)
			schedule_work(&vi->config_work);
		virtio_config_driver_enable(vi->vdev);
	}

This is a dirty demo, any ideas are welcomed :)

(I think in virtnet_open() we can make the S_LINK_UP being scheduled as well?)

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [External] Re: [PATCH net] virtio-net: fix a rtnl_lock() deadlock during probing
  2025-07-01  7:48     ` [External] " Zigit Zo
@ 2025-07-01  7:52       ` Michael S. Tsirkin
  0 siblings, 0 replies; 7+ messages in thread
From: Michael S. Tsirkin @ 2025-07-01  7:52 UTC (permalink / raw)
  To: Zigit Zo; +Cc: jasowang, xuanzhuo, eperezma, virtualization, netdev,
	linux-kernel

On Tue, Jul 01, 2025 at 03:48:41PM +0800, Zigit Zo wrote:
> On 6/30/25 10:54 PM, Michael S. Tsirkin wrote:
> > On Mon, Jun 30, 2025 at 10:50:55AM -0400, Michael S. Tsirkin wrote:
> >> On Mon, Jun 30, 2025 at 05:51:09PM +0800, Zigit Zo wrote:
> >>> This bug happens if the VMM sends a VIRTIO_NET_S_ANNOUNCE request while
> >>> the virtio-net driver is still probing with rtnl_lock() hold, this will
> >>> cause a recursive mutex in netdev_notify_peers().
> >>>
> >>> Fix it by skip acking the annouce in virtnet_config_changed_work() when
> >>> probing. The annouce will still get done when ndo_open() enables the
> >>> virtio_config_driver_enable().
> >>
> >> I am not so sure it will be - while driver is not loaded, device does
> >> not have to send interrupts, and there's no rule I'm aware of that says
> >> we'll get one after DRIVER_OK.
> 
> Yep, at first we're thinking that when the VIRTIO_NET_S_ANNOUNCE flag set,
> we can always assure an interrupt has fired by VMM, to notify the driver
> to do the announcement.
> 
> But later we realized that the S_ANNOUNCE flag can be sent before the
> driver's probing, and for QEMU seems to set the status flag regardless of
> whether driver is ready, so the problem you're talking still may happens.
> >> How about, we instead just schedule the work to do it later?I'm not sure if scheduling the work later will break df28de7b0050, the work
> was being scheduled before that commit, and we have no much idea of why that
> commit removes the schedule_work, we just keep it for safe...

Well managing async things is always tricky. Direct call is safer.
If you reintroduce it, you need to audit all call paths for safely.


> Then, for plan A, we change the check_announce to schedule_announce, and if
> that's true, we do another schedule_work to call virtnet_config_changed_work
> again to finish the announcement, like
> 
> 	if (v & VIRTIO_NET_S_ANNOUNCE) {
> 		if (unlikely(schedule_announce))
> 			schedule_work(&vi->config_work);
> 		else {
> 			netdev_notify_peers(vi->dev);
> 			virtnet_ack_link_announce(vi);
> 		}
> 	}
> 
> >>
> >> Also, there is another bug here.
> >> If ndo_open did not run, we actually should not send any announcements.
> >>
> >> Do we care if carrier on is set on probe or on open?
> >> If not, let's just defer this to ndo_open?
> > 
> > Hmm yes I think we do, device is visible to userspace is it not?
> > 
> > Hmm.  We can keep the announce bit set in vi->status and on open, check
> > it and then schedule a work to do the announcement.
> 
> Okay, so there's a plan B, we save the bit and re-check it in ndo_open, like
> 
> 	/* __virtnet_config_changed_work() */
> 	if (v & VIRTIO_NET_S_ANNOUNCE) {
> 		vi->status |= VIRTIO_NET_S_ANNOUNCE;
> 		if (unlikely(!check_announce))
> 			goto check_link;
> 
> 		netdev_notify_peers(vi->dev);
> 		virtnet_ack_link_announce(vi);
> 		vi->status &= ~VIRTIO_NET_S_ANNOUNCE;
> 	}
> 
> 	/* virtnet_open() */
> 	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
> 		if (vi->status & VIRTIO_NET_S_LINK_UP)
> 			netif_carrier_on(vi->dev);
> 		if
> 		if (vi->status & VIRTIO_NET_S_ANNOUNCE)
> 			schedule_work(&vi->config_work);
> 		virtio_config_driver_enable(vi->vdev);
> 	}
> 
> This is a dirty demo, any ideas are welcomed :)
> 
> (I think in virtnet_open() we can make the S_LINK_UP being scheduled as well?)


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH net] virtio-net: fix a rtnl_lock() deadlock during probing
@ 2025-07-01 18:06 kernel test robot
  0 siblings, 0 replies; 7+ messages in thread
From: kernel test robot @ 2025-07-01 18:06 UTC (permalink / raw)
  To: oe-kbuild; +Cc: lkp, Dan Carpenter

BCC: lkp@intel.com
CC: oe-kbuild-all@lists.linux.dev
In-Reply-To: <20250630095109.214013-1-zuozhijie@bytedance.com>
References: <20250630095109.214013-1-zuozhijie@bytedance.com>
TO: Zigit Zo <zuozhijie@bytedance.com>
TO: mst@redhat.com
TO: jasowang@redhat.com
TO: xuanzhuo@linux.alibaba.com
TO: eperezma@redhat.com
CC: zuozhijie@bytedance.com
CC: virtualization@lists.linux.dev
CC: netdev@vger.kernel.org
CC: linux-kernel@vger.kernel.org

Hi Zigit,

kernel test robot noticed the following build warnings:

[auto build test WARNING on 2def09ead4ad5907988b655d1e1454003aaf8297]

url:    https://github.com/intel-lab-lkp/linux/commits/Zigit-Zo/virtio-net-fix-a-rtnl_lock-deadlock-during-probing/20250630-175257
base:   2def09ead4ad5907988b655d1e1454003aaf8297
patch link:    https://lore.kernel.org/r/20250630095109.214013-1-zuozhijie%40bytedance.com
patch subject: [PATCH net] virtio-net: fix a rtnl_lock() deadlock during probing
:::::: branch date: 32 hours ago
:::::: commit date: 32 hours ago
config: x86_64-randconfig-161-20250701 (https://download.01.org/0day-ci/archive/20250702/202507020158.2hoKFZXE-lkp@intel.com/config)
compiler: clang version 20.1.7 (https://github.com/llvm/llvm-project 6146a88f60492b520a36f8f8f3231e15f3cc6082)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <error27@gmail.com>
| Closes: https://lore.kernel.org/r/202507020158.2hoKFZXE-lkp@intel.com/

New smatch warnings:
drivers/net/virtio_net.c:6221 __virtnet_config_changed_work() error: uninitialized symbol 'virtio_cread_v'.
drivers/net/virtio_net.c:6225 __virtnet_config_changed_work() error: uninitialized symbol 'v'.

Old smatch warnings:
drivers/net/virtio_net.c:2108 build_skb_from_xdp_buff() error: uninitialized symbol 'nr_frags'.
drivers/net/virtio_net.c:2190 virtnet_build_xdp_buff_mrg() error: uninitialized symbol 'shinfo'.
drivers/net/virtio_net.c:3122 virtnet_update_settings() error: uninitialized symbol 'virtio_cread_v'.

vim +/virtio_cread_v +6221 drivers/net/virtio_net.c

76288b4e573e06 Stephen Hemminger  2009-01-06  6213  
d72ba54248be27 Zigit Zo           2025-06-30  6214  static void __virtnet_config_changed_work(struct work_struct *work,
d72ba54248be27 Zigit Zo           2025-06-30  6215  					  bool check_announce)
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6216  {
586d17c5a01bf1 Jason Wang         2012-04-11  6217  	struct virtnet_info *vi =
586d17c5a01bf1 Jason Wang         2012-04-11  6218  		container_of(work, struct virtnet_info, config_work);
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6219  	u16 v;
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6220  
855e0c5288177b Rusty Russell      2013-10-14 @6221  	if (virtio_cread_feature(vi->vdev, VIRTIO_NET_F_STATUS,
855e0c5288177b Rusty Russell      2013-10-14  6222  				 struct virtio_net_config, status, &v) < 0)
507613bf31f4bc Michael S. Tsirkin 2014-10-15  6223  		return;
586d17c5a01bf1 Jason Wang         2012-04-11  6224  
d72ba54248be27 Zigit Zo           2025-06-30 @6225  	if (check_announce && (v & VIRTIO_NET_S_ANNOUNCE)) {
ee89bab14e8576 Amerigo Wang       2012-08-09  6226  		netdev_notify_peers(vi->dev);
586d17c5a01bf1 Jason Wang         2012-04-11  6227  		virtnet_ack_link_announce(vi);
586d17c5a01bf1 Jason Wang         2012-04-11  6228  	}
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6229  
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6230  	/* Ignore unknown (future) status bits */
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6231  	v &= VIRTIO_NET_S_LINK_UP;
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6232  
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6233  	if (vi->status == v)
507613bf31f4bc Michael S. Tsirkin 2014-10-15  6234  		return;
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6235  
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6236  	vi->status = v;
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6237  
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6238  	if (vi->status & VIRTIO_NET_S_LINK_UP) {
faa9b39f0e9ddf Jason Baron        2018-01-05  6239  		virtnet_update_settings(vi);
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6240  		netif_carrier_on(vi->dev);
986a4f4d452dec Jason Wang         2012-12-07  6241  		netif_tx_wake_all_queues(vi->dev);
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6242  	} else {
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6243  		netif_carrier_off(vi->dev);
986a4f4d452dec Jason Wang         2012-12-07  6244  		netif_tx_stop_all_queues(vi->dev);
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6245  	}
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6246  }
9f4d26d0f3016c Mark McLoughlin    2009-01-19  6247  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-07-01 18:06 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-06-30  9:51 [PATCH net] virtio-net: fix a rtnl_lock() deadlock during probing Zigit Zo
2025-06-30 10:00 ` Markus Elfring
2025-06-30 14:50 ` Michael S. Tsirkin
2025-06-30 14:54   ` Michael S. Tsirkin
2025-07-01  7:48     ` [External] " Zigit Zo
2025-07-01  7:52       ` Michael S. Tsirkin
  -- strict thread matches above, loose matches on Subject: below --
2025-07-01 18:06 kernel test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.