* [PATCH] vduse: Fix msg list race in vduse_dev_read_iter
@ 2026-01-30  8:15 Zhang Tianci
  2026-01-30 10:16 ` Michael S. Tsirkin
  2026-01-30 11:51 ` Eugenio Perez Martin
  0 siblings, 2 replies; 4+ messages in thread
From: Zhang Tianci @ 2026-01-30  8:15 UTC (permalink / raw)
  To: mst, jasowang
  Cc: xuanzhuo, eperezma, marco.crivellari, anders.roxell,
      virtualization, linux-kernel, Zhang Tianci, Xie Yongji

Move the message to recv_list before dropping msg_lock and copying the
request to userspace, avoiding a transient unlinked state that can race
with the msg_sync timeout path. Roll back to send_list on copy failures.

Signed-off-by: Zhang Tianci <zhangtianci.1997@bytedance.com>
Reviewed-by: Xie Yongji <xieyongji@bytedance.com>
---
 drivers/vdpa/vdpa_user/vduse_dev.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index ae357d014564c..b6a558341c06c 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -325,6 +325,7 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	struct file *file = iocb->ki_filp;
 	struct vduse_dev *dev = file->private_data;
 	struct vduse_dev_msg *msg;
+	struct vduse_dev_request req;
 	int size = sizeof(struct vduse_dev_request);
 	ssize_t ret;
 
@@ -339,7 +340,7 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
 		ret = -EAGAIN;
 		if (file->f_flags & O_NONBLOCK)
-			goto unlock;
+			break;
 
 		spin_unlock(&dev->msg_lock);
 		ret = wait_event_interruptible_exclusive(dev->waitq,
@@ -349,17 +350,30 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 
 		spin_lock(&dev->msg_lock);
 	}
+	if (!msg) {
+		spin_unlock(&dev->msg_lock);
+		return ret;
+	}
+
+	memcpy(&req, &msg->req, sizeof(req));
+	/*
+	 * Move @msg to recv_list before dropping msg_lock.
+	 * This avoids a window where @msg is detached from any list and
+	 * vduse_dev_msg_sync() timeout path may operate on an unlinked node.
+	 */
+	vduse_enqueue_msg(&dev->recv_list, msg);
 	spin_unlock(&dev->msg_lock);
-	ret = copy_to_iter(&msg->req, size, to);
-	spin_lock(&dev->msg_lock);
+
+	ret = copy_to_iter(&req, size, to);
 	if (ret != size) {
+		spin_lock(&dev->msg_lock);
+		/* Roll back: move msg back to send_list if still pending. */
+		msg = vduse_find_msg(&dev->recv_list, req.request_id);
+		if (msg)
+			vduse_enqueue_msg(&dev->send_list, msg);
+		spin_unlock(&dev->msg_lock);
 		ret = -EFAULT;
-		vduse_enqueue_msg(&dev->send_list, msg);
-		goto unlock;
 	}
-	vduse_enqueue_msg(&dev->recv_list, msg);
-unlock:
-	spin_unlock(&dev->msg_lock);
 
 	return ret;
 }
-- 
2.39.5
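The control flow of the patched read path can be hard to follow from the diff alone. Below is a minimal userspace sketch of the same snapshot/enqueue/rollback order, under the assumption that a simple FIFO stands in for the driver's send_list/recv_list and a boolean stands in for `copy_to_iter()` success; none of the helper names here are the driver's actual API.

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

struct req { unsigned request_id; };

struct msg {
    struct msg *next;
    struct req req;
};

/* toy FIFO standing in for send_list/recv_list */
struct list { struct msg *head, *tail; };

static void enqueue(struct list *l, struct msg *m)
{
    m->next = NULL;
    if (l->tail)
        l->tail->next = m;
    else
        l->head = m;
    l->tail = m;
}

static struct msg *dequeue(struct list *l)
{
    struct msg *m = l->head;
    if (m) {
        l->head = m->next;
        if (!l->head)
            l->tail = NULL;
        m->next = NULL;
    }
    return m;
}

/* find by request_id and unlink, like vduse_find_msg() in the patch */
static struct msg *find_and_unlink(struct list *l, unsigned id)
{
    struct msg **pp;
    for (pp = &l->head; *pp; pp = &(*pp)->next) {
        if ((*pp)->req.request_id == id) {
            struct msg *m = *pp, *t;
            *pp = m->next;
            l->tail = NULL;            /* recompute tail */
            for (t = l->head; t; t = t->next)
                l->tail = t;
            m->next = NULL;
            return m;
        }
    }
    return NULL;
}

/* copy_ok simulates copy_to_iter() succeeding or faulting */
static int read_iter(struct list *send, struct list *recv, bool copy_ok,
                     struct req *out)
{
    struct req req;
    struct msg *m = dequeue(send);

    if (!m)
        return -1;          /* -EAGAIN in the driver */
    memcpy(&req, &m->req, sizeof(req));
    enqueue(recv, m);       /* linked again before the "unlocked" copy */
    if (!copy_ok) {
        /* roll back: the message becomes readable again */
        m = find_and_unlink(recv, req.request_id);
        if (m)
            enqueue(send, m);
        return -2;          /* -EFAULT */
    }
    *out = req;
    return 0;
}
```

The key property the sketch demonstrates: the message is never off both lists at once, because the request is snapshotted into a local `req` before the message moves to recv_list, and the copy operates on that snapshot.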
* Re: [PATCH] vduse: Fix msg list race in vduse_dev_read_iter
  2026-01-30  8:15 [PATCH] vduse: Fix msg list race in vduse_dev_read_iter Zhang Tianci
@ 2026-01-30 10:16 ` Michael S. Tsirkin
  2026-01-30 11:51 ` Eugenio Perez Martin
  1 sibling, 0 replies; 4+ messages in thread
From: Michael S. Tsirkin @ 2026-01-30 10:16 UTC (permalink / raw)
  To: Zhang Tianci
  Cc: jasowang, xuanzhuo, eperezma, marco.crivellari, anders.roxell,
      virtualization, linux-kernel, Xie Yongji

Thanks for the patch! Yet something to improve:

On Fri, Jan 30, 2026 at 04:15:24PM +0800, Zhang Tianci wrote:
> Move the message to recv_list before dropping msg_lock and copying the
> request to userspace, avoiding a transient unlinked state that can race
> with the msg_sync timeout path. Roll back to send_list on copy failures.

This is not how you write commit messages, though. Describe the problem,
then how you fix it, please. Something like: if the msg_sync timeout
triggers after a message has been removed from send_list and before it
was added to recv_list, then .... As a result .... To fix, move the
message ...

>
> Signed-off-by: Zhang Tianci <zhangtianci.1997@bytedance.com>
> Reviewed-by: Xie Yongji <xieyongji@bytedance.com>
> ---
>  drivers/vdpa/vdpa_user/vduse_dev.c | 30 ++++++++++++++++++++++--------
>  1 file changed, 22 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index ae357d014564c..b6a558341c06c 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -325,6 +325,7 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  	struct file *file = iocb->ki_filp;
>  	struct vduse_dev *dev = file->private_data;
>  	struct vduse_dev_msg *msg;
> +	struct vduse_dev_request req;
>  	int size = sizeof(struct vduse_dev_request);
>  	ssize_t ret;
>
> @@ -339,7 +340,7 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>
>  		ret = -EAGAIN;
>  		if (file->f_flags & O_NONBLOCK)
> -			goto unlock;
> +			break;
>
>  		spin_unlock(&dev->msg_lock);
>  		ret = wait_event_interruptible_exclusive(dev->waitq,
> @@ -349,17 +350,30 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>
>  		spin_lock(&dev->msg_lock);
>  	}
> +	if (!msg) {
> +		spin_unlock(&dev->msg_lock);
> +		return ret;
> +	}
> +
> +	memcpy(&req, &msg->req, sizeof(req));
> +	/*
> +	 * Move @msg to recv_list before dropping msg_lock.
> +	 * This avoids a window where @msg is detached from any list and
> +	 * vduse_dev_msg_sync() timeout path may operate on an unlinked node.
> +	 */

When standing by itself, not as part of the patch, this comment confuses
more than it clarifies.

> +	vduse_enqueue_msg(&dev->recv_list, msg);
>  	spin_unlock(&dev->msg_lock);
> -	ret = copy_to_iter(&msg->req, size, to);
> -	spin_lock(&dev->msg_lock);
> +
> +	ret = copy_to_iter(&req, size, to);
>  	if (ret != size) {
> +		spin_lock(&dev->msg_lock);
> +		/* Roll back: move msg back to send_list if still pending. */
> +		msg = vduse_find_msg(&dev->recv_list, req.request_id);

Looks like this always scans the whole list. Make a variant using
list_for_each_entry_reverse maybe?

> +		if (msg)
> +			vduse_enqueue_msg(&dev->send_list, msg);

Why is it not a concern that it will be at the tail of the send_list
now, reordering the messages?

> +		spin_unlock(&dev->msg_lock);
>  		ret = -EFAULT;
> -		vduse_enqueue_msg(&dev->send_list, msg);
> -		goto unlock;
>  	}
> -	vduse_enqueue_msg(&dev->recv_list, msg);
> -unlock:
> -	spin_unlock(&dev->msg_lock);
>
>  	return ret;
>  }
> --
> 2.39.5
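The reverse-scan variant suggested above can be sketched in userspace with a minimal intrusive doubly-linked list modeled on the kernel's `list.h`. `find_msg_rev` is a hypothetical name, not an existing driver function; the point is that a message re-enqueued for rollback sits at the tail of recv_list, so a backward scan finds it in one step.

```c
#include <assert.h>
#include <stddef.h>

struct list_head { struct list_head *prev, *next; };

static void init_list_head(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
    n->prev = h->prev;
    n->next = h;
    h->prev->next = n;
    h->prev = n;
}

static void list_del_node(struct list_head *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
    n->prev = n->next = n;
}

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

struct vduse_dev_msg {
    struct list_head list;
    unsigned request_id;
};

/* Scan backwards, as list_for_each_entry_reverse would, and unlink
 * the match -- the just-enqueued tail entry is found immediately. */
static struct vduse_dev_msg *find_msg_rev(struct list_head *head,
                                          unsigned id)
{
    struct list_head *pos;

    for (pos = head->prev; pos != head; pos = pos->prev) {
        struct vduse_dev_msg *msg =
            container_of(pos, struct vduse_dev_msg, list);
        if (msg->request_id == id) {
            list_del_node(&msg->list);
            return msg;
        }
    }
    return NULL;
}
```

In the kernel itself one would use the real `list_for_each_entry_reverse` macro and `list_del` rather than these hand-rolled stand-ins.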
* Re: [PATCH] vduse: Fix msg list race in vduse_dev_read_iter
  2026-01-30  8:15 [PATCH] vduse: Fix msg list race in vduse_dev_read_iter Zhang Tianci
  2026-01-30 10:16 ` Michael S. Tsirkin
@ 2026-01-30 11:51 ` Eugenio Perez Martin
       [not found]   ` <CAP4dvscscRNbJBKwDLE-gebPT24fOJg08xap_0t00cPbYZ4xvQ@mail.gmail.com>
  1 sibling, 1 reply; 4+ messages in thread
From: Eugenio Perez Martin @ 2026-01-30 11:51 UTC (permalink / raw)
  To: Zhang Tianci
  Cc: mst, jasowang, xuanzhuo, marco.crivellari, anders.roxell,
      virtualization, linux-kernel, Xie Yongji

On Fri, Jan 30, 2026 at 9:15 AM Zhang Tianci
<zhangtianci.1997@bytedance.com> wrote:
>
> Move the message to recv_list before dropping msg_lock and copying the
> request to userspace, avoiding a transient unlinked state that can race
> with the msg_sync timeout path. Roll back to send_list on copy failures.
>

Missed Fixes: tag and Cc: stable@vger.kernel.org. Or maybe we can
consider this a change in the behavior? I don't think any VDUSE
instance should trust that it will never receive again a message it
already received partially once, but still...

> Signed-off-by: Zhang Tianci <zhangtianci.1997@bytedance.com>
> Reviewed-by: Xie Yongji <xieyongji@bytedance.com>
> ---
>  drivers/vdpa/vdpa_user/vduse_dev.c | 30 ++++++++++++++++++++++--------
>  1 file changed, 22 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> index ae357d014564c..b6a558341c06c 100644
> --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> @@ -325,6 +325,7 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>  	struct file *file = iocb->ki_filp;
>  	struct vduse_dev *dev = file->private_data;
>  	struct vduse_dev_msg *msg;
> +	struct vduse_dev_request req;
>  	int size = sizeof(struct vduse_dev_request);
>  	ssize_t ret;
>
> @@ -339,7 +340,7 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>
>  		ret = -EAGAIN;
>  		if (file->f_flags & O_NONBLOCK)
> -			goto unlock;
> +			break;
>
>  		spin_unlock(&dev->msg_lock);
>  		ret = wait_event_interruptible_exclusive(dev->waitq,
> @@ -349,17 +350,30 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
>
>  		spin_lock(&dev->msg_lock);
>  	}
> +	if (!msg) {
> +		spin_unlock(&dev->msg_lock);
> +		return ret;
> +	}
> +
> +	memcpy(&req, &msg->req, sizeof(req));
> +	/*
> +	 * Move @msg to recv_list before dropping msg_lock.
> +	 * This avoids a window where @msg is detached from any list and
> +	 * vduse_dev_msg_sync() timeout path may operate on an unlinked node.

But in the timeout case, msg->completed is false so list_del is never
called, isn't it?

Is there even any event that may cause more than one packet in either
queue? Maybe we can simplify a lot of this if we don't have that
assumption.

> +	 */
> +	vduse_enqueue_msg(&dev->recv_list, msg);
>  	spin_unlock(&dev->msg_lock);
> -	ret = copy_to_iter(&msg->req, size, to);
> -	spin_lock(&dev->msg_lock);
> +
> +	ret = copy_to_iter(&req, size, to);
>  	if (ret != size) {
> +		spin_lock(&dev->msg_lock);
> +		/* Roll back: move msg back to send_list if still pending. */
> +		msg = vduse_find_msg(&dev->recv_list, req.request_id);
> +		if (msg)
> +			vduse_enqueue_msg(&dev->send_list, msg);
> +		spin_unlock(&dev->msg_lock);
>  		ret = -EFAULT;
> -		vduse_enqueue_msg(&dev->send_list, msg);
> -		goto unlock;
>  	}
> -	vduse_enqueue_msg(&dev->recv_list, msg);
> -unlock:
> -	spin_unlock(&dev->msg_lock);
>
>  	return ret;
>  }
> --
> 2.39.5
>
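The hazard debated above, whether the timeout path can hit an unlinked node, can be modeled in a few lines. This is a speculative userspace sketch of the pre-patch window, not the driver's code: a `linked` flag stands in for list membership, and the timeout handler mirrors the `if (!msg->completed) list_del(...)` shape quoted later in the thread.

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct msg {
    struct msg *next;
    bool completed;
    bool linked;       /* true while on send_list or recv_list */
    int result;        /* 0 = OK, -1 = FAILED */
};

/* Old read path, step 1: unlink under the lock, then "drop" the lock
 * and copy to userspace before re-enqueueing. */
static void old_read_begin(struct msg **send_list, struct msg *m)
{
    *send_list = m->next;  /* dequeue */
    m->linked = false;     /* window: the msg is on neither list */
}

/* msg_sync timeout path: an incomplete message is deleted from
 * whatever list it is on and marked failed. */
static int timeout_fires(struct msg *m)
{
    if (!m->completed) {
        /* in the kernel this is list_del(&msg->list); calling it on a
         * node that is already unlinked is the corruption hazard */
        if (!m->linked)
            return -1;     /* the case the patch eliminates */
        m->linked = false;
        m->result = -1;
    }
    return 0;
}
```

With the patch applied, the message is re-enqueued onto recv_list before the lock is dropped, so `linked` would still be true whenever the timeout can observe the message.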
[parent not found: <CAP4dvscscRNbJBKwDLE-gebPT24fOJg08xap_0t00cPbYZ4xvQ@mail.gmail.com>]
* Re: [External] Re: [PATCH] vduse: Fix msg list race in vduse_dev_read_iter
       [not found] ` <CAP4dvscscRNbJBKwDLE-gebPT24fOJg08xap_0t00cPbYZ4xvQ@mail.gmail.com>
@ 2026-02-02  9:28 ` Eugenio Perez Martin
  0 siblings, 0 replies; 4+ messages in thread
From: Eugenio Perez Martin @ 2026-02-02  9:28 UTC (permalink / raw)
  To: Zhang Tianci
  Cc: Michael Tsirkin, Jason Wang, Xuan Zhuo, marco.crivellari,
      anders.roxell, virtualization, linux-kernel, Xie Yongji

On Sat, Jan 31, 2026 at 8:15 AM Zhang Tianci
<zhangtianci.1997@bytedance.com> wrote:
>
> Hi Eugenio,
>
> On Fri, Jan 30, 2026 at 7:52 PM Eugenio Perez Martin
> <eperezma@redhat.com> wrote:
> >
> > On Fri, Jan 30, 2026 at 9:15 AM Zhang Tianci
> > <zhangtianci.1997@bytedance.com> wrote:
> > >
> > > Move the message to recv_list before dropping msg_lock and copying the
> > > request to userspace, avoiding a transient unlinked state that can race
> > > with the msg_sync timeout path. Roll back to send_list on copy failures.
> > >
> >
> > Missed Fixes: tag and Cc: stable@vger.kernel.org. Or maybe we can
> > consider this a change in the behavior? I don't think any VDUSE
> > instance should trust that it will never receive again a message it
> > already received partially once, but still...
>
> I'll add the Fixes tag and Cc the stable list in the v2 patch.
>
> > > Signed-off-by: Zhang Tianci <zhangtianci.1997@bytedance.com>
> > > Reviewed-by: Xie Yongji <xieyongji@bytedance.com>
> > > ---
> > >  drivers/vdpa/vdpa_user/vduse_dev.c | 30 ++++++++++++++++++++++--------
> > >  1 file changed, 22 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > index ae357d014564c..b6a558341c06c 100644
> > > --- a/drivers/vdpa/vdpa_user/vduse_dev.c
> > > +++ b/drivers/vdpa/vdpa_user/vduse_dev.c
> > > @@ -325,6 +325,7 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > >  	struct file *file = iocb->ki_filp;
> > >  	struct vduse_dev *dev = file->private_data;
> > >  	struct vduse_dev_msg *msg;
> > > +	struct vduse_dev_request req;
> > >  	int size = sizeof(struct vduse_dev_request);
> > >  	ssize_t ret;
> > >
> > > @@ -339,7 +340,7 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > >
> > >  		ret = -EAGAIN;
> > >  		if (file->f_flags & O_NONBLOCK)
> > > -			goto unlock;
> > > +			break;
> > >
> > >  		spin_unlock(&dev->msg_lock);
> > >  		ret = wait_event_interruptible_exclusive(dev->waitq,
> > > @@ -349,17 +350,30 @@ static ssize_t vduse_dev_read_iter(struct kiocb *iocb, struct iov_iter *to)
> > >
> > >  		spin_lock(&dev->msg_lock);
> > >  	}
> > > +	if (!msg) {
> > > +		spin_unlock(&dev->msg_lock);
> > > +		return ret;
> > > +	}
> > > +
> > > +	memcpy(&req, &msg->req, sizeof(req));
> > > +	/*
> > > +	 * Move @msg to recv_list before dropping msg_lock.
> > > +	 * This avoids a window where @msg is detached from any list and
> > > +	 * vduse_dev_msg_sync() timeout path may operate on an unlinked node.
> >
> > But in the timeout case, msg->completed is false so list_del is never
> > called, isn't it?
>
> Did you misread it?

Yes, I probably misread the code :). I should not post on Friday
evenings!

> Here's the code:
>         spin_lock(&dev->msg_lock);
>         if (!msg->completed) {        <- here msg->completed is false
>                 list_del(&msg->list); <- boom
>                 msg->resp.result = VDUSE_REQ_RESULT_FAILED;
>                 /* Mark the device as malfunction when there is a timeout */
>                 if (!ret)
>                         vduse_dev_broken(dev);
>         }
>         ret = (msg->resp.result == VDUSE_REQ_RESULT_OK) ? 0 : -EIO;
>         spin_unlock(&dev->msg_lock);
>
> > Is there even any event that may cause more than one packet in either
> > queue? Maybe we can simplify a lot of this if we don't have that
> > assumption.
>
> That makes sense. However, I believe that even though such a scenario
> doesn't exist at present, we cannot depend on the queue containing just
> a single entry. This is a very fragile assumption and can easily be
> invalidated by modifications to upper-layer operations.
>

I didn't mean to change the external character device, that would be a
userland-visible change. Just to simplify the part of the code internal
to the module. For example, instead of having a list, have a permanent
msg with an enum state: free (as "no message in flight"),
sent_to_userland, completed. So we don't need to worry about lists etc.
If vduse ever implements more than one message in flight it is very
easy to come back to the list.

Note that I'm also interested in speeding up the vdpa initialization,
and having many msgs in flight is one of the most straightforward ways
to do it, so I'm not against that in the long term :).

> > > +	 */
> > > +	vduse_enqueue_msg(&dev->recv_list, msg);

If we re-enqueue it, another read from another userland thread could
read the same message again concurrently with this vduse_dev_read_iter
call, couldn't it?

> > >  	spin_unlock(&dev->msg_lock);
> > > -	ret = copy_to_iter(&msg->req, size, to);
> > > -	spin_lock(&dev->msg_lock);
> > > +
> > > +	ret = copy_to_iter(&req, size, to);
> > >  	if (ret != size) {
> > > +		spin_lock(&dev->msg_lock);
> > > +		/* Roll back: move msg back to send_list if still pending. */
> > > +		msg = vduse_find_msg(&dev->recv_list, req.request_id);
> > > +		if (msg)
> > > +			vduse_enqueue_msg(&dev->send_list, msg);
> > > +		spin_unlock(&dev->msg_lock);
> > >  		ret = -EFAULT;
> > > -		vduse_enqueue_msg(&dev->send_list, msg);
> > > -		goto unlock;
> > >  	}
> > > -	vduse_enqueue_msg(&dev->recv_list, msg);
> > > -unlock:
> > > -	spin_unlock(&dev->msg_lock);
> > >
> > >  	return ret;
> > >  }
> > > --
> > > 2.39.5
> > >
>
> I'll send out the v2 patch soon.
>
> Thanks,
> Tianci
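Eugenio's single-in-flight simplification can be sketched as an explicit state machine. This is a hypothetical design sketch, not VDUSE code: the states and function names are invented, with a fourth `MSG_PENDING` state added so the copy-failure rollback has somewhere to go. A side effect of this shape is that it also answers the concurrent-read concern, since `msg_read` hands each message to userspace at most once.

```c
#include <assert.h>

enum msg_state {
    MSG_FREE,       /* no message in flight */
    MSG_PENDING,    /* queued, not yet read by userspace */
    MSG_SENT,       /* read by userspace, reply outstanding */
    MSG_COMPLETED,  /* reply arrived */
};

struct dev_msg {
    enum msg_state state;
    int result;
};

/* read(): hand a pending message to userspace exactly once */
static int msg_read(struct dev_msg *m)
{
    if (m->state != MSG_PENDING)
        return -1;              /* nothing to read: -EAGAIN */
    m->state = MSG_SENT;
    return 0;
}

/* failed copy_to_iter(): make the message readable again */
static void msg_read_rollback(struct dev_msg *m)
{
    if (m->state == MSG_SENT)
        m->state = MSG_PENDING;
}

/* write(): accept a reply only for a message userspace has seen */
static int msg_complete(struct dev_msg *m, int result)
{
    if (m->state != MSG_SENT)
        return -1;
    m->result = result;
    m->state = MSG_COMPLETED;
    return 0;
}

/* msg_sync() timeout: fail an incomplete message; no list_del needed,
 * so the unlinked-node hazard cannot exist by construction */
static void msg_timeout(struct dev_msg *m)
{
    if (m->state == MSG_PENDING || m->state == MSG_SENT) {
        m->result = -1;
        m->state = MSG_FREE;
    }
}
```

In the driver this would all run under `msg_lock`; the sketch leaves locking out to keep the state transitions visible.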