All of lore.kernel.org
 help / color / mirror / Atom feed
From: Christian Schoenebeck <qemu_oss@crudebyte.com>
To: qemu-devel@nongnu.org
Cc: Greg Kurz <groug@kaod.org>
Subject: Re: 9pfs: Twalk crash
Date: Wed, 01 Sep 2021 14:49:37 +0200	[thread overview]
Message-ID: <37058587.QroWqLRJlK@silver> (raw)
In-Reply-To: <20210831190454.5c725f2b@bahia.lan>

On Dienstag, 31. August 2021 19:04:54 CEST Greg Kurz wrote:
> > Ok, I made a quick "is same thread" assertion check:
> > 
> > diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
> > index 54ee93b71f..bb6ebd16aa 100644
> > --- a/hw/9pfs/virtio-9p-device.c
> > +++ b/hw/9pfs/virtio-9p-device.c
> > @@ -28,12 +28,25 @@
> > 
> >  #include "qemu/module.h"
> >  #include "sysemu/qtest.h"
> > 
> > +static void assert_thread(void) {
> > +    static QemuThread thread;
> > +    static bool calledBefore;
> > +    if (!calledBefore) {
> > +        calledBefore = 1;
> > +        qemu_thread_get_self(&thread);
> > +        return;
> > +    }
> > +    assert(qemu_thread_is_self(&thread));
> > +}
> > +
[...]
> > And it triggered, however I am not sure if some of those functions I
> > asserted above are indeed allowed to be executed on a different thread
> > than main thread:
> > 
> > Program terminated with signal SIGABRT, Aborted.
> > #0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
> > 50      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
> > [Current thread is 1 (Thread 0x7fd0bcef1700 (LWP 6470))]
> 
> Based in the thread number, it seems that the signal was raised by
> the main event thread...

No, it was not main thread actually, gdb's "current thread is 1" output is 
misleading.

Following the thread id trace, I extended the thread assertion checks over to 
v9fs_walk() as well, like this:

static void coroutine_fn v9fs_walk(void *opaque)
{
    ...
    assert_thread();
    v9fs_co_run_in_worker({
        ...
    });
    assert_thread();
    ...
}

and made sure the reference thread id to be compared is really the main 
thread.

And what happens here is before v9fs_co_run_in_worker() is entered, 
v9fs_walk() runs on main thread, but after returning from 
v9fs_co_run_in_worker() it runs on a different thread for some reason, not on 
main thread as it would be expected at that point.

This is my test patch:

----

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 2815257f42..fa706c10e6 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -1717,6 +1717,8 @@ static void coroutine_fn v9fs_walk(void *opaque)
     V9fsState *s = pdu->s;
     V9fsQID qid;
 
+    assert_thread();
+
     err = pdu_unmarshal(pdu, offset, "ddw", &fid, &newfid, &nwnames);
     if (err < 0) {
         pdu_complete(pdu, err);
@@ -1724,10 +1726,15 @@ static void coroutine_fn v9fs_walk(void *opaque)
     }
     offset += err;
 
+    assert_thread();
+
     trace_v9fs_walk(pdu->tag, pdu->id, fid, newfid, nwnames);
 
+    assert_thread();
+
     if (nwnames > P9_MAXWELEM) {
         err = -EINVAL;
+        assert_thread();
         goto out_nofid;
     }
     if (nwnames) {
@@ -1738,10 +1745,12 @@ static void coroutine_fn v9fs_walk(void *opaque)
         for (i = 0; i < nwnames; i++) {
             err = pdu_unmarshal(pdu, offset, "s", &wnames[i]);
             if (err < 0) {
+                assert_thread();
                 goto out_nofid;
             }
             if (name_is_illegal(wnames[i].data)) {
                 err = -ENOENT;
+                assert_thread();
                 goto out_nofid;
             }
             offset += err;
@@ -1750,6 +1759,7 @@ static void coroutine_fn v9fs_walk(void *opaque)
     fidp = get_fid(pdu, fid);
     if (fidp == NULL) {
         err = -ENOENT;
+        assert_thread();
         goto out_nofid;
     }
 
@@ -1767,6 +1777,7 @@ static void coroutine_fn v9fs_walk(void *opaque)
      * Twalk client request) as small as possible, run all the required fs
      * driver code altogether inside the following block.
      */
+    assert_thread();
     v9fs_co_run_in_worker({
         if (v9fs_request_cancelled(pdu)) {
             err = -EINTR;
@@ -1807,6 +1818,7 @@ static void coroutine_fn v9fs_walk(void *opaque)
             }
         }
     });
+    assert_thread();
     /*
      * Handle all the rest of this Twalk request on main thread ...
      */
diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 54ee93b71f..abf8a33b3b 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -27,6 +27,21 @@
 #include "qemu/iov.h"
 #include "qemu/module.h"
 #include "sysemu/qtest.h"
+#include <sys/types.h>
+#include <sys/syscall.h>
+
+static pid_t main_thread_id = -1;
+
+void assert_thread(void) {
+    assert(main_thread_id != -1);
+    pid_t tid = syscall(__NR_gettid);
+    if (tid != main_thread_id) {
+        printf("Unexpected thread id %d (should be %d) - process id is %d\n",
+               tid, main_thread_id, getpid());
+        fflush(stdout);
+    }
+    assert(tid == main_thread_id);
+}
 
 static void virtio_9p_push_and_notify(V9fsPDU *pdu)
 {
@@ -34,6 +49,8 @@ static void virtio_9p_push_and_notify(V9fsPDU *pdu)
     V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
     VirtQueueElement *elem = v->elems[pdu->idx];
 
+    assert_thread();
+
     /* push onto queue and notify */
     virtqueue_push(v->vq, elem, pdu->size);
     g_free(elem);
@@ -51,6 +68,8 @@ static void handle_9p_output(VirtIODevice *vdev, VirtQueue 
*vq)
     ssize_t len;
     VirtQueueElement *elem;
 
+    assert_thread();
+
     while ((pdu = pdu_alloc(s))) {
         P9MsgHeader out;
 
@@ -125,6 +144,8 @@ static ssize_t virtio_pdu_vmarshal(V9fsPDU *pdu, size_t 
offset,
     VirtQueueElement *elem = v->elems[pdu->idx];
     ssize_t ret;
 
+    assert_thread();
+
     ret = v9fs_iov_vmarshal(elem->in_sg, elem->in_num, offset, 1, fmt, ap);
     if (ret < 0) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
@@ -143,6 +164,8 @@ static ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t 
offset,
     VirtQueueElement *elem = v->elems[pdu->idx];
     ssize_t ret;
 
+    assert_thread();
+
     ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, 
ap);
     if (ret < 0) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
@@ -160,6 +183,8 @@ static void virtio_init_in_iov_from_pdu(V9fsPDU *pdu, 
struct iovec **piov,
     VirtQueueElement *elem = v->elems[pdu->idx];
     size_t buf_size = iov_size(elem->in_sg, elem->in_num);
 
+    assert_thread();
+
     if (buf_size < size) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
 
@@ -180,6 +205,8 @@ static void virtio_init_out_iov_from_pdu(V9fsPDU *pdu, 
struct iovec **piov,
     VirtQueueElement *elem = v->elems[pdu->idx];
     size_t buf_size = iov_size(elem->out_sg, elem->out_num);
 
+    assert_thread();
+
     if (buf_size < size) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
 
@@ -251,6 +278,10 @@ static Property virtio_9p_properties[] = {
 
 static void virtio_9p_class_init(ObjectClass *klass, void *data)
 {
+    main_thread_id = syscall(__NR_gettid);
+    printf("Main thread ID is %d\n", main_thread_id);
+    fflush(stdout);
+
     DeviceClass *dc = DEVICE_CLASS(klass);
     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
 
diff --git a/hw/9pfs/virtio-9p.h b/hw/9pfs/virtio-9p.h
index 20fa118f3a..eb7d1293cb 100644
--- a/hw/9pfs/virtio-9p.h
+++ b/hw/9pfs/virtio-9p.h
@@ -17,4 +17,6 @@ struct V9fsVirtioState {
 #define TYPE_VIRTIO_9P "virtio-9p-device"
 OBJECT_DECLARE_SIMPLE_TYPE(V9fsVirtioState, VIRTIO_9P)
 
+void assert_thread(void);
+
 #endif


----

Output:

Unexpected thread id 10780 (should be 10754) - process id is 10754
qemu-system-x86_64: ../hw/9pfs/virtio-9p-device.c:43: assert_thread: Assertion 
`tid == main_thread_id' failed.
Aborted (core dumped)

So the initially captured thread ID 10754 at process startup matches the 
process ID, hence 10754 was the main thread, and hence the assertion was 
triggered on a thread that was not main thread.

Backtrace:

Program terminated with signal SIGABRT, Aborted.
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
[Current thread is 1 (Thread 0x7fb4637fe700 (LWP 10780))]
(gdb) bt
#0  0x00007fb7ac6d87bb in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/
sysv/linux/raise.c:50
#1  0x00007fb7ac6c3535 in __GI_abort () at abort.c:79
#2  0x00007fb7ac6c340f in __assert_fail_base
    (fmt=0x7fb7ac825ee0 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", 
assertion=0x562e6126072f "tid == main_thread_id", file=0x562e612606fc "../hw/2
#3  0x00007fb7ac6d1102 in __GI___assert_fail
    (assertion=assertion@entry=0x562e6126072f "tid == main_thread_id", 
file=file@entry=0x562e612606fc "../hw/9pfs/virtio-9p-device.c", line=line@e1
#4  0x0000562e60ef3965 in assert_thread () at ../hw/9pfs/virtio-9p-device.c:43
#5  0x0000562e60ef3965 in assert_thread () at ../hw/9pfs/virtio-9p-device.c:35
#6  0x0000562e60cef125 in v9fs_walk (opaque=0x562e6405c288) at ../hw/9pfs/
9p.c:1821
#7  0x0000562e6114f79b in coroutine_trampoline (i0=<optimized out>, 
i1=<optimized out>) at ../util/coroutine-ucontext.c:173
#8  0x00007fb7ac6edb50 in __correctly_grouped_prefixwc
    (begin=0x2 <error: Cannot access memory at address 0x2>, end=0x0, 
thousands=0 L'\000', grouping=0x7fb7ac6d87bb <__GI_raise+267> "H\213\214$\b\1
#9  0x0000000000000000 in  ()

And 9p.c:1821 is exactly:

    assert_thread();
    v9fs_co_run_in_worker({
        if (v9fs_request_cancelled(pdu)) {
            err = -EINTR;
            break;
        }
        err = s->ops->lstat(&s->ctx, &dpath, &fidst);
        if (err < 0) {
            err = -errno;
            break;
        }
        stbuf = fidst;
        for (name_idx = 0; name_idx < nwnames; name_idx++) {
            if (v9fs_request_cancelled(pdu)) {
                err = -EINTR;
                break;
            }
            if (!same_stat_id(&pdu->s->root_st, &stbuf) ||
                strcmp("..", wnames[name_idx].data))
            {
                err = s->ops->name_to_path(&s->ctx, &dpath,
                                        wnames[name_idx].data, &path);
                if (err < 0) {
                    err = -errno;
                    break;
                }
                if (v9fs_request_cancelled(pdu)) {
                    err = -EINTR;
                    break;
                }
                err = s->ops->lstat(&s->ctx, &path, &stbuf);
                if (err < 0) {
                    err = -errno;
                    break;
                }
                stbufs[name_idx] = stbuf;
                v9fs_path_copy(&dpath, &path);
                v9fs_path_copy(&pathes[name_idx], &path);
            }
        }
    });
    assert_thread(); //<--- HERE

> > > > [Current thread is 1 (Thread 0x7f3bddd2ac40 (LWP 7811))]
> > > > (gdb) bt full
> > > > #0  0x000055a934dfb9a7 in virtio_pdu_vunmarshal (pdu=0x55a93717cde8,
> > > > offset=7, fmt=0x55a9352766d1 "ddw", ap=0x7f38a9ad9cd0) at
> > > > ../hw/9pfs/virtio-9p-device.c: 146
> > > > 
> > > >         s = 0x55a93717b4b8
> > > >         v = 0x55a93717aee0
> > > >         elem = 0x0
> > > 
> > > So this is v->elems[pdu->idx]... what's the value of pdu->idx ?
> > 
> > In that originally posted core dump it was 113:
> > 
> > #0  virtio_pdu_vunmarshal (pdu=0x55a93717cde8, offset=7,
> > fmt=0x55a9352766d1
> > "ddw", ap=0x7f38a9ad9cd0) at ../hw/9pfs/virtio-9p-device.c:146
> > 146         ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset,
> > 1, fmt, ap);
> > [Current thread is 1 (Thread 0x7f3bddd2ac40 (LWP 7811))]
> > (gdb) p pdu->idx
> > $1 = 113
> 
> Ok, so it is a valid index (not over MAX_REQ). So it would mean
> that someone cleared the slot in our back ?

Yes, the slot is within valid boundaries. Assertion checks for that in 
virtio-9p-device.c in future wouldn't hurt though.

Best regards,
Christian Schoenebeck




  reply	other threads:[~2021-09-01 12:57 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-08-30 15:55 9pfs: Twalk crash Christian Schoenebeck
2021-08-31 10:57 ` Greg Kurz
2021-08-31 15:00   ` Christian Schoenebeck
2021-08-31 17:04     ` Greg Kurz
2021-09-01 12:49       ` Christian Schoenebeck [this message]
2021-09-01 14:21         ` Christian Schoenebeck
2021-09-01 15:41           ` Greg Kurz
2021-09-01 16:07             ` Christian Schoenebeck
2021-09-01 16:31               ` Greg Kurz

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=37058587.QroWqLRJlK@silver \
    --to=qemu_oss@crudebyte.com \
    --cc=groug@kaod.org \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.