* [Qemu-devel] [PATCH] sheepdog: use coroutines
@ 2011-08-12 12:33 MORITA Kazutaka
2011-08-23 12:29 ` Kevin Wolf
2011-12-23 13:38 ` Christoph Hellwig
0 siblings, 2 replies; 15+ messages in thread
From: MORITA Kazutaka @ 2011-08-12 12:33 UTC (permalink / raw)
To: kwolf; +Cc: sheepdog, qemu-devel
This makes the sheepdog block driver support bdrv_co_readv/writev
instead of bdrv_aio_readv/writev.
With this patch, Sheepdog network I/O becomes fully asynchronous. The
block driver yields back when send/recv returns EAGAIN, and is resumed
when the sheepdog network connection is ready for the operation.
Signed-off-by: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
---
block/sheepdog.c | 150 +++++++++++++++++++++++++++++++++--------------------
1 files changed, 93 insertions(+), 57 deletions(-)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index e150ac0..e283c34 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -274,7 +274,7 @@ struct SheepdogAIOCB {
int ret;
enum AIOCBState aiocb_type;
- QEMUBH *bh;
+ Coroutine *coroutine;
void (*aio_done_func)(SheepdogAIOCB *);
int canceled;
@@ -295,6 +295,10 @@ typedef struct BDRVSheepdogState {
char *port;
int fd;
+ CoMutex lock;
+ Coroutine *co_send;
+ Coroutine *co_recv;
+
uint32_t aioreq_seq_num;
QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
} BDRVSheepdogState;
@@ -346,19 +350,16 @@ static const char * sd_strerror(int err)
/*
* Sheepdog I/O handling:
*
- * 1. In the sd_aio_readv/writev, read/write requests are added to the
- * QEMU Bottom Halves.
- *
- * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
- * requests to the server and link the requests to the
- * outstanding_list in the BDRVSheepdogState. we exits the
- * function without waiting for receiving the response.
+ * 1. In sd_co_rw_vector, we send the I/O requests to the server and
+ * link the requests to the outstanding_list in the
+ * BDRVSheepdogState. The function exits without waiting for
+ * receiving the response.
*
- * 3. We receive the response in aio_read_response, the fd handler to
+ * 2. We receive the response in aio_read_response, the fd handler to
* the sheepdog connection. If metadata update is needed, we send
* the write request to the vdi object in sd_write_done, the write
- * completion function. The AIOCB callback is not called until all
- * the requests belonging to the AIOCB are finished.
+ * completion function. We switch back to sd_co_readv/writev after
+ * all the requests belonging to the AIOCB are finished.
*/
static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
@@ -398,7 +399,7 @@ static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
static void sd_finish_aiocb(SheepdogAIOCB *acb)
{
if (!acb->canceled) {
- acb->common.cb(acb->common.opaque, acb->ret);
+ qemu_coroutine_enter(acb->coroutine, NULL);
}
qemu_aio_release(acb);
}
@@ -411,7 +412,8 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
* Sheepdog cannot cancel the requests which are already sent to
* the servers, so we just complete the request with -EIO here.
*/
- acb->common.cb(acb->common.opaque, -EIO);
+ acb->ret = -EIO;
+ qemu_coroutine_enter(acb->coroutine, NULL);
acb->canceled = 1;
}
@@ -435,24 +437,12 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
acb->aio_done_func = NULL;
acb->canceled = 0;
- acb->bh = NULL;
+ acb->coroutine = qemu_coroutine_self();
acb->ret = 0;
QLIST_INIT(&acb->aioreq_head);
return acb;
}
-static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
-{
- if (acb->bh) {
- error_report("bug: %d %d", acb->aiocb_type, acb->aiocb_type);
- return -EIO;
- }
-
- acb->bh = qemu_bh_new(cb, acb);
- qemu_bh_schedule(acb->bh);
- return 0;
-}
-
#ifdef _WIN32
struct msghdr {
@@ -635,7 +625,13 @@ static int do_readv_writev(int sockfd, struct iovec *iov, int len,
again:
ret = do_send_recv(sockfd, iov, len, iov_offset, write);
if (ret < 0) {
- if (errno == EINTR || errno == EAGAIN) {
+ if (errno == EINTR) {
+ goto again;
+ }
+ if (errno == EAGAIN) {
+ if (qemu_in_coroutine()) {
+ qemu_coroutine_yield();
+ }
goto again;
}
error_report("failed to recv a rsp, %s", strerror(errno));
@@ -793,14 +789,14 @@ static void aio_read_response(void *opaque)
unsigned long idx;
if (QLIST_EMPTY(&s->outstanding_aio_head)) {
- return;
+ goto out;
}
/* read a header */
ret = do_read(fd, &rsp, sizeof(rsp));
if (ret) {
error_report("failed to get the header, %s", strerror(errno));
- return;
+ goto out;
}
/* find the right aio_req from the outstanding_aio list */
@@ -811,7 +807,7 @@ static void aio_read_response(void *opaque)
}
if (!aio_req) {
error_report("cannot find aio_req %x", rsp.id);
- return;
+ goto out;
}
acb = aio_req->aiocb;
@@ -847,7 +843,7 @@ static void aio_read_response(void *opaque)
aio_req->iov_offset);
if (ret) {
error_report("failed to get the data, %s", strerror(errno));
- return;
+ goto out;
}
break;
}
@@ -861,10 +857,30 @@ static void aio_read_response(void *opaque)
if (!rest) {
/*
* We've finished all requests which belong to the AIOCB, so
- * we can call the callback now.
+ * we can switch back to sd_co_readv/writev now.
*/
acb->aio_done_func(acb);
}
+out:
+ s->co_recv = NULL;
+}
+
+static void co_read_response(void *opaque)
+{
+ BDRVSheepdogState *s = opaque;
+
+ if (!s->co_recv) {
+ s->co_recv = qemu_coroutine_create(aio_read_response);
+ }
+
+ qemu_coroutine_enter(s->co_recv, opaque);
+}
+
+static void co_write_request(void *opaque)
+{
+ BDRVSheepdogState *s = opaque;
+
+ qemu_coroutine_enter(s->co_send, NULL);
}
static int aio_flush_request(void *opaque)
@@ -924,7 +940,7 @@ static int get_sheep_fd(BDRVSheepdogState *s)
return -1;
}
- qemu_aio_set_fd_handler(fd, aio_read_response, NULL, aio_flush_request,
+ qemu_aio_set_fd_handler(fd, co_read_response, NULL, aio_flush_request,
NULL, s);
return fd;
}
@@ -1091,6 +1107,10 @@ static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
hdr.id = aio_req->id;
+ qemu_co_mutex_lock(&s->lock);
+ s->co_send = qemu_coroutine_self();
+ qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request,
+ aio_flush_request, NULL, s);
set_cork(s->fd, 1);
/* send a header */
@@ -1109,6 +1129,9 @@ static int add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
}
set_cork(s->fd, 0);
+ qemu_aio_set_fd_handler(s->fd, co_read_response, NULL,
+ aio_flush_request, NULL, s);
+ qemu_co_mutex_unlock(&s->lock);
return 0;
}
@@ -1225,6 +1248,7 @@ static int sd_open(BlockDriverState *bs, const char *filename, int flags)
bs->total_sectors = s->inode.vdi_size / SECTOR_SIZE;
strncpy(s->name, vdi, sizeof(s->name));
+ qemu_co_mutex_init(&s->lock);
qemu_free(buf);
return 0;
out:
@@ -1491,7 +1515,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
/*
* This function is called after writing data objects. If we need to
* update metadata, this sends a write request to the vdi object.
- * Otherwise, this calls the AIOCB callback.
+ * Otherwise, this switches back to sd_co_readv/writev.
*/
static void sd_write_done(SheepdogAIOCB *acb)
{
@@ -1587,8 +1611,11 @@ out:
* waiting the response. The responses are received in the
* `aio_read_response' function which is called from the main loop as
* a fd handler.
+ *
+ * Returns 1 when we need to wait a response, 0 when there is no sent
+ * request and -errno in error cases.
*/
-static void sd_readv_writev_bh_cb(void *p)
+static int sd_co_rw_vector(void *p)
{
SheepdogAIOCB *acb = p;
int ret = 0;
@@ -1600,9 +1627,6 @@ static void sd_readv_writev_bh_cb(void *p)
SheepdogInode *inode = &s->inode;
AIOReq *aio_req;
- qemu_bh_delete(acb->bh);
- acb->bh = NULL;
-
if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) {
/*
* In the case we open the snapshot VDI, Sheepdog creates the
@@ -1684,42 +1708,47 @@ static void sd_readv_writev_bh_cb(void *p)
}
out:
if (QLIST_EMPTY(&acb->aioreq_head)) {
- sd_finish_aiocb(acb);
+ return acb->ret;
}
+ return 1;
}
-static BlockDriverAIOCB *sd_aio_writev(BlockDriverState *bs, int64_t sector_num,
- QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb,
- void *opaque)
+static int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
{
SheepdogAIOCB *acb;
+ int ret;
if (bs->growable && sector_num + nb_sectors > bs->total_sectors) {
/* TODO: shouldn't block here */
if (sd_truncate(bs, (sector_num + nb_sectors) * SECTOR_SIZE) < 0) {
- return NULL;
+ return -EIO;
}
bs->total_sectors = sector_num + nb_sectors;
}
- acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
+ acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
acb->aio_done_func = sd_write_done;
acb->aiocb_type = AIOCB_WRITE_UDATA;
- sd_schedule_bh(sd_readv_writev_bh_cb, acb);
- return &acb->common;
+ ret = sd_co_rw_vector(acb);
+ if (ret <= 0) {
+ qemu_aio_release(acb);
+ return ret;
+ }
+
+ qemu_coroutine_yield();
+
+ return acb->ret;
}
-static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num,
- QEMUIOVector *qiov, int nb_sectors,
- BlockDriverCompletionFunc *cb,
- void *opaque)
+static int sd_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
{
SheepdogAIOCB *acb;
- int i;
+ int i, ret;
- acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, cb, opaque);
+ acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors, NULL, NULL);
acb->aiocb_type = AIOCB_READ_UDATA;
acb->aio_done_func = sd_finish_aiocb;
@@ -1731,8 +1760,15 @@ static BlockDriverAIOCB *sd_aio_readv(BlockDriverState *bs, int64_t sector_num,
memset(qiov->iov[i].iov_base, 0, qiov->iov[i].iov_len);
}
- sd_schedule_bh(sd_readv_writev_bh_cb, acb);
- return &acb->common;
+ ret = sd_co_rw_vector(acb);
+ if (ret <= 0) {
+ qemu_aio_release(acb);
+ return ret;
+ }
+
+ qemu_coroutine_yield();
+
+ return acb->ret;
}
static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
@@ -2062,8 +2098,8 @@ BlockDriver bdrv_sheepdog = {
.bdrv_getlength = sd_getlength,
.bdrv_truncate = sd_truncate,
- .bdrv_aio_readv = sd_aio_readv,
- .bdrv_aio_writev = sd_aio_writev,
+ .bdrv_co_readv = sd_co_readv,
+ .bdrv_co_writev = sd_co_writev,
.bdrv_snapshot_create = sd_snapshot_create,
.bdrv_snapshot_goto = sd_snapshot_goto,
--
1.7.2.5
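
(For readers skimming the diff above: the following is a condensed sketch of the write path after this patch. It is simplified from the hunks above, with error handling, cancellation and the vdi metadata update omitted, so treat it as an illustration of the flow rather than the literal code.)

static int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
                        int nb_sectors, QEMUIOVector *qiov)
{
    SheepdogAIOCB *acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors,
                                      NULL, NULL);
    int ret;

    acb->aio_done_func = sd_write_done;
    acb->aiocb_type = AIOCB_WRITE_UDATA;

    /* Send the object requests and link them to outstanding_aio_head;
     * this returns without waiting for any response. */
    ret = sd_co_rw_vector(acb);
    if (ret <= 0) {
        /* error, or no request was actually sent */
        qemu_aio_release(acb);
        return ret;
    }

    /* Sleep until aio_read_response() has handled the response for every
     * request belonging to this AIOCB; sd_finish_aiocb() then resumes us
     * with qemu_coroutine_enter(acb->coroutine, NULL). */
    qemu_coroutine_yield();

    return acb->ret;
}

The read path is symmetric, except that acb->aio_done_func is sd_finish_aiocb and unallocated objects are zero-filled before the requests are sent.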
* Re: [Qemu-devel] [PATCH] sheepdog: use coroutines
2011-08-12 12:33 [Qemu-devel] [PATCH] sheepdog: use coroutines MORITA Kazutaka
@ 2011-08-23 12:29 ` Kevin Wolf
2011-08-23 17:14 ` MORITA Kazutaka
2011-12-23 13:38 ` Christoph Hellwig
1 sibling, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2011-08-23 12:29 UTC (permalink / raw)
To: MORITA Kazutaka; +Cc: sheepdog, qemu-devel
Am 12.08.2011 14:33, schrieb MORITA Kazutaka:
> This makes the sheepdog block driver support bdrv_co_readv/writev
> instead of bdrv_aio_readv/writev.
>
> With this patch, Sheepdog network I/O becomes fully asynchronous. The
> block driver yields back when send/recv returns EAGAIN, and is resumed
> when the sheepdog network connection is ready for the operation.
>
> Signed-off-by: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
> ---
> block/sheepdog.c | 150 +++++++++++++++++++++++++++++++++--------------------
> 1 files changed, 93 insertions(+), 57 deletions(-)
>
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index e150ac0..e283c34 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -274,7 +274,7 @@ struct SheepdogAIOCB {
> int ret;
> enum AIOCBState aiocb_type;
>
> - QEMUBH *bh;
> + Coroutine *coroutine;
> void (*aio_done_func)(SheepdogAIOCB *);
>
> int canceled;
> @@ -295,6 +295,10 @@ typedef struct BDRVSheepdogState {
> char *port;
> int fd;
>
> + CoMutex lock;
> + Coroutine *co_send;
> + Coroutine *co_recv;
> +
> uint32_t aioreq_seq_num;
> QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
> } BDRVSheepdogState;
> @@ -346,19 +350,16 @@ static const char * sd_strerror(int err)
> /*
> * Sheepdog I/O handling:
> *
> - * 1. In the sd_aio_readv/writev, read/write requests are added to the
> - * QEMU Bottom Halves.
> - *
> - * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
> - * requests to the server and link the requests to the
> - * outstanding_list in the BDRVSheepdogState. we exits the
> - * function without waiting for receiving the response.
> + * 1. In sd_co_rw_vector, we send the I/O requests to the server and
> + * link the requests to the outstanding_list in the
> + * BDRVSheepdogState. The function exits without waiting for
> + * receiving the response.
> *
> - * 3. We receive the response in aio_read_response, the fd handler to
> + * 2. We receive the response in aio_read_response, the fd handler to
> * the sheepdog connection. If metadata update is needed, we send
> * the write request to the vdi object in sd_write_done, the write
> - * completion function. The AIOCB callback is not called until all
> - * the requests belonging to the AIOCB are finished.
> + * completion function. We switch back to sd_co_readv/writev after
> + * all the requests belonging to the AIOCB are finished.
> */
>
> static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
> @@ -398,7 +399,7 @@ static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
> static void sd_finish_aiocb(SheepdogAIOCB *acb)
> {
> if (!acb->canceled) {
> - acb->common.cb(acb->common.opaque, acb->ret);
> + qemu_coroutine_enter(acb->coroutine, NULL);
> }
> qemu_aio_release(acb);
> }
> @@ -411,7 +412,8 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
> * Sheepdog cannot cancel the requests which are already sent to
> * the servers, so we just complete the request with -EIO here.
> */
> - acb->common.cb(acb->common.opaque, -EIO);
> + acb->ret = -EIO;
> + qemu_coroutine_enter(acb->coroutine, NULL);
> acb->canceled = 1;
> }
>
> @@ -435,24 +437,12 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
>
> acb->aio_done_func = NULL;
> acb->canceled = 0;
> - acb->bh = NULL;
> + acb->coroutine = qemu_coroutine_self();
> acb->ret = 0;
> QLIST_INIT(&acb->aioreq_head);
> return acb;
> }
>
> -static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
> -{
> - if (acb->bh) {
> - error_report("bug: %d %d", acb->aiocb_type, acb->aiocb_type);
> - return -EIO;
> - }
> -
> - acb->bh = qemu_bh_new(cb, acb);
> - qemu_bh_schedule(acb->bh);
> - return 0;
> -}
> -
> #ifdef _WIN32
>
> struct msghdr {
> @@ -635,7 +625,13 @@ static int do_readv_writev(int sockfd, struct iovec *iov, int len,
> again:
> ret = do_send_recv(sockfd, iov, len, iov_offset, write);
> if (ret < 0) {
> - if (errno == EINTR || errno == EAGAIN) {
> + if (errno == EINTR) {
> + goto again;
> + }
> + if (errno == EAGAIN) {
> + if (qemu_in_coroutine()) {
> + qemu_coroutine_yield();
> + }
Who reenters the coroutine if we yield here?
And of course for a coroutine based block driver I would like to see
much less code in callback functions. But it's a good start anyway.
Kevin
* Re: [Qemu-devel] [PATCH] sheepdog: use coroutines
2011-08-23 12:29 ` Kevin Wolf
@ 2011-08-23 17:14 ` MORITA Kazutaka
2011-08-24 12:56 ` Kevin Wolf
0 siblings, 1 reply; 15+ messages in thread
From: MORITA Kazutaka @ 2011-08-23 17:14 UTC (permalink / raw)
To: Kevin Wolf; +Cc: sheepdog, qemu-devel, MORITA Kazutaka
At Tue, 23 Aug 2011 14:29:50 +0200,
Kevin Wolf wrote:
>
> Am 12.08.2011 14:33, schrieb MORITA Kazutaka:
> > This makes the sheepdog block driver support bdrv_co_readv/writev
> > instead of bdrv_aio_readv/writev.
> >
> > With this patch, Sheepdog network I/O becomes fully asynchronous. The
> > block driver yields back when send/recv returns EAGAIN, and is resumed
> > when the sheepdog network connection is ready for the operation.
> >
> > Signed-off-by: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
> > ---
> > block/sheepdog.c | 150 +++++++++++++++++++++++++++++++++--------------------
> > 1 files changed, 93 insertions(+), 57 deletions(-)
> >
> > diff --git a/block/sheepdog.c b/block/sheepdog.c
> > index e150ac0..e283c34 100644
> > --- a/block/sheepdog.c
> > +++ b/block/sheepdog.c
> > @@ -274,7 +274,7 @@ struct SheepdogAIOCB {
> > int ret;
> > enum AIOCBState aiocb_type;
> >
> > - QEMUBH *bh;
> > + Coroutine *coroutine;
> > void (*aio_done_func)(SheepdogAIOCB *);
> >
> > int canceled;
> > @@ -295,6 +295,10 @@ typedef struct BDRVSheepdogState {
> > char *port;
> > int fd;
> >
> > + CoMutex lock;
> > + Coroutine *co_send;
> > + Coroutine *co_recv;
> > +
> > uint32_t aioreq_seq_num;
> > QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
> > } BDRVSheepdogState;
> > @@ -346,19 +350,16 @@ static const char * sd_strerror(int err)
> > /*
> > * Sheepdog I/O handling:
> > *
> > - * 1. In the sd_aio_readv/writev, read/write requests are added to the
> > - * QEMU Bottom Halves.
> > - *
> > - * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
> > - * requests to the server and link the requests to the
> > - * outstanding_list in the BDRVSheepdogState. we exits the
> > - * function without waiting for receiving the response.
> > + * 1. In sd_co_rw_vector, we send the I/O requests to the server and
> > + * link the requests to the outstanding_list in the
> > + * BDRVSheepdogState. The function exits without waiting for
> > + * receiving the response.
> > *
> > - * 3. We receive the response in aio_read_response, the fd handler to
> > + * 2. We receive the response in aio_read_response, the fd handler to
> > * the sheepdog connection. If metadata update is needed, we send
> > * the write request to the vdi object in sd_write_done, the write
> > - * completion function. The AIOCB callback is not called until all
> > - * the requests belonging to the AIOCB are finished.
> > + * completion function. We switch back to sd_co_readv/writev after
> > + * all the requests belonging to the AIOCB are finished.
> > */
> >
> > static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
> > @@ -398,7 +399,7 @@ static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
> > static void sd_finish_aiocb(SheepdogAIOCB *acb)
> > {
> > if (!acb->canceled) {
> > - acb->common.cb(acb->common.opaque, acb->ret);
> > + qemu_coroutine_enter(acb->coroutine, NULL);
> > }
> > qemu_aio_release(acb);
> > }
> > @@ -411,7 +412,8 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
> > * Sheepdog cannot cancel the requests which are already sent to
> > * the servers, so we just complete the request with -EIO here.
> > */
> > - acb->common.cb(acb->common.opaque, -EIO);
> > + acb->ret = -EIO;
> > + qemu_coroutine_enter(acb->coroutine, NULL);
> > acb->canceled = 1;
> > }
> >
> > @@ -435,24 +437,12 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
> >
> > acb->aio_done_func = NULL;
> > acb->canceled = 0;
> > - acb->bh = NULL;
> > + acb->coroutine = qemu_coroutine_self();
> > acb->ret = 0;
> > QLIST_INIT(&acb->aioreq_head);
> > return acb;
> > }
> >
> > -static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
> > -{
> > - if (acb->bh) {
> > - error_report("bug: %d %d", acb->aiocb_type, acb->aiocb_type);
> > - return -EIO;
> > - }
> > -
> > - acb->bh = qemu_bh_new(cb, acb);
> > - qemu_bh_schedule(acb->bh);
> > - return 0;
> > -}
> > -
> > #ifdef _WIN32
> >
> > struct msghdr {
> > @@ -635,7 +625,13 @@ static int do_readv_writev(int sockfd, struct iovec *iov, int len,
> > again:
> > ret = do_send_recv(sockfd, iov, len, iov_offset, write);
> > if (ret < 0) {
> > - if (errno == EINTR || errno == EAGAIN) {
> > + if (errno == EINTR) {
> > + goto again;
> > + }
> > + if (errno == EAGAIN) {
> > + if (qemu_in_coroutine()) {
> > + qemu_coroutine_yield();
> > + }
>
> Who reenters the coroutine if we yield here?
The fd handlers, co_write_request() and co_read_response(), will call
qemu_coroutine_enter() for the coroutine. It will be restarted after
the sheepdog network connection becomes ready.
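In other words, the resume path looks roughly like the sketch below. It is condensed from the patch (co_write_request() and the EAGAIN branch of do_readv_writev() are the real pieces, the surrounding code is trimmed), so read it as an illustration of the mechanism rather than a standalone function:

/* fd write handler, installed by add_aio_request() while it is sending */
static void co_write_request(void *opaque)
{
    BDRVSheepdogState *s = opaque;

    /* Resume the coroutine that yielded in do_readv_writev() below. */
    qemu_coroutine_enter(s->co_send, NULL);
}

/* inside do_readv_writev(), running in the send (or recv) coroutine */
again:
    ret = do_send_recv(sockfd, iov, len, iov_offset, write);
    if (ret < 0 && errno == EAGAIN) {
        if (qemu_in_coroutine()) {
            /* Yield here; co_write_request() (or co_read_response() on
             * the receive side) re-enters us once s->fd is ready. */
            qemu_coroutine_yield();
        }
        goto again;
    }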
>
> And of course for a coroutine based block driver I would like to see
> much less code in callback functions. But it's a good start anyway.
Yes, actually they can be much simpler. This patch adds asynchronous
I/O support with a minimal change based on a coroutine, and I think
code simplification would be the next step.
Thanks,
Kazutaka
* Re: [Qemu-devel] [PATCH] sheepdog: use coroutines
2011-08-23 17:14 ` MORITA Kazutaka
@ 2011-08-24 12:56 ` Kevin Wolf
0 siblings, 0 replies; 15+ messages in thread
From: Kevin Wolf @ 2011-08-24 12:56 UTC (permalink / raw)
To: MORITA Kazutaka; +Cc: sheepdog, qemu-devel
Am 23.08.2011 19:14, schrieb MORITA Kazutaka:
> At Tue, 23 Aug 2011 14:29:50 +0200,
> Kevin Wolf wrote:
>>
>> Am 12.08.2011 14:33, schrieb MORITA Kazutaka:
>>> This makes the sheepdog block driver support bdrv_co_readv/writev
>>> instead of bdrv_aio_readv/writev.
>>>
>>> With this patch, Sheepdog network I/O becomes fully asynchronous. The
>>> block driver yields back when send/recv returns EAGAIN, and is resumed
>>> when the sheepdog network connection is ready for the operation.
>>>
>>> Signed-off-by: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
>>> ---
>>> block/sheepdog.c | 150 +++++++++++++++++++++++++++++++++--------------------
>>> 1 files changed, 93 insertions(+), 57 deletions(-)
>>>
>>> diff --git a/block/sheepdog.c b/block/sheepdog.c
>>> index e150ac0..e283c34 100644
>>> --- a/block/sheepdog.c
>>> +++ b/block/sheepdog.c
>>> @@ -274,7 +274,7 @@ struct SheepdogAIOCB {
>>> int ret;
>>> enum AIOCBState aiocb_type;
>>>
>>> - QEMUBH *bh;
>>> + Coroutine *coroutine;
>>> void (*aio_done_func)(SheepdogAIOCB *);
>>>
>>> int canceled;
>>> @@ -295,6 +295,10 @@ typedef struct BDRVSheepdogState {
>>> char *port;
>>> int fd;
>>>
>>> + CoMutex lock;
>>> + Coroutine *co_send;
>>> + Coroutine *co_recv;
>>> +
>>> uint32_t aioreq_seq_num;
>>> QLIST_HEAD(outstanding_aio_head, AIOReq) outstanding_aio_head;
>>> } BDRVSheepdogState;
>>> @@ -346,19 +350,16 @@ static const char * sd_strerror(int err)
>>> /*
>>> * Sheepdog I/O handling:
>>> *
>>> - * 1. In the sd_aio_readv/writev, read/write requests are added to the
>>> - * QEMU Bottom Halves.
>>> - *
>>> - * 2. In sd_readv_writev_bh_cb, the callbacks of BHs, we send the I/O
>>> - * requests to the server and link the requests to the
>>> - * outstanding_list in the BDRVSheepdogState. we exits the
>>> - * function without waiting for receiving the response.
>>> + * 1. In sd_co_rw_vector, we send the I/O requests to the server and
>>> + * link the requests to the outstanding_list in the
>>> + * BDRVSheepdogState. The function exits without waiting for
>>> + * receiving the response.
>>> *
>>> - * 3. We receive the response in aio_read_response, the fd handler to
>>> + * 2. We receive the response in aio_read_response, the fd handler to
>>> * the sheepdog connection. If metadata update is needed, we send
>>> * the write request to the vdi object in sd_write_done, the write
>>> - * completion function. The AIOCB callback is not called until all
>>> - * the requests belonging to the AIOCB are finished.
>>> + * completion function. We switch back to sd_co_readv/writev after
>>> + * all the requests belonging to the AIOCB are finished.
>>> */
>>>
>>> static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb,
>>> @@ -398,7 +399,7 @@ static inline int free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req)
>>> static void sd_finish_aiocb(SheepdogAIOCB *acb)
>>> {
>>> if (!acb->canceled) {
>>> - acb->common.cb(acb->common.opaque, acb->ret);
>>> + qemu_coroutine_enter(acb->coroutine, NULL);
>>> }
>>> qemu_aio_release(acb);
>>> }
>>> @@ -411,7 +412,8 @@ static void sd_aio_cancel(BlockDriverAIOCB *blockacb)
>>> * Sheepdog cannot cancel the requests which are already sent to
>>> * the servers, so we just complete the request with -EIO here.
>>> */
>>> - acb->common.cb(acb->common.opaque, -EIO);
>>> + acb->ret = -EIO;
>>> + qemu_coroutine_enter(acb->coroutine, NULL);
>>> acb->canceled = 1;
>>> }
>>>
>>> @@ -435,24 +437,12 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
>>>
>>> acb->aio_done_func = NULL;
>>> acb->canceled = 0;
>>> - acb->bh = NULL;
>>> + acb->coroutine = qemu_coroutine_self();
>>> acb->ret = 0;
>>> QLIST_INIT(&acb->aioreq_head);
>>> return acb;
>>> }
>>>
>>> -static int sd_schedule_bh(QEMUBHFunc *cb, SheepdogAIOCB *acb)
>>> -{
>>> - if (acb->bh) {
>>> - error_report("bug: %d %d", acb->aiocb_type, acb->aiocb_type);
>>> - return -EIO;
>>> - }
>>> -
>>> - acb->bh = qemu_bh_new(cb, acb);
>>> - qemu_bh_schedule(acb->bh);
>>> - return 0;
>>> -}
>>> -
>>> #ifdef _WIN32
>>>
>>> struct msghdr {
>>> @@ -635,7 +625,13 @@ static int do_readv_writev(int sockfd, struct iovec *iov, int len,
>>> again:
>>> ret = do_send_recv(sockfd, iov, len, iov_offset, write);
>>> if (ret < 0) {
>>> - if (errno == EINTR || errno == EAGAIN) {
>>> + if (errno == EINTR) {
>>> + goto again;
>>> + }
>>> + if (errno == EAGAIN) {
>>> + if (qemu_in_coroutine()) {
>>> + qemu_coroutine_yield();
>>> + }
>>
>> Who reenters the coroutine if we yield here?
>
> The fd handlers, co_write_request() and co_read_response(), will call
> qemu_coroutine_enter() for the coroutine. It will be restarted after
> the sheepdog network connection becomes ready.
Makes sense. Applied to the block branch.
>> And of course for a coroutine based block driver I would like to see
>> much less code in callback functions. But it's a good start anyway.
>
> Yes, actually they can be much simpler. This patch adds asynchrous
> I/O support with a minimal change based on a coroutine, and I think
> code simplification would be the next step.
Ok, if you're looking into this, perfect. :-)
Kevin
* Re: [Qemu-devel] [PATCH] sheepdog: use coroutines
2011-08-12 12:33 [Qemu-devel] [PATCH] sheepdog: use coroutines MORITA Kazutaka
2011-08-23 12:29 ` Kevin Wolf
@ 2011-12-23 13:38 ` Christoph Hellwig
2011-12-29 12:06 ` [Qemu-devel] coroutine bug?, was " Christoph Hellwig
1 sibling, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2011-12-23 13:38 UTC (permalink / raw)
To: MORITA Kazutaka; +Cc: kwolf, sheepdog, qemu-devel
FYI, this causes segfaults when doing large streaming writes when
running against a sheepdog cluster which:
a) has relatively fast SSDs
and
b) uses buffered I/O.
Unfortunately I can't get a useful backtrace out of gdb. When running just
this commit I at least get some debugging messages:
qemu-system-x86_64: failed to recv a rsp, Socket operation on non-socket
qemu-system-x86_64: failed to get the header, Socket operation on non-socket
but on least qemu these don't show up either.
* [Qemu-devel] coroutine bug?, was Re: [PATCH] sheepdog: use coroutines
2011-12-23 13:38 ` Christoph Hellwig
@ 2011-12-29 12:06 ` Christoph Hellwig
2011-12-30 10:35 ` Stefan Hajnoczi
0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2011-12-29 12:06 UTC (permalink / raw)
To: MORITA Kazutaka; +Cc: kwolf, sheepdog, qemu-devel
On Fri, Dec 23, 2011 at 02:38:50PM +0100, Christoph Hellwig wrote:
> FYI, this causes segfaults when doing large streaming writes when
> running against a sheepdog cluster which:
>
> a) has relatively fast SSDs
>
> and
>
> b) uses buffered I/O.
>
> Unfortunately I can't get a useful backtrace out of gdb. When running just
> this commit I at least get some debugging messages:
>
> qemu-system-x86_64: failed to recv a rsp, Socket operation on non-socket
> qemu-system-x86_64: failed to get the header, Socket operation on non-socket
>
> but on least qemu these don't show up either.
s/least/latest/
Some more debugging. Just for the call that eventually segfaults, s->fd
turns from its normal value (normally 13 for me) into 0. This is entirely
reproducible in my testing, and given that the sheepdog driver never
assigns to that value except when opening the device, this seems to point to
an issue in the coroutine code to me.
* Re: [Qemu-devel] coroutine bug?, was Re: [PATCH] sheepdog: use coroutines
2011-12-29 12:06 ` [Qemu-devel] coroutine bug?, was " Christoph Hellwig
@ 2011-12-30 10:35 ` Stefan Hajnoczi
2012-01-02 15:39 ` Christoph Hellwig
0 siblings, 1 reply; 15+ messages in thread
From: Stefan Hajnoczi @ 2011-12-30 10:35 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: kwolf, sheepdog, qemu-devel, MORITA Kazutaka
On Thu, Dec 29, 2011 at 01:06:26PM +0100, Christoph Hellwig wrote:
> On Fri, Dec 23, 2011 at 02:38:50PM +0100, Christoph Hellwig wrote:
> > FYI, this causes segfaults when doing large streaming writes when
> > running against a sheepdog cluster which:
> >
> > a) has relatively fast SSDs
> >
> > and
> >
> > b) uses buffered I/O.
> >
> > Unfortunately I can't get a useful backtrace out of gdb. When running just
> > this commit I at least get some debugging messages:
> >
> > qemu-system-x86_64: failed to recv a rsp, Socket operation on non-socket
> > qemu-system-x86_64: failed to get the header, Socket operation on non-socket
> >
> > but on least qemu these don't show up either.
>
> s/least/latest/
>
> Some more debugging. Just for the call that eventually segfaults s->fd
> turns from its normal value (normall 13 for me) into 0. This is entirely
> reproducable in my testing, and given that the sheepdog driver never
> assigns to that value except opening the device this seems to point to
> an issue in the coroutine code to me.
Are you building with gcc 4.5.3 or later? (Earlier versions may
mis-compile, see https://bugs.launchpad.net/qemu/+bug/902148.)
If you can reproduce this bug and suspect coroutines are involved then I
suggest using gdb to observe the last valid field values of s and the
address of s. When the coroutine re-enters make sure that s still has
the same address and check if the field values are the same as before.
I don't have a sheepdog setup here but if there's an easy way to
reproduce please let me know and I'll take a look.
Stefan
* Re: [Qemu-devel] coroutine bug?, was Re: [PATCH] sheepdog: use coroutines
2011-12-30 10:35 ` Stefan Hajnoczi
@ 2012-01-02 15:39 ` Christoph Hellwig
2012-01-02 15:40 ` Christoph Hellwig
` (2 more replies)
0 siblings, 3 replies; 15+ messages in thread
From: Christoph Hellwig @ 2012-01-02 15:39 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: kwolf, sheepdog, Christoph Hellwig, MORITA Kazutaka, qemu-devel
On Fri, Dec 30, 2011 at 10:35:01AM +0000, Stefan Hajnoczi wrote:
> Are you building with gcc 4.5.3 or later? (Earlier versions may
> mis-compile, see https://bugs.launchpad.net/qemu/+bug/902148.)
I'm using "gcc version 4.6.2 (Debian 4.6.2-9)", so that should not
be an issue.
> If you can reproduce this bug and suspect coroutines are involved then I
It's entirely reproducible.
I've played around a bit and switched from the ucontext to the gthreads
coroutine implementation. The result seems odd, but starts to make sense.
Running the workload I now get the following message from qemu:
Co-routine re-entered recursively
and the gdb backtrace looks like:
(gdb) bt
#0 0x00007f2fff36f405 in *__GI_raise (sig=<optimized out>)
at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
#1 0x00007f2fff372680 in *__GI_abort () at abort.c:92
#2 0x00007f30019a6616 in qemu_coroutine_enter (co=0x7f3004d4d7b0, opaque=0x0)
at qemu-coroutine.c:53
#3 0x00007f30019a5e82 in qemu_co_queue_next_bh (opaque=<optimized out>)
at qemu-coroutine-lock.c:43
#4 0x00007f30018d5a72 in qemu_bh_poll () at async.c:71
#5 0x00007f3001982990 in main_loop_wait (nonblocking=<optimized out>)
at main-loop.c:472
#6 0x00007f30018cf714 in main_loop () at /home/hch/work/qemu/vl.c:1481
#7 main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>)
at /home/hch/work/qemu/vl.c:3479
Adding some printks suggests this happens when calling add_aio_request from
aio_read_response when either delaying creates or updating metadata,
although not every time one of these cases happens.
I've tried to understand how the recursive calling happens, but unfortunately
the whole coroutine code lacks any sort of documentation on how it should
behave or what it asserts about the callers.
> I don't have a sheepdog setup here but if there's an easy way to
> reproduce please let me know and I'll take a look.
With the small patch below applied to the sheepdog source I can reproduce
the issue on my laptop using the following setup:
for port in 7000 7001 7002; do
mkdir -p /mnt/sheepdog/$port
/usr/sbin/sheep -p $port -c local /mnt/sheepdog/$port
sleep 2
done
collie cluster format
collie vdi create test 20G
then start a qemu instance that uses the sheepdog volume with the
following device and drive lines:
-drive if=none,file=sheepdog:test,cache=none,id=test \
-device virtio-blk-pci,drive=test,id=testdev \
finally, in the guest run:
dd if=/dev/zero of=/dev/vdX bs=67108864 count=128 oflag=direct
* Re: [Qemu-devel] coroutine bug?, was Re: [PATCH] sheepdog: use coroutines
2012-01-02 15:39 ` Christoph Hellwig
@ 2012-01-02 15:40 ` Christoph Hellwig
2012-01-02 22:38 ` Stefan Hajnoczi
2012-01-03 8:16 ` Stefan Hajnoczi
2 siblings, 0 replies; 15+ messages in thread
From: Christoph Hellwig @ 2012-01-02 15:40 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: kwolf, sheepdog, Christoph Hellwig, MORITA Kazutaka, qemu-devel
Forgot to attach the "go-fast" patch required to reproduce the issue
on slow storage:
Index: sheepdog/sheep/simple_store.c
===================================================================
--- sheepdog.orig/sheep/simple_store.c 2011-12-29 20:10:44.171622413 +0100
+++ sheepdog/sheep/simple_store.c 2012-01-02 16:23:39.759957971 +0100
@@ -30,7 +30,7 @@ extern char *obj_path;
extern mode_t def_fmode;
-static int def_store_flags = O_DSYNC | O_RDWR;
+static int def_store_flags = O_RDWR;
struct store_driver store;
* Re: [Qemu-devel] coroutine bug?, was Re: [PATCH] sheepdog: use coroutines
2012-01-02 15:39 ` Christoph Hellwig
2012-01-02 15:40 ` Christoph Hellwig
@ 2012-01-02 22:38 ` Stefan Hajnoczi
2012-01-03 8:13 ` Stefan Hajnoczi
2012-01-03 8:16 ` Stefan Hajnoczi
2 siblings, 1 reply; 15+ messages in thread
From: Stefan Hajnoczi @ 2012-01-02 22:38 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: kwolf, sheepdog, qemu-devel, MORITA Kazutaka
On Mon, Jan 2, 2012 at 3:39 PM, Christoph Hellwig <hch@lst.de> wrote:
> On Fri, Dec 30, 2011 at 10:35:01AM +0000, Stefan Hajnoczi wrote:
>> If you can reproduce this bug and suspect coroutines are involved then I
>
> It's entirely reproducable.
>
> I've played around a bit and switched from the ucontext to the gthreads
> coroutine implementation. The result seems odd, but starts to make sense.
>
> Running the workload I now get the following message from qemu:
>
> Co-routine re-entered recursively
>
> and the gdb backtrace looks like:
>
> (gdb) bt
> #0 0x00007f2fff36f405 in *__GI_raise (sig=<optimized out>)
> at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
> #1 0x00007f2fff372680 in *__GI_abort () at abort.c:92
> #2 0x00007f30019a6616 in qemu_coroutine_enter (co=0x7f3004d4d7b0, opaque=0x0)
> at qemu-coroutine.c:53
> #3 0x00007f30019a5e82 in qemu_co_queue_next_bh (opaque=<optimized out>)
> at qemu-coroutine-lock.c:43
> #4 0x00007f30018d5a72 in qemu_bh_poll () at async.c:71
> #5 0x00007f3001982990 in main_loop_wait (nonblocking=<optimized out>)
> at main-loop.c:472
> #6 0x00007f30018cf714 in main_loop () at /home/hch/work/qemu/vl.c:1481
> #7 main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>)
> at /home/hch/work/qemu/vl.c:3479
>
> adding some printks suggest this happens when calling add_aio_request from
> aio_read_response when either delaying creates, or updating metadata,
> although not everytime one of these cases happens.
>
> I've tried to understand how the recursive calling happens, but unfortunately
> the whole coroutine code lacks any sort of documentation how it should
> behave or what it asserts about the callers.
>
>> I don't have a sheepdog setup here but if there's an easy way to
>> reproduce please let me know and I'll take a look.
>
> With the small patch below applied to the sheppdog source I can reproduce
> the issue on my laptop using the following setup:
>
> for port in 7000 7001 7002; do
> mkdir -p /mnt/sheepdog/$port
> /usr/sbin/sheep -p $port -c local /mnt/sheepdog/$port
> sleep 2
> done
>
> collie cluster format
> collie vdi create test 20G
>
> then start a qemu instance that uses the the sheepdog volume using the
> following device and drive lines:
>
> -drive if=none,file=sheepdog:test,cache=none,id=test \
> -device virtio-blk-pci,drive=test,id=testdev \
>
> finally, in the guest run:
>
> dd if=/dev/zero of=/dev/vdX bs=67108864 count=128 oflag=direct
Thanks for these instructions. I can reproduce the issue here.
The way that BDRVSheepdogState->co_recv and ->co_send work seems
suspicious. The code adds select(2) read/write callback functions
on the sheepdog socket file descriptor. When the socket becomes
writeable or readable, the co_send or co_recv coroutine is entered.
So far, so good: this is how a coroutine is integrated into the main
loop of QEMU.
The problem is that this patch is mixing things. The co_recv
coroutine runs aio_read_response(), which invokes send_pending_req().
send_pending_req() invokes add_aio_request(). That function isn't
suitable for co_recv's context because it actually sends data and hits
a few blocking (yield) points. It takes a coroutine mutex - but the
select(2) read callback is still in place. We're now still in the
aio_read_response() call chain except we're actually not reading at
all, we're trying to write! And we'll get spurious wakeups if there
is any data readable on the socket.
So the co_recv coroutine has two things in the system that will try to enter it:
1. The select(2) read callback on the sheepdog socket.
2. The add_aio_request() blocking operations, including a coroutine mutex.
This is bad: a yielded coroutine should only have one thing that will
enter it. It's rare that it makes sense to have multiple things
entering a coroutine.
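To make that concrete, the problematic path can be summarized like this (the function names are the ones from the patch; the trace is a restatement of the analysis above, not literal code):

/* co_recv coroutine, entered from the select(2) read handler:
 *
 *   aio_read_response()
 *     -> send_pending_req()
 *          -> add_aio_request()
 *               -> qemu_co_mutex_lock(&s->lock)    (may yield)
 *               -> do_readv_writev()               (may yield on EAGAIN)
 *
 * While co_recv is yielded inside add_aio_request(), two different
 * things can re-enter it:
 *
 *   1. co_read_response(), because the read handler for s->fd is still
 *      installed and more response data becomes readable, and
 *   2. the CoMutex unlock bottom half, once another coroutine drops
 *      s->lock.
 *
 * Whichever fires first, the other wake-up is still pending against a
 * coroutine that may already have moved on, or finished, in the
 * meantime.
 */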
It's late here but I hope this gives Kazutaka some thoughts on what is
causing the issue with this patch.
Stefan
* Re: [Qemu-devel] coroutine bug?, was Re: [PATCH] sheepdog: use coroutines
2012-01-02 22:38 ` Stefan Hajnoczi
@ 2012-01-03 8:13 ` Stefan Hajnoczi
2012-01-06 11:16 ` MORITA Kazutaka
0 siblings, 1 reply; 15+ messages in thread
From: Stefan Hajnoczi @ 2012-01-03 8:13 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: kwolf, sheepdog, qemu-devel, MORITA Kazutaka
On Mon, Jan 02, 2012 at 10:38:11PM +0000, Stefan Hajnoczi wrote:
> On Mon, Jan 2, 2012 at 3:39 PM, Christoph Hellwig <hch@lst.de> wrote:
> > On Fri, Dec 30, 2011 at 10:35:01AM +0000, Stefan Hajnoczi wrote:
> >> If you can reproduce this bug and suspect coroutines are involved then I
> >
> > It's entirely reproducable.
> >
> > I've played around a bit and switched from the ucontext to the gthreads
> > coroutine implementation. The result seems odd, but starts to make sense.
> >
> > Running the workload I now get the following message from qemu:
> >
> > Co-routine re-entered recursively
> >
> > and the gdb backtrace looks like:
> >
> > (gdb) bt
> > #0 0x00007f2fff36f405 in *__GI_raise (sig=<optimized out>)
> > at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
> > #1 0x00007f2fff372680 in *__GI_abort () at abort.c:92
> > #2 0x00007f30019a6616 in qemu_coroutine_enter (co=0x7f3004d4d7b0, opaque=0x0)
> > at qemu-coroutine.c:53
> > #3 0x00007f30019a5e82 in qemu_co_queue_next_bh (opaque=<optimized out>)
> > at qemu-coroutine-lock.c:43
> > #4 0x00007f30018d5a72 in qemu_bh_poll () at async.c:71
> > #5 0x00007f3001982990 in main_loop_wait (nonblocking=<optimized out>)
> > at main-loop.c:472
> > #6 0x00007f30018cf714 in main_loop () at /home/hch/work/qemu/vl.c:1481
> > #7 main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>)
> > at /home/hch/work/qemu/vl.c:3479
> >
> > adding some printks suggest this happens when calling add_aio_request from
> > aio_read_response when either delaying creates, or updating metadata,
> > although not everytime one of these cases happens.
> >
> > I've tried to understand how the recursive calling happens, but unfortunately
> > the whole coroutine code lacks any sort of documentation how it should
> > behave or what it asserts about the callers.
> >
> >> I don't have a sheepdog setup here but if there's an easy way to
> >> reproduce please let me know and I'll take a look.
> >
> > With the small patch below applied to the sheppdog source I can reproduce
> > the issue on my laptop using the following setup:
> >
> > for port in 7000 7001 7002; do
> > mkdir -p /mnt/sheepdog/$port
> > /usr/sbin/sheep -p $port -c local /mnt/sheepdog/$port
> > sleep 2
> > done
> >
> > collie cluster format
> > collie vdi create test 20G
> >
> > then start a qemu instance that uses the the sheepdog volume using the
> > following device and drive lines:
> >
> > -drive if=none,file=sheepdog:test,cache=none,id=test \
> > -device virtio-blk-pci,drive=test,id=testdev \
> >
> > finally, in the guest run:
> >
> > dd if=/dev/zero of=/dev/vdX bs=67108864 count=128 oflag=direct
>
> Thanks for these instructions. I can reproduce the issue here.
>
> It seems suspicious the way that BDRVSheepdogState->co_recv and
> ->co_send work. The code adds select(2) read/write callback functions
> on the sheepdog socket file descriptor. When the socket becomes
> writeable or readable the co_send or co_recv coroutines are entered.
> So far, so good, this is how a coroutine is integrated into the main
> loop of QEMU.
>
> The problem is that this patch is mixing things. The co_recv
> coroutine runs aio_read_response(), which invokes send_pending_req().
> send_pending_req() invokes add_aio_request(). That function isn't
> suitable for co_recv's context because it actually sends data and hits
> a few blocking (yield) points. It takes a coroutine mutex - but the
> select(2) read callback is still in place. We're now still in the
> aio_read_response() call chain except we're actually not reading at
> all, we're trying to write! And we'll get spurious wakeups if there
> is any data readable on the socket.
>
> So the co_recv coroutine has two things in the system that will try to enter it:
> 1. The select(2) read callback on the sheepdog socket.
> 2. The aio_add_request() blocking operations, including a coroutine mutex.
>
> This is bad, a yielded coroutine should only have one thing that will
> enter it. It's rare that it makes sense to have multiple things
> entering a coroutine.
>
> It's late here but I hope this gives Kazutaka some thoughts on what is
> causing the issue with this patch.
Poked around some more this morning. Here is a patch that isolates the
bug:
---8<---
diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c
index 26ad76b..0d7a03f 100644
--- a/qemu-coroutine-lock.c
+++ b/qemu-coroutine-lock.c
@@ -59,6 +59,16 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
QTAILQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
qemu_coroutine_yield();
assert(qemu_in_coroutine());
+ {
+ Coroutine *co;
+
+ QTAILQ_FOREACH(co, &queue->entries, co_queue_next) {
+ assert(co != self);
+ }
+ QTAILQ_FOREACH(co, &unlock_bh_queue, co_queue_next) {
+ assert(co != self);
+ }
+ }
}
void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue)
---8<---
When you run this with ucontext or GThread coroutines you hit this
assertion failure:
qemu-system-x86_64: qemu-coroutine-lock.c:66: qemu_co_queue_wait: Assertion `co != self' failed.
Here is an explanation for what the asserts are checking and how we
violate the constraint:
qemu_co_queue_next() and qemu_co_queue_next_bh() take a waiting
Coroutine off the wait queue and put it onto the unlock bh queue. When
the BH executes, the coroutines from the unlock bh queue are popped off
that queue and entered. This is how coroutine wait queues work, and
coroutine mutexes are built on coroutine wait queues.
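Conceptually the mechanism looks like this (heavily simplified; the real code is in qemu-coroutine-lock.c, part of which is quoted in the patch above):

/* qemu_co_queue_next() / qemu_co_queue_next_bh(), in outline:
 *
 * qemu_co_queue_next(queue):
 *     co = first waiter on queue->entries;
 *     move co from queue->entries to unlock_bh_queue;
 *     qemu_bh_schedule(unlock_bh);
 *
 * unlock BH (later, from qemu_bh_poll, as in the backtrace above):
 *     for each co on unlock_bh_queue:
 *         qemu_coroutine_enter(co, NULL);   // blows up if co was freed
 */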
The problem is that, due to the spurious wakeups introduced in the
sheepdog patch, we are acquiring the mutex earlier than normal with a BH
unlock still pending. Our coroutine can actually terminate by returning
from sheepdog.c:aio_read_response() while the BH is still pending. When
we get around to executing the BH we're going to operate on a freed
coroutine - BOOM!
The reason we get different failure behavior with ucontext and
GThread is that they have different implementations and reuse
coroutines differently. We've still done something illegal but the
undefined behavior happens to be different - both ucontext and GThread
are working correctly, the problem is in the sheepdog patch.
I suggest implementing coroutine socket I/O functions for both send
*and* receive. Then sheepdog request processing becomes simple - we'll
have less callback and coroutine trickery because it's possible to write
synchronous coroutine code.
Right now the code is a mix of callbacks and some coroutine code (which
has the flaw I explained above). Things will be much easier if the code
is converted fully to coroutines.
Stefan
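
As a rough illustration of that direction, a receive helper along the lines below (hypothetical; co_read_full() is not an existing QEMU function, it is only a sketch of the suggestion) would let aio_read_response() read headers and payloads inline, with the socket's read handler as the only thing that ever re-enters the coroutine:

/* Hypothetical sketch: read exactly `count' bytes from the sheepdog
 * socket, yielding whenever the socket would block.  Assumes the read
 * fd handler re-enters this coroutine when s->fd becomes readable. */
static ssize_t coroutine_fn co_read_full(BDRVSheepdogState *s,
                                         void *buf, size_t count)
{
    size_t done = 0;

    while (done < count) {
        ssize_t ret = read(s->fd, (char *)buf + done, count - done);

        if (ret > 0) {
            done += ret;
        } else if (ret < 0 && errno == EINTR) {
            continue;                   /* interrupted, just retry */
        } else if (ret < 0 && errno == EAGAIN) {
            qemu_coroutine_yield();     /* resumed by the read handler */
        } else {
            return ret == 0 ? -EIO : -errno;
        }
    }
    return done;
}

A matching send-side helper would make add_aio_request() look like plain synchronous code running in its own coroutine, which is the simplification suggested above.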
* Re: [Qemu-devel] coroutine bug?, was Re: [PATCH] sheepdog: use coroutines
2012-01-02 15:39 ` Christoph Hellwig
2012-01-02 15:40 ` Christoph Hellwig
2012-01-02 22:38 ` Stefan Hajnoczi
@ 2012-01-03 8:16 ` Stefan Hajnoczi
2012-01-04 15:58 ` Christoph Hellwig
2 siblings, 1 reply; 15+ messages in thread
From: Stefan Hajnoczi @ 2012-01-03 8:16 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: kwolf, sheepdog, qemu-devel, MORITA Kazutaka
On Mon, Jan 02, 2012 at 04:39:59PM +0100, Christoph Hellwig wrote:
> I've tried to understand how the recursive calling happens, but unfortunately
> the whole coroutine code lacks any sort of documentation how it should
> behave or what it asserts about the callers.
There is documentation on the public coroutine functions; see
qemu-coroutine.h.
If you have specific questions I can improve the documentation. Please
let me know.
Stefan
* Re: [Qemu-devel] coroutine bug?, was Re: [PATCH] sheepdog: use coroutines
2012-01-03 8:16 ` Stefan Hajnoczi
@ 2012-01-04 15:58 ` Christoph Hellwig
2012-01-04 17:03 ` Paolo Bonzini
0 siblings, 1 reply; 15+ messages in thread
From: Christoph Hellwig @ 2012-01-04 15:58 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: kwolf, sheepdog, Christoph Hellwig, MORITA Kazutaka, qemu-devel
On Tue, Jan 03, 2012 at 08:16:55AM +0000, Stefan Hajnoczi wrote:
> On Mon, Jan 02, 2012 at 04:39:59PM +0100, Christoph Hellwig wrote:
> > I've tried to understand how the recursive calling happens, but unfortunately
> > the whole coroutine code lacks any sort of documentation how it should
> > behave or what it asserts about the callers.
>
> There is documentation on the public coroutine functions, see
> qemu-coroutine.h.
Ok, I was looking in the source files and the documentation directory only
as that is where I expected the documentation to sit.
Btw, what is the plan forward for the block I/O interface? Only
qcow2, sheepdog and nbd implement the coroutine interfaces, and none of the
hardware emulations calls them directly. Also, at least for interfaces
like the librbd callbacks, coroutines don't even seem to be that useful.
* Re: [Qemu-devel] coroutine bug?, was Re: [PATCH] sheepdog: use coroutines
2012-01-04 15:58 ` Christoph Hellwig
@ 2012-01-04 17:03 ` Paolo Bonzini
0 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2012-01-04 17:03 UTC (permalink / raw)
To: Christoph Hellwig
Cc: kwolf, sheepdog, Stefan Hajnoczi, qemu-devel
On 01/04/2012 04:58 PM, Christoph Hellwig wrote:
> Btw, what is the plan forward for the block I/O interface? Only
> qcow2, sheepdog and nbd implement the coroutine interfaces
Actually all formats implement a coroutine interface except raw, qed and
vdi (IIRC). But the "minor" formats do not implement scatter/gather
I/O, and use a bounce buffer instead.
Among the protocols, only nbd implements coroutines; the others use AIO.
> , and none of the
> hardware emulations calls them directly.
Yes, AIO is often simpler to use for hardware emulation (or anyway it
has smaller advantages). But I do expect that SD could easily switch
from synchronous to coroutines, for example. Floppy is a mess.
IMO both coroutines and AIO have their uses. Converting qed to
coroutines would make sense, but this is much less true for converting
raw and even less for rbd or iscsi (the external libraries are
callback-based).
On the other hand, changing all formats to support scatter/gather I/O
should not be hard, and would enable some nice simplifications in the
code. I have enough other cleanups on my plate, though. :/
Paolo
* Re: [Qemu-devel] coroutine bug?, was Re: [PATCH] sheepdog: use coroutines
2012-01-03 8:13 ` Stefan Hajnoczi
@ 2012-01-06 11:16 ` MORITA Kazutaka
0 siblings, 0 replies; 15+ messages in thread
From: MORITA Kazutaka @ 2012-01-06 11:16 UTC (permalink / raw)
To: Stefan Hajnoczi
Cc: kwolf, sheepdog, Christoph Hellwig, MORITA Kazutaka, qemu-devel
At Tue, 3 Jan 2012 08:13:51 +0000,
Stefan Hajnoczi wrote:
>
> On Mon, Jan 02, 2012 at 10:38:11PM +0000, Stefan Hajnoczi wrote:
> > On Mon, Jan 2, 2012 at 3:39 PM, Christoph Hellwig <hch@lst.de> wrote:
> > > On Fri, Dec 30, 2011 at 10:35:01AM +0000, Stefan Hajnoczi wrote:
> > >> If you can reproduce this bug and suspect coroutines are involved then I
> > >
> > > It's entirely reproducable.
> > >
> > > I've played around a bit and switched from the ucontext to the gthreads
> > > coroutine implementation. The result seems odd, but starts to make sense.
> > >
> > > Running the workload I now get the following message from qemu:
> > >
> > > Co-routine re-entered recursively
> > >
> > > and the gdb backtrace looks like:
> > >
> > > (gdb) bt
> > > #0 0x00007f2fff36f405 in *__GI_raise (sig=<optimized out>)
> > > at ../nptl/sysdeps/unix/sysv/linux/raise.c:64
> > > #1 0x00007f2fff372680 in *__GI_abort () at abort.c:92
> > > #2 0x00007f30019a6616 in qemu_coroutine_enter (co=0x7f3004d4d7b0, opaque=0x0)
> > > at qemu-coroutine.c:53
> > > #3 0x00007f30019a5e82 in qemu_co_queue_next_bh (opaque=<optimized out>)
> > > at qemu-coroutine-lock.c:43
> > > #4 0x00007f30018d5a72 in qemu_bh_poll () at async.c:71
> > > #5 0x00007f3001982990 in main_loop_wait (nonblocking=<optimized out>)
> > > at main-loop.c:472
> > > #6 0x00007f30018cf714 in main_loop () at /home/hch/work/qemu/vl.c:1481
> > > #7 main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>)
> > > at /home/hch/work/qemu/vl.c:3479
> > >
> > > adding some printks suggest this happens when calling add_aio_request from
> > > aio_read_response when either delaying creates, or updating metadata,
> > > although not everytime one of these cases happens.
> > >
> > > I've tried to understand how the recursive calling happens, but unfortunately
> > > the whole coroutine code lacks any sort of documentation how it should
> > > behave or what it asserts about the callers.
> > >
> > >> I don't have a sheepdog setup here but if there's an easy way to
> > >> reproduce please let me know and I'll take a look.
> > >
> > > With the small patch below applied to the sheppdog source I can reproduce
> > > the issue on my laptop using the following setup:
> > >
> > > for port in 7000 7001 7002; do
> > > mkdir -p /mnt/sheepdog/$port
> > > /usr/sbin/sheep -p $port -c local /mnt/sheepdog/$port
> > > sleep 2
> > > done
> > >
> > > collie cluster format
> > > collie vdi create test 20G
> > >
> > > then start a qemu instance that uses the the sheepdog volume using the
> > > following device and drive lines:
> > >
> > > -drive if=none,file=sheepdog:test,cache=none,id=test \
> > > -device virtio-blk-pci,drive=test,id=testdev \
> > >
> > > finally, in the guest run:
> > >
> > > dd if=/dev/zero of=/dev/vdX bs=67108864 count=128 oflag=direct
> >
> > Thanks for these instructions. I can reproduce the issue here.
> >
> > It seems suspicious the way that BDRVSheepdogState->co_recv and
> > ->co_send work. The code adds select(2) read/write callback functions
> > on the sheepdog socket file descriptor. When the socket becomes
> > writeable or readable the co_send or co_recv coroutines are entered.
> > So far, so good, this is how a coroutine is integrated into the main
> > loop of QEMU.
> >
> > The problem is that this patch is mixing things. The co_recv
> > coroutine runs aio_read_response(), which invokes send_pending_req().
> > send_pending_req() invokes add_aio_request(). That function isn't
> > suitable for co_recv's context because it actually sends data and hits
> > a few blocking (yield) points. It takes a coroutine mutex - but the
> > select(2) read callback is still in place. We're now still in the
> > aio_read_response() call chain except we're actually not reading at
> > all, we're trying to write! And we'll get spurious wakeups if there
> > is any data readable on the socket.
> >
> > So the co_recv coroutine has two things in the system that will try to enter it:
> > 1. The select(2) read callback on the sheepdog socket.
> > 2. The aio_add_request() blocking operations, including a coroutine mutex.
> >
> > This is bad, a yielded coroutine should only have one thing that will
> > enter it. It's rare that it makes sense to have multiple things
> > entering a coroutine.
> >
> > It's late here but I hope this gives Kazutaka some thoughts on what is
> > causing the issue with this patch.
>
> Poked around some more this morning. Here is a patch that isolates the
> bug:
>
> ---8<---
> diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c
> index 26ad76b..0d7a03f 100644
> --- a/qemu-coroutine-lock.c
> +++ b/qemu-coroutine-lock.c
> @@ -59,6 +59,16 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
> QTAILQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
> qemu_coroutine_yield();
> assert(qemu_in_coroutine());
> + {
> + Coroutine *co;
> +
> + QTAILQ_FOREACH(co, &queue->entries, co_queue_next) {
> + assert(co != self);
> + }
> + QTAILQ_FOREACH(co, &unlock_bh_queue, co_queue_next) {
> + assert(co != self);
> + }
> + }
> }
>
> void coroutine_fn qemu_co_queue_wait_insert_head(CoQueue *queue)
> ---8<---
>
> When you run this with ucontext or GThread coroutines you hit this
> assertion failure:
>
> qemu-system-x86_64: qemu-coroutine-lock.c:66: qemu_co_queue_wait: Assertion `co != self' failed.
>
> Here is an explanation for what the asserts are checking and how we
> violate the constraint:
>
> qemu_co_queue_next() and qemu_co_queue_next_bh() take a waiting
> Coroutine off the wait queue and put it onto the unlock bh queue. When
> the BH executes the coroutines from the unlock bh queue are popped off
> that queue and entered. This is how coroutine wait queues work, and
> coroutine mutexes are built on coroutine wait queues.
>
> The problem is that due to the spurious wakeups introduced in the
> sheepdog patch we are acquiring the mutex earlier than normal with a BH
> unlock still pending. Our coroutine can actually terminate by returning
> from sheepdog.c:aio_read_respond() while the BH is still pending. When
> we get around to executing the BH we're going to operate on a freed
> coroutine - BOOM!
Thanks for your detailed explanation. I confirmed the following hack
fixes this problem.
---8<---
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 17a79be..b27c770 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -626,6 +626,9 @@ static void coroutine_fn aio_read_response(void *opaque)
switch (acb->aiocb_type) {
case AIOCB_WRITE_UDATA:
+ /* this coroutine context is no longer suitable for co_recv
+ * because we may send data to update vdi objects */
+ s->co_recv = NULL;
if (!is_data_obj(aio_req->oid)) {
break;
}
---8<---
>
> The reason why we get different failure behavior with ucontext and
> GThread is because they have different implementations and reuse
> coroutines different. We've still done something illegal but the
> undefined behavior happens to be different - both ucontext and GThread
> are working correctly, the problem is in the sheepdog patch.
>
> I suggest implementing coroutine socket I/O functions for both send
> *and* receive. Then sheepdog request processing becomes simple - we'll
> have less callback and coroutine trickery because it's possible to write
> synchronous coroutine code.
>
> Right now the code is a mix of callbacks and some coroutine code (which
> has the flaw I explained above). Things will be much easier if the code
> is converted fully to coroutines.
Yes, the original patch aimed to make Sheepdog I/O asynchronous with a
small change, so the current sheepdog code is not clean. It looks
like a good time to make the sheepdog driver fully coroutine-based.
Thanks,
Kazutaka