qemu-devel.nongnu.org archive mirror
* [Qemu-devel] [PATCH RFC 0/3] xen pv disk persistent grants implementation
@ 2012-12-31 12:16 Roger Pau Monne
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs Roger Pau Monne
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Roger Pau Monne @ 2012-12-31 12:16 UTC (permalink / raw)
  To: qemu-devel

This series contains two bug fixes for xen_disk (patches 1 & 2) and
the implementation of the persistent grants extension (patch 3), which
brings a considerable speed improvement.

Thanks for the reviews, Roger.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [Qemu-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs
  2012-12-31 12:16 [Qemu-devel] [PATCH RFC 0/3] xen pv disk persistent grants implementation Roger Pau Monne
@ 2012-12-31 12:16 ` Roger Pau Monne
  2013-01-03 14:21   ` [Qemu-devel] [Xen-devel] " Konrad Rzeszutek Wilk
  2013-01-03 14:28   ` Ian Campbell
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 2/3] xen_disk: fix memory leak Roger Pau Monne
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend Roger Pau Monne
  2 siblings, 2 replies; 14+ messages in thread
From: Roger Pau Monne @ 2012-12-31 12:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony PERARD, xen-devel, Stefano Stabellini, Roger Pau Monne

Files that reside on ramfs or tmpfs cannot be opened with O_DIRECT;
if the first call to bdrv_open fails with errno = EINVAL, try a second
call without BDRV_O_NOCACHE.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Cc: xen-devel@lists.xen.org
Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
Cc: Anthony PERARD <anthony.perard@citrix.com>
---
 hw/xen_disk.c |   16 +++++++++++++---
 1 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/hw/xen_disk.c b/hw/xen_disk.c
index e6bb2f2..a159ee5 100644
--- a/hw/xen_disk.c
+++ b/hw/xen_disk.c
@@ -562,7 +562,7 @@ static void blk_alloc(struct XenDevice *xendev)
 static int blk_init(struct XenDevice *xendev)
 {
     struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
-    int index, qflags, info = 0;
+    int index, qflags, info = 0, rc;
 
     /* read xenstore entries */
     if (blkdev->params == NULL) {
@@ -625,8 +625,18 @@ static int blk_init(struct XenDevice *xendev)
         xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
         blkdev->bs = bdrv_new(blkdev->dev);
         if (blkdev->bs) {
-            if (bdrv_open(blkdev->bs, blkdev->filename, qflags,
-                        bdrv_find_whitelisted_format(blkdev->fileproto)) != 0) {
+            rc = bdrv_open(blkdev->bs, blkdev->filename, qflags,
+                        bdrv_find_whitelisted_format(blkdev->fileproto));
+            if (rc != 0 && errno == EINVAL) {
+                /* Files on ramfs or tmpfs cannot be opened with O_DIRECT,
+                 * remove the BDRV_O_NOCACHE flag, and try to open
+                 * the file again.
+                 */
+                qflags &= ~BDRV_O_NOCACHE;
+                rc = bdrv_open(blkdev->bs, blkdev->filename, qflags,
+                        bdrv_find_whitelisted_format(blkdev->fileproto));
+            }
+            if (rc != 0) {
                 bdrv_delete(blkdev->bs);
                 blkdev->bs = NULL;
             }
-- 
1.7.7.5 (Apple Git-26)

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [Qemu-devel] [PATCH RFC 2/3] xen_disk: fix memory leak
  2012-12-31 12:16 [Qemu-devel] [PATCH RFC 0/3] xen pv disk persistent grants implementation Roger Pau Monne
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs Roger Pau Monne
@ 2012-12-31 12:16 ` Roger Pau Monne
  2013-01-04 15:06   ` Stefano Stabellini
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend Roger Pau Monne
  2 siblings, 1 reply; 14+ messages in thread
From: Roger Pau Monne @ 2012-12-31 12:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony PERARD, xen-devel, Stefano Stabellini, Roger Pau Monne

On ioreq_release the full ioreq was memset to 0, losing all the data
and memory allocations inside the QEMUIOVector, which leads to a
memory leak. Create a new function to specifically reset ioreq.

Reported-by: Maik Wessler <maik.wessler@yahoo.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Cc: xen-devel@lists.xen.org
Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
Cc: Anthony PERARD <anthony.perard@citrix.com>
---
 hw/xen_disk.c |   28 ++++++++++++++++++++++++++--
 1 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/hw/xen_disk.c b/hw/xen_disk.c
index a159ee5..1eb485a 100644
--- a/hw/xen_disk.c
+++ b/hw/xen_disk.c
@@ -113,6 +113,31 @@ struct XenBlkDev {
 
 /* ------------------------------------------------------------- */
 
+static void ioreq_reset(struct ioreq *ioreq)
+{
+    memset(&ioreq->req, 0, sizeof(ioreq->req));
+    ioreq->status = 0;
+    ioreq->start = 0;
+    ioreq->presync = 0;
+    ioreq->postsync = 0;
+    ioreq->mapped = 0;
+
+    memset(ioreq->domids, 0, sizeof(ioreq->domids));
+    memset(ioreq->refs, 0, sizeof(ioreq->refs));
+    ioreq->prot = 0;
+    memset(ioreq->page, 0, sizeof(ioreq->page));
+    ioreq->pages = NULL;
+
+    ioreq->aio_inflight = 0;
+    ioreq->aio_errors = 0;
+
+    ioreq->blkdev = NULL;
+    memset(&ioreq->list, 0, sizeof(ioreq->list));
+    memset(&ioreq->acct, 0, sizeof(ioreq->acct));
+
+    qemu_iovec_reset(&ioreq->v);
+}
+
 static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
 {
     struct ioreq *ioreq = NULL;
@@ -130,7 +155,6 @@ static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
         /* get one from freelist */
         ioreq = QLIST_FIRST(&blkdev->freelist);
         QLIST_REMOVE(ioreq, list);
-        qemu_iovec_reset(&ioreq->v);
     }
     QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
     blkdev->requests_inflight++;
@@ -154,7 +178,7 @@ static void ioreq_release(struct ioreq *ioreq, bool finish)
     struct XenBlkDev *blkdev = ioreq->blkdev;
 
     QLIST_REMOVE(ioreq, list);
-    memset(ioreq, 0, sizeof(*ioreq));
+    ioreq_reset(ioreq);
     ioreq->blkdev = blkdev;
     QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
     if (finish) {
-- 
1.7.7.5 (Apple Git-26)

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend
  2012-12-31 12:16 [Qemu-devel] [PATCH RFC 0/3] xen pv disk persistent grants implementation Roger Pau Monne
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs Roger Pau Monne
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 2/3] xen_disk: fix memory leak Roger Pau Monne
@ 2012-12-31 12:16 ` Roger Pau Monne
  2013-01-04 16:42   ` Stefano Stabellini
  2013-01-04 21:01   ` Blue Swirl
  2 siblings, 2 replies; 14+ messages in thread
From: Roger Pau Monne @ 2012-12-31 12:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Anthony PERARD, xen-devel, Stefano Stabellini, Roger Pau Monne

This protocol extension reuses the same set of grant pages for all
transactions between the front/back drivers, avoiding expensive tlb
flushes, grant table lock contention and switches between userspace
and kernel space. The full description of the protocol can be found in
the public blkif.h header.

Speed improvement with 15 guests performing I/O is ~450%.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Cc: xen-devel@lists.xen.org
Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
Cc: Anthony PERARD <anthony.perard@citrix.com>
---
Performance comparison with the previous implementation can be seen in
the following graph:

http://xenbits.xen.org/people/royger/persistent_read_qemu.png
---
 hw/xen_disk.c |  155 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 138 insertions(+), 17 deletions(-)

diff --git a/hw/xen_disk.c b/hw/xen_disk.c
index 1eb485a..bafeceb 100644
--- a/hw/xen_disk.c
+++ b/hw/xen_disk.c
@@ -52,6 +52,11 @@ static int max_requests = 32;
 #define BLOCK_SIZE  512
 #define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
 
+struct persistent_gnt {
+    void *page;
+    struct XenBlkDev *blkdev;
+};
+
 struct ioreq {
     blkif_request_t     req;
     int16_t             status;
@@ -69,6 +74,7 @@ struct ioreq {
     int                 prot;
     void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
     void                *pages;
+    int                 num_unmap;
 
     /* aio status */
     int                 aio_inflight;
@@ -105,6 +111,12 @@ struct XenBlkDev {
     int                 requests_inflight;
     int                 requests_finished;
 
+    /* Persistent grants extension */
+    gboolean            feature_persistent;
+    GTree               *persistent_gnts;
+    unsigned int        persistent_gnt_c;
+    unsigned int        max_grants;
+
     /* qemu block driver */
     DriveInfo           *dinfo;
     BlockDriverState    *bs;
@@ -138,6 +150,29 @@ static void ioreq_reset(struct ioreq *ioreq)
     qemu_iovec_reset(&ioreq->v);
 }
 
+static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
+{
+    uint ua = GPOINTER_TO_UINT(a);
+    uint ub = GPOINTER_TO_UINT(b);
+    return (ua > ub) - (ua < ub);
+}
+
+static void destroy_grant(gpointer pgnt)
+{
+    struct persistent_gnt *grant = pgnt;
+    XenGnttab gnt = grant->blkdev->xendev.gnttabdev;
+
+    if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
+        xen_be_printf(&grant->blkdev->xendev, 0,
+                      "xc_gnttab_munmap failed: %s\n",
+                      strerror(errno));
+    }
+    grant->blkdev->persistent_gnt_c--;
+    xen_be_printf(&grant->blkdev->xendev, 3,
+                  "unmapped grant %p\n", grant->page);
+    g_free(grant);
+}
+
 static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
 {
     struct ioreq *ioreq = NULL;
@@ -266,21 +301,21 @@ static void ioreq_unmap(struct ioreq *ioreq)
     XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
     int i;
 
-    if (ioreq->v.niov == 0 || ioreq->mapped == 0) {
+    if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
         return;
     }
     if (batch_maps) {
         if (!ioreq->pages) {
             return;
         }
-        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
+        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
             xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
                           strerror(errno));
         }
-        ioreq->blkdev->cnt_map -= ioreq->v.niov;
+        ioreq->blkdev->cnt_map -= ioreq->num_unmap;
         ioreq->pages = NULL;
     } else {
-        for (i = 0; i < ioreq->v.niov; i++) {
+        for (i = 0; i < ioreq->num_unmap; i++) {
             if (!ioreq->page[i]) {
                 continue;
             }
@@ -298,41 +333,107 @@ static void ioreq_unmap(struct ioreq *ioreq)
 static int ioreq_map(struct ioreq *ioreq)
 {
     XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
-    int i;
+    uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    int i, j, new_maps = 0;
+    struct persistent_gnt *grant;
 
     if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
         return 0;
     }
-    if (batch_maps) {
+    if (ioreq->blkdev->feature_persistent) {
+        for (i = 0; i < ioreq->v.niov; i++) {
+            grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
+                                    GUINT_TO_POINTER(ioreq->refs[i]));
+
+            if (grant != NULL) {
+                page[i] = grant->page;
+                xen_be_printf(&ioreq->blkdev->xendev, 3,
+                              "using persistent-grant %" PRIu32 "\n",
+                              ioreq->refs[i]);
+            } else {
+                    /* Add the grant to the list of grants that
+                     * should be mapped
+                     */
+                    domids[new_maps] = ioreq->domids[i];
+                    refs[new_maps] = ioreq->refs[i];
+                    page[i] = NULL;
+                    new_maps++;
+            }
+        }
+        /* Set the protection to RW, since grants may be reused later
+         * with a different protection than the one needed for this request
+         */
+        ioreq->prot = PROT_WRITE | PROT_READ;
+    } else {
+        /* All grants in the request should be mapped */
+        memcpy(refs, ioreq->refs, sizeof(refs));
+        memcpy(domids, ioreq->domids, sizeof(domids));
+        memset(page, 0, sizeof(page));
+        new_maps = ioreq->v.niov;
+    }
+
+    if (batch_maps && new_maps) {
         ioreq->pages = xc_gnttab_map_grant_refs
-            (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
+            (gnt, new_maps, domids, refs, ioreq->prot);
         if (ioreq->pages == NULL) {
             xen_be_printf(&ioreq->blkdev->xendev, 0,
                           "can't map %d grant refs (%s, %d maps)\n",
-                          ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
+                          new_maps, strerror(errno), ioreq->blkdev->cnt_map);
             return -1;
         }
-        for (i = 0; i < ioreq->v.niov; i++) {
-            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
-                (uintptr_t)ioreq->v.iov[i].iov_base;
+        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
+            if (page[i] == NULL)
+                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
         }
-        ioreq->blkdev->cnt_map += ioreq->v.niov;
-    } else  {
-        for (i = 0; i < ioreq->v.niov; i++) {
+        ioreq->blkdev->cnt_map += new_maps;
+    } else if (new_maps)  {
+        for (i = 0; i < new_maps; i++) {
             ioreq->page[i] = xc_gnttab_map_grant_ref
-                (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
+                (gnt, domids[i], refs[i], ioreq->prot);
             if (ioreq->page[i] == NULL) {
                 xen_be_printf(&ioreq->blkdev->xendev, 0,
                               "can't map grant ref %d (%s, %d maps)\n",
-                              ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
+                              refs[i], strerror(errno), ioreq->blkdev->cnt_map);
                 ioreq_unmap(ioreq);
                 return -1;
             }
-            ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
             ioreq->blkdev->cnt_map++;
         }
+        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
+            if (page[i] == NULL)
+                page[i] = ioreq->page[j++];
+        }
+    }
+    if (ioreq->blkdev->feature_persistent) {
+        while((ioreq->blkdev->persistent_gnt_c < ioreq->blkdev->max_grants) &&
+              new_maps) {
+            /* Go through the list of newly mapped grants and add as many
+             * as possible to the list of persistently mapped grants
+             */
+            grant = g_malloc0(sizeof(*grant));
+            new_maps--;
+            if (batch_maps)
+                grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
+            else
+                grant->page = ioreq->page[new_maps];
+            grant->blkdev = ioreq->blkdev;
+            xen_be_printf(&ioreq->blkdev->xendev, 3,
+                          "adding grant %" PRIu32 " page: %p\n",
+                          refs[new_maps], grant->page);
+            g_tree_insert(ioreq->blkdev->persistent_gnts,
+                          GUINT_TO_POINTER(refs[new_maps]),
+                          grant);
+            ioreq->blkdev->persistent_gnt_c++;
+        }
+    }
+    for (i = 0; i < ioreq->v.niov; i++) {
+        ioreq->v.iov[i].iov_base = page[i] +
+                                   (uintptr_t)ioreq->v.iov[i].iov_base;
     }
     ioreq->mapped = 1;
+    ioreq->num_unmap = new_maps;
     return 0;
 }
 
@@ -690,6 +791,7 @@ static int blk_init(struct XenDevice *xendev)
 
     /* fill info */
     xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1);
+    xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1);
     xenstore_write_be_int(&blkdev->xendev, "info",            info);
     xenstore_write_be_int(&blkdev->xendev, "sector-size",     blkdev->file_blk);
     xenstore_write_be_int(&blkdev->xendev, "sectors",
@@ -713,6 +815,7 @@ out_error:
 static int blk_connect(struct XenDevice *xendev)
 {
     struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
+    int pers;
 
     if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
         return -1;
@@ -721,6 +824,11 @@ static int blk_connect(struct XenDevice *xendev)
                              &blkdev->xendev.remote_port) == -1) {
         return -1;
     }
+    if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) {
+        blkdev->feature_persistent = FALSE;
+    } else {
+        blkdev->feature_persistent = !!pers;
+    }
 
     blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
     if (blkdev->xendev.protocol) {
@@ -764,6 +872,15 @@ static int blk_connect(struct XenDevice *xendev)
     }
     }
 
+    if (blkdev->feature_persistent) {
+        /* Init persistent grants */
+        blkdev->max_grants = max_requests * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+        blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
+                                             NULL, NULL,
+                                             (GDestroyNotify)destroy_grant);
+        blkdev->persistent_gnt_c = 0;
+    }
+
     xen_be_bind_evtchn(&blkdev->xendev);
 
     xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
@@ -804,6 +921,10 @@ static int blk_free(struct XenDevice *xendev)
         blk_disconnect(xendev);
     }
 
+    /* Free persistent grants */
+    if (blkdev->feature_persistent)
+        g_tree_destroy(blkdev->persistent_gnts);
+
     while (!QLIST_EMPTY(&blkdev->freelist)) {
         ioreq = QLIST_FIRST(&blkdev->freelist);
         QLIST_REMOVE(ioreq, list);
-- 
1.7.7.5 (Apple Git-26)

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [Xen-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs Roger Pau Monne
@ 2013-01-03 14:21   ` Konrad Rzeszutek Wilk
  2013-01-03 14:28   ` Ian Campbell
  1 sibling, 0 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2013-01-03 14:21 UTC (permalink / raw)
  To: Roger Pau Monne; +Cc: Anthony PERARD, Stefano Stabellini, qemu-devel, xen-devel

On Mon, Dec 31, 2012 at 01:16:12PM +0100, Roger Pau Monne wrote:
> Files that reside on ramfs or tmpfs cannot be opened with O_DIRECT;

That is not entirely true. There are patches floating around (LKML)
to make tmpfs/ramfs able to do this.

> if the first call to bdrv_open fails with errno = EINVAL, try a second
> call without BDRV_O_NOCACHE.
> 
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> Cc: xen-devel@lists.xen.org
> Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
> Cc: Anthony PERARD <anthony.perard@citrix.com>
> ---
>  hw/xen_disk.c |   16 +++++++++++++---
>  1 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/xen_disk.c b/hw/xen_disk.c
> index e6bb2f2..a159ee5 100644
> --- a/hw/xen_disk.c
> +++ b/hw/xen_disk.c
> @@ -562,7 +562,7 @@ static void blk_alloc(struct XenDevice *xendev)
>  static int blk_init(struct XenDevice *xendev)
>  {
>      struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
> -    int index, qflags, info = 0;
> +    int index, qflags, info = 0, rc;
>  
>      /* read xenstore entries */
>      if (blkdev->params == NULL) {
> @@ -625,8 +625,18 @@ static int blk_init(struct XenDevice *xendev)
>          xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
>          blkdev->bs = bdrv_new(blkdev->dev);
>          if (blkdev->bs) {
> -            if (bdrv_open(blkdev->bs, blkdev->filename, qflags,
> -                        bdrv_find_whitelisted_format(blkdev->fileproto)) != 0) {
> +            rc = bdrv_open(blkdev->bs, blkdev->filename, qflags,
> +                        bdrv_find_whitelisted_format(blkdev->fileproto));
> +            if (rc != 0 && errno == EINVAL) {
> +                /* Files on ramfs or tmpfs cannot be opened with O_DIRECT,
> +                 * remove the BDRV_O_NOCACHE flag, and try to open
> +                 * the file again.
> +                 */
> +                qflags &= ~BDRV_O_NOCACHE;
> +                rc = bdrv_open(blkdev->bs, blkdev->filename, qflags,
> +                        bdrv_find_whitelisted_format(blkdev->fileproto));
> +            }
> +            if (rc != 0) {
>                  bdrv_delete(blkdev->bs);
>                  blkdev->bs = NULL;
>              }
> -- 
> 1.7.7.5 (Apple Git-26)
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> http://lists.xen.org/xen-devel
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [Xen-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs Roger Pau Monne
  2013-01-03 14:21   ` [Qemu-devel] [Xen-devel] " Konrad Rzeszutek Wilk
@ 2013-01-03 14:28   ` Ian Campbell
  2013-01-04 14:54     ` Stefano Stabellini
  1 sibling, 1 reply; 14+ messages in thread
From: Ian Campbell @ 2013-01-03 14:28 UTC (permalink / raw)
  To: Roger Pau Monne
  Cc: Anthony Perard, Stefano Stabellini, qemu-devel@nongnu.org,
	xen-devel@lists.xen.org

On Mon, 2012-12-31 at 12:16 +0000, Roger Pau Monne wrote:
> Files that reside on ramfs or tmpfs cannot be opened with O_DIRECT;
> if the first call to bdrv_open fails with errno = EINVAL, try a second
> call without BDRV_O_NOCACHE.

Doesn't that risk spuriously turning off NOCACHE on other sorts of
devices as well, which (potentially) opens up a data loss issue?

> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> Cc: xen-devel@lists.xen.org
> Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
> Cc: Anthony PERARD <anthony.perard@citrix.com>
> ---
>  hw/xen_disk.c |   16 +++++++++++++---
>  1 files changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/xen_disk.c b/hw/xen_disk.c
> index e6bb2f2..a159ee5 100644
> --- a/hw/xen_disk.c
> +++ b/hw/xen_disk.c
> @@ -562,7 +562,7 @@ static void blk_alloc(struct XenDevice *xendev)
>  static int blk_init(struct XenDevice *xendev)
>  {
>      struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
> -    int index, qflags, info = 0;
> +    int index, qflags, info = 0, rc;
>  
>      /* read xenstore entries */
>      if (blkdev->params == NULL) {
> @@ -625,8 +625,18 @@ static int blk_init(struct XenDevice *xendev)
>          xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
>          blkdev->bs = bdrv_new(blkdev->dev);
>          if (blkdev->bs) {
> -            if (bdrv_open(blkdev->bs, blkdev->filename, qflags,
> -                        bdrv_find_whitelisted_format(blkdev->fileproto)) != 0) {
> +            rc = bdrv_open(blkdev->bs, blkdev->filename, qflags,
> +                        bdrv_find_whitelisted_format(blkdev->fileproto));
> +            if (rc != 0 && errno == EINVAL) {
> +                /* Files on ramfs or tmpfs cannot be opened with O_DIRECT,
> +                 * remove the BDRV_O_NOCACHE flag, and try to open
> +                 * the file again.
> +                 */
> +                qflags &= ~BDRV_O_NOCACHE;
> +                rc = bdrv_open(blkdev->bs, blkdev->filename, qflags,
> +                        bdrv_find_whitelisted_format(blkdev->fileproto));
> +            }
> +            if (rc != 0) {
>                  bdrv_delete(blkdev->bs);
>                  blkdev->bs = NULL;
>              }

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [Xen-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs
  2013-01-03 14:28   ` Ian Campbell
@ 2013-01-04 14:54     ` Stefano Stabellini
  2013-01-04 15:05       ` Roger Pau Monné
  0 siblings, 1 reply; 14+ messages in thread
From: Stefano Stabellini @ 2013-01-04 14:54 UTC (permalink / raw)
  To: Ian Campbell
  Cc: Anthony Perard, xen-devel@lists.xen.org, qemu-devel@nongnu.org,
	Stefano Stabellini, Roger Pau Monne

[-- Attachment #1: Type: text/plain, Size: 3182 bytes --]

On Thu, 3 Jan 2013, Ian Campbell wrote:
> On Mon, 2012-12-31 at 12:16 +0000, Roger Pau Monne wrote:
> > Files that reside on ramfs or tmpfs cannot be opened with O_DIRECT;
> > if the first call to bdrv_open fails with errno = EINVAL, try a second
> > call without BDRV_O_NOCACHE.
> 
> Doesn't that risk spuriously turning off NOCACHE on other sorts of
> devices as well, which (potentially) opens up a data loss issue?

I agree, we shouldn't make this kind of critical configuration change
behind the user's back.

I would rather let the user set the cache attributes; QEMU already has a
command line option for it, but we can't use it directly because
xen_disk gets its configuration solely from xenstore at the moment.

I guess we could add a key=value pair cache=foobar to the xl disk
configuration spec, which would somehow be translated into a key on
xenstore. Xen_disk would read the key and set qflags accordingly.
We could use the same cache parameters supported by QEMU, see
bdrv_parse_cache_flags.
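
For illustration only, a minimal sketch of what the xen_disk side could
look like, assuming a hypothetical "cache" backend node in xenstore (the
node name and how libxl would write it are made up here):

    /* in blk_init(), after qflags has been set up from "mode" */
    char *cache = xenstore_read_be_str(&blkdev->xendev, "cache");
    if (cache != NULL) {
        if (bdrv_parse_cache_flags(cache, &qflags) != 0) {
            xen_be_printf(&blkdev->xendev, 0,
                          "invalid cache mode: %s\n", cache);
        }
        g_free(cache);
    }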

As an alternative, we could reuse the already defined "access" key, like
this:

access=rw|nocache

or

access=rw|unsafe
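
For example, with the second alternative an xl disk line might look
something like this (the |nocache modifier is only proposed syntax, xl
does not accept it today):

    disk = [ 'format=raw, vdev=xvda, access=rw|nocache, target=/var/lib/xen/images/guest.img' ]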


> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> > Cc: xen-devel@lists.xen.org
> > Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
> > Cc: Anthony PERARD <anthony.perard@citrix.com>
> > ---
> >  hw/xen_disk.c |   16 +++++++++++++---
> >  1 files changed, 13 insertions(+), 3 deletions(-)
> > 
> > diff --git a/hw/xen_disk.c b/hw/xen_disk.c
> > index e6bb2f2..a159ee5 100644
> > --- a/hw/xen_disk.c
> > +++ b/hw/xen_disk.c
> > @@ -562,7 +562,7 @@ static void blk_alloc(struct XenDevice *xendev)
> >  static int blk_init(struct XenDevice *xendev)
> >  {
> >      struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
> > -    int index, qflags, info = 0;
> > +    int index, qflags, info = 0, rc;
> >  
> >      /* read xenstore entries */
> >      if (blkdev->params == NULL) {
> > @@ -625,8 +625,18 @@ static int blk_init(struct XenDevice *xendev)
> >          xen_be_printf(&blkdev->xendev, 2, "create new bdrv (xenbus setup)\n");
> >          blkdev->bs = bdrv_new(blkdev->dev);
> >          if (blkdev->bs) {
> > -            if (bdrv_open(blkdev->bs, blkdev->filename, qflags,
> > -                        bdrv_find_whitelisted_format(blkdev->fileproto)) != 0) {
> > +            rc = bdrv_open(blkdev->bs, blkdev->filename, qflags,
> > +                        bdrv_find_whitelisted_format(blkdev->fileproto));
> > +            if (rc != 0 && errno == EINVAL) {
> > +                /* Files on ramfs or tmpfs cannot be opened with O_DIRECT,
> > +                 * remove the BDRV_O_NOCACHE flag, and try to open
> > +                 * the file again.
> > +                 */
> > +                qflags &= ~BDRV_O_NOCACHE;
> > +                rc = bdrv_open(blkdev->bs, blkdev->filename, qflags,
> > +                        bdrv_find_whitelisted_format(blkdev->fileproto));
> > +            }
> > +            if (rc != 0) {
> >                  bdrv_delete(blkdev->bs);
> >                  blkdev->bs = NULL;
> >              }
> 
> 
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [Xen-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs
  2013-01-04 14:54     ` Stefano Stabellini
@ 2013-01-04 15:05       ` Roger Pau Monné
  2013-01-04 15:30         ` Stefano Stabellini
  0 siblings, 1 reply; 14+ messages in thread
From: Roger Pau Monné @ 2013-01-04 15:05 UTC (permalink / raw)
  To: Stefano Stabellini
  Cc: Anthony Perard, xen-devel@lists.xen.org, Ian Campbell,
	qemu-devel@nongnu.org

On 04/01/13 15:54, Stefano Stabellini wrote:
> On Thu, 3 Jan 2013, Ian Campbell wrote:
>> On Mon, 2012-12-31 at 12:16 +0000, Roger Pau Monne wrote:
>>> Files that reside on ramfs or tmpfs cannot be opened with O_DIRECT;
>>> if the first call to bdrv_open fails with errno = EINVAL, try a second
>>> call without BDRV_O_NOCACHE.
>>
>> Doesn't that risk spuriously turning off NOCACHE on other sorts of
>> devices as well, which (potentially) opens up a data loss issue?
> 
> I agree, we shouldn't make this kind of critical configuration change
> behind the user's back.
> 
> I would rather let the user set the cache attributes; QEMU already has a
> command line option for it, but we can't use it directly because
> xen_disk gets its configuration solely from xenstore at the moment.
> 
> I guess we could add a key=value pair cache=foobar to the xl disk
> configuration spec, which would somehow be translated into a key on
> xenstore. Xen_disk would read the key and set qflags accordingly.
> We could use the same cache parameters supported by QEMU, see
> bdrv_parse_cache_flags.
> 
> As an alternative, we could reuse the already defined "access" key, like
> this:
> 
> access=rw|nocache
> 
> or
> 
> access=rw|unsafe

I needed this patch to be able to perform the benchmarks for the
persistent grants implementation, but I realize this is not the best way
to solve this problem.

It might be worth thinking of a good way to pass more information to the
qdisk backend (not only limited to whether O_DIRECT should be used or
not), so we can take advantage in the future of all the possible file
backends that Qemu supports, like GlusterFS or SheepDog.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH RFC 2/3] xen_disk: fix memory leak
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 2/3] xen_disk: fix memory leak Roger Pau Monne
@ 2013-01-04 15:06   ` Stefano Stabellini
  0 siblings, 0 replies; 14+ messages in thread
From: Stefano Stabellini @ 2013-01-04 15:06 UTC (permalink / raw)
  To: Roger Pau Monne
  Cc: Anthony Perard, Stefano Stabellini, qemu-devel@nongnu.org,
	xen-devel@lists.xen.org

[-- Attachment #1: Type: text/plain, Size: 2558 bytes --]

On Mon, 31 Dec 2012, Roger Pau Monne wrote:
> On ioreq_release the full ioreq was memset to 0, losing all the data
> and memory allocations inside the QEMUIOVector, which leads to a
> memory leak. Create a new function to specifically reset ioreq.
> 
> Reported-by: Maik Wessler <maik.wessler@yahoo.com>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> Cc: xen-devel@lists.xen.org
> Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
> Cc: Anthony PERARD <anthony.perard@citrix.com>
>

Nice catch!

Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>

> ---
>  hw/xen_disk.c |   28 ++++++++++++++++++++++++++--
>  1 files changed, 26 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/xen_disk.c b/hw/xen_disk.c
> index a159ee5..1eb485a 100644
> --- a/hw/xen_disk.c
> +++ b/hw/xen_disk.c
> @@ -113,6 +113,31 @@ struct XenBlkDev {
>  
>  /* ------------------------------------------------------------- */
>  
> +static void ioreq_reset(struct ioreq *ioreq)
> +{
> +    memset(&ioreq->req, 0, sizeof(ioreq->req));
> +    ioreq->status = 0;
> +    ioreq->start = 0;
> +    ioreq->presync = 0;
> +    ioreq->postsync = 0;
> +    ioreq->mapped = 0;
> +
> +    memset(ioreq->domids, 0, sizeof(ioreq->domids));
> +    memset(ioreq->refs, 0, sizeof(ioreq->refs));
> +    ioreq->prot = 0;
> +    memset(ioreq->page, 0, sizeof(ioreq->page));
> +    ioreq->pages = NULL;
> +
> +    ioreq->aio_inflight = 0;
> +    ioreq->aio_errors = 0;
> +
> +    ioreq->blkdev = NULL;
> +    memset(&ioreq->list, 0, sizeof(ioreq->list));
> +    memset(&ioreq->acct, 0, sizeof(ioreq->acct));
> +
> +    qemu_iovec_reset(&ioreq->v);
> +}
> +
>  static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
>  {
>      struct ioreq *ioreq = NULL;
> @@ -130,7 +155,6 @@ static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
>          /* get one from freelist */
>          ioreq = QLIST_FIRST(&blkdev->freelist);
>          QLIST_REMOVE(ioreq, list);
> -        qemu_iovec_reset(&ioreq->v);
>      }
>      QLIST_INSERT_HEAD(&blkdev->inflight, ioreq, list);
>      blkdev->requests_inflight++;
> @@ -154,7 +178,7 @@ static void ioreq_release(struct ioreq *ioreq, bool finish)
>      struct XenBlkDev *blkdev = ioreq->blkdev;
>  
>      QLIST_REMOVE(ioreq, list);
> -    memset(ioreq, 0, sizeof(*ioreq));
> +    ioreq_reset(ioreq);
>      ioreq->blkdev = blkdev;
>      QLIST_INSERT_HEAD(&blkdev->freelist, ioreq, list);
>      if (finish) {
> -- 
> 1.7.7.5 (Apple Git-26)
> 
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [Xen-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs
  2013-01-04 15:05       ` Roger Pau Monné
@ 2013-01-04 15:30         ` Stefano Stabellini
  0 siblings, 0 replies; 14+ messages in thread
From: Stefano Stabellini @ 2013-01-04 15:30 UTC (permalink / raw)
  To: Roger Pau Monne
  Cc: qemu-devel@nongnu.org, Anthony Perard, xen-devel@lists.xen.org,
	Ian Campbell, Stefano Stabellini

On Fri, 4 Jan 2013, Roger Pau Monne wrote:
> On 04/01/13 15:54, Stefano Stabellini wrote:
> > On Thu, 3 Jan 2013, Ian Campbell wrote:
> >> On Mon, 2012-12-31 at 12:16 +0000, Roger Pau Monne wrote:
> >>> Files that reside on ramfs or tmpfs cannot be opened with O_DIRECT;
> >>> if the first call to bdrv_open fails with errno = EINVAL, try a second
> >>> call without BDRV_O_NOCACHE.
> >>
> >> Doesn't that risk spuriously turning off NOCACHE on other sorts of
> >> devices as well, which (potentially) opens up a data loss issue?
> > 
> > I agree, we shouldn't make this kind of critical configuration change
> > behind the user's back.
> > 
> > I would rather let the user set the cache attributes; QEMU already has a
> > command line option for it, but we can't use it directly because
> > xen_disk gets its configuration solely from xenstore at the moment.
> > 
> > I guess we could add a key=value pair cache=foobar to the xl disk
> > configuration spec, which would somehow be translated into a key on
> > xenstore. Xen_disk would read the key and set qflags accordingly.
> > We could use the same cache parameters supported by QEMU, see
> > bdrv_parse_cache_flags.
> > 
> > As an alternative, we could reuse the already defined "access" key, like
> > this:
> > 
> > access=rw|nocache
> > 
> > or
> > 
> > access=rw|unsafe
> 
> I needed this patch to be able to perform the benchmarks for the
> persistent grants implementation, but I realize this is not the best way
> to solve this problem.
> 
> It might be worth thinking of a good way to pass more information to the
> qdisk backend (not only limited to whether O_DIRECT should be used or
> not), so we can take advantage in the future of all the possible file
> backends that Qemu supports, like GlusterFS or SheepDog.

Yes, you are right.
However, in the Xen world QEMU is never invoked directly, always via
libxl. So it is natural that whatever we want to expose to the user has
to go through libxl. I suppose that GlusterFS and SheepDog are no
exception: they would be just another key=value pair or just another
value in the xl disk config line.
At this point it doesn't matter that much how we pass these parameters
from libxl to QEMU: at the moment everything is done via xenstore, so we
might as well keep doing it that way.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend Roger Pau Monne
@ 2013-01-04 16:42   ` Stefano Stabellini
  2013-01-04 17:37     ` Roger Pau Monné
  2013-01-04 21:01   ` Blue Swirl
  1 sibling, 1 reply; 14+ messages in thread
From: Stefano Stabellini @ 2013-01-04 16:42 UTC (permalink / raw)
  To: Roger Pau Monne
  Cc: Anthony Perard, Konrad Rzeszutek Wilk, Stefano Stabellini,
	qemu-devel@nongnu.org, xen-devel@lists.xen.org

[-- Attachment #1: Type: text/plain, Size: 13438 bytes --]

On Mon, 31 Dec 2012, Roger Pau Monne wrote:
> This protocol extension reuses the same set of grant pages for all
> transactions between the front/back drivers, avoiding expensive tlb
> flushes, grant table lock contention and switches between userspace
> and kernel space. The full description of the protocol can be found in
> the public blkif.h header.

it would be great if you could add a link to a webpage too


> Speed improvement with 15 guests performing I/O is ~450%.
> 
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> Cc: xen-devel@lists.xen.org
> Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
> Cc: Anthony PERARD <anthony.perard@citrix.com>
> ---
> Performance comparison with the previous implementation can be seen in
> the following graph:
> 
> http://xenbits.xen.org/people/royger/persistent_read_qemu.png

this is what I like to see!


>  hw/xen_disk.c |  155 ++++++++++++++++++++++++++++++++++++++++++++++++++------
>  1 files changed, 138 insertions(+), 17 deletions(-)
> 
> diff --git a/hw/xen_disk.c b/hw/xen_disk.c
> index 1eb485a..bafeceb 100644
> --- a/hw/xen_disk.c
> +++ b/hw/xen_disk.c
> @@ -52,6 +52,11 @@ static int max_requests = 32;
>  #define BLOCK_SIZE  512
>  #define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
>  
> +struct persistent_gnt {
> +    void *page;
> +    struct XenBlkDev *blkdev;
> +};
> +
>  struct ioreq {
>      blkif_request_t     req;
>      int16_t             status;
> @@ -69,6 +74,7 @@ struct ioreq {
>      int                 prot;
>      void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
>      void                *pages;
> +    int                 num_unmap;
>  
>      /* aio status */
>      int                 aio_inflight;
> @@ -105,6 +111,12 @@ struct XenBlkDev {
>      int                 requests_inflight;
>      int                 requests_finished;
>  
> +    /* Persistent grants extension */
> +    gboolean            feature_persistent;
> +    GTree               *persistent_gnts;
> +    unsigned int        persistent_gnt_c;

can you come up with a better name for this variable?


> +    unsigned int        max_grants;
> +
>      /* qemu block driver */
>      DriveInfo           *dinfo;
>      BlockDriverState    *bs;
> @@ -138,6 +150,29 @@ static void ioreq_reset(struct ioreq *ioreq)
>      qemu_iovec_reset(&ioreq->v);
>  }
>  
> +static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
> +{
> +    uint ua = GPOINTER_TO_UINT(a);
> +    uint ub = GPOINTER_TO_UINT(b);
> +    return (ua > ub) - (ua < ub);
> +}
> +
> +static void destroy_grant(gpointer pgnt)
> +{
> +    struct persistent_gnt *grant = pgnt;
> +    XenGnttab gnt = grant->blkdev->xendev.gnttabdev;
> +
> +    if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
> +        xen_be_printf(&grant->blkdev->xendev, 0,
> +                      "xc_gnttab_munmap failed: %s\n",
> +                      strerror(errno));
> +    }
> +    grant->blkdev->persistent_gnt_c--;
> +    xen_be_printf(&grant->blkdev->xendev, 3,
> +                  "unmapped grant %p\n", grant->page);
> +    g_free(grant);
> +}
> +
>  static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
>  {
>      struct ioreq *ioreq = NULL;
> @@ -266,21 +301,21 @@ static void ioreq_unmap(struct ioreq *ioreq)
>      XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
>      int i;
>  
> -    if (ioreq->v.niov == 0 || ioreq->mapped == 0) {
> +    if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
>          return;
>      }
>      if (batch_maps) {
>          if (!ioreq->pages) {
>              return;
>          }
> -        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
> +        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
>              xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
>                            strerror(errno));
>          }

Could you please add a comment saying that only the first num_unmap
pages are going to be unmapped here?
The others are going to be unmapped one by one by destroy_grant.


> -        ioreq->blkdev->cnt_map -= ioreq->v.niov;
> +        ioreq->blkdev->cnt_map -= ioreq->num_unmap;
>          ioreq->pages = NULL;
>      } else {
> -        for (i = 0; i < ioreq->v.niov; i++) {
> +        for (i = 0; i < ioreq->num_unmap; i++) {
>              if (!ioreq->page[i]) {
>                  continue;
>              }

I would be tempted to remove persistent grants support for the
!batch_maps case, if it is going to make the code simpler


> @@ -298,41 +333,107 @@ static void ioreq_unmap(struct ioreq *ioreq)
>  static int ioreq_map(struct ioreq *ioreq)
>  {
>      XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
> -    int i;
> +    uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +    uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +    void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +    int i, j, new_maps = 0;
> +    struct persistent_gnt *grant;
>  
>      if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
>          return 0;
>      }
> -    if (batch_maps) {
> +    if (ioreq->blkdev->feature_persistent) {
> +        for (i = 0; i < ioreq->v.niov; i++) {
> +            grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
> +                                    GUINT_TO_POINTER(ioreq->refs[i]));
> +
> +            if (grant != NULL) {
> +                page[i] = grant->page;
> +                xen_be_printf(&ioreq->blkdev->xendev, 3,
> +                              "using persistent-grant %" PRIu32 "\n",
> +                              ioreq->refs[i]);
> +            } else {
> +                    /* Add the grant to the list of grants that
> +                     * should be mapped
> +                     */
> +                    domids[new_maps] = ioreq->domids[i];
> +                    refs[new_maps] = ioreq->refs[i];
> +                    page[i] = NULL;
> +                    new_maps++;
> +            }
> +        }
> +        /* Set the protection to RW, since grants may be reused later
> +         * with a different protection than the one needed for this request
> +         */
> +        ioreq->prot = PROT_WRITE | PROT_READ;
> +    } else {
> +        /* All grants in the request should be mapped */
> +        memcpy(refs, ioreq->refs, sizeof(refs));
> +        memcpy(domids, ioreq->domids, sizeof(domids));
> +        memset(page, 0, sizeof(page));
> +        new_maps = ioreq->v.niov;
> +    }
> +
> +    if (batch_maps && new_maps) {
>          ioreq->pages = xc_gnttab_map_grant_refs
> -            (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
> +            (gnt, new_maps, domids, refs, ioreq->prot);
>          if (ioreq->pages == NULL) {
>              xen_be_printf(&ioreq->blkdev->xendev, 0,
>                            "can't map %d grant refs (%s, %d maps)\n",
> -                          ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
> +                          new_maps, strerror(errno), ioreq->blkdev->cnt_map);
>              return -1;
>          }
> -        for (i = 0; i < ioreq->v.niov; i++) {
> -            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
> -                (uintptr_t)ioreq->v.iov[i].iov_base;
> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
> +            if (page[i] == NULL)
> +                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
>          }

Code style.

Is it correct to assume that the refs to map are always the first set?
What if the first 10 refs are already mapped but 10-20 are not?
It seems to me that we are going to try to map the first 10 again.
Considering that this behaviour can be triggered by the frontend, it is
a potential security issue too.

This loop should be shared between the two cases batch_maps and !batch_maps.


> -        ioreq->blkdev->cnt_map += ioreq->v.niov;
> -    } else  {
> -        for (i = 0; i < ioreq->v.niov; i++) {
> +        ioreq->blkdev->cnt_map += new_maps;
> +    } else if (new_maps)  {
> +        for (i = 0; i < new_maps; i++) {
>              ioreq->page[i] = xc_gnttab_map_grant_ref
> -                (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
> +                (gnt, domids[i], refs[i], ioreq->prot);

same assumption

>              if (ioreq->page[i] == NULL) {
>                  xen_be_printf(&ioreq->blkdev->xendev, 0,
>                                "can't map grant ref %d (%s, %d maps)\n",
> -                              ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
> +                              refs[i], strerror(errno), ioreq->blkdev->cnt_map);
>                  ioreq_unmap(ioreq);
>                  return -1;
>              }
> -            ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
>              ioreq->blkdev->cnt_map++;
>          }
> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
> +            if (page[i] == NULL)
> +                page[i] = ioreq->page[j++];
> +        }

same comments about this loop

> +    }
> +    if (ioreq->blkdev->feature_persistent) {
> +        while((ioreq->blkdev->persistent_gnt_c < ioreq->blkdev->max_grants) &&
> +              new_maps) {
> +            /* Go through the list of newly mapped grants and add as many
> +             * as possible to the list of persistently mapped grants
> +             */
> +            grant = g_malloc0(sizeof(*grant));
> +            new_maps--;
> +            if (batch_maps)
> +                grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
> +            else
> +                grant->page = ioreq->page[new_maps];
> +            grant->blkdev = ioreq->blkdev;
> +            xen_be_printf(&ioreq->blkdev->xendev, 3,
> +                          "adding grant %" PRIu32 " page: %p\n",
> +                          refs[new_maps], grant->page);
> +            g_tree_insert(ioreq->blkdev->persistent_gnts,
> +                          GUINT_TO_POINTER(refs[new_maps]),
> +                          grant);
> +            ioreq->blkdev->persistent_gnt_c++;
> +        }
> +    }

Please run this patch through checkpatch.pl.

You are still exploiting the same assumption that only the first new_maps
pages are being mapped (page[new_maps] corresponds to refs[new_maps]).

I would like a comment saying that the first num_unmap pages of the
array are left out and need to be unmapped separately.

Wouldn't it be clearer if you just did this inside the previous loop
where you set page[i]?


> +    for (i = 0; i < ioreq->v.niov; i++) {
> +        ioreq->v.iov[i].iov_base = page[i] +
> +                                   (uintptr_t)ioreq->v.iov[i].iov_base;
>      }

I would also put this inside the same loop


>      ioreq->mapped = 1;
> +    ioreq->num_unmap = new_maps;
>      return 0;
>  }
>  
> @@ -690,6 +791,7 @@ static int blk_init(struct XenDevice *xendev)
>  
>      /* fill info */
>      xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1);
> +    xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1);
>      xenstore_write_be_int(&blkdev->xendev, "info",            info);
>      xenstore_write_be_int(&blkdev->xendev, "sector-size",     blkdev->file_blk);
>      xenstore_write_be_int(&blkdev->xendev, "sectors",
> @@ -713,6 +815,7 @@ out_error:
>  static int blk_connect(struct XenDevice *xendev)
>  {
>      struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
> +    int pers;
>  
>      if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
>          return -1;
> @@ -721,6 +824,11 @@ static int blk_connect(struct XenDevice *xendev)
>                               &blkdev->xendev.remote_port) == -1) {
>          return -1;
>      }
> +    if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) {
> +        blkdev->feature_persistent = FALSE;
> +    } else {
> +        blkdev->feature_persistent = !!pers;
> +    }
>  
>      blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
>      if (blkdev->xendev.protocol) {
> @@ -764,6 +872,15 @@ static int blk_connect(struct XenDevice *xendev)
>      }
>      }
>  
> +    if (blkdev->feature_persistent) {
> +        /* Init persistent grants */
> +        blkdev->max_grants = max_requests * BLKIF_MAX_SEGMENTS_PER_REQUEST;
> +        blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
> +                                             NULL, NULL,
> +                                             (GDestroyNotify)destroy_grant);
> +        blkdev->persistent_gnt_c = 0;
> +    }
> +
>      xen_be_bind_evtchn(&blkdev->xendev);
>  
>      xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
> @@ -804,6 +921,10 @@ static int blk_free(struct XenDevice *xendev)
>          blk_disconnect(xendev);
>      }
>  
> +    /* Free persistent grants */
> +    if (blkdev->feature_persistent)
> +        g_tree_destroy(blkdev->persistent_gnts);

code style

>      while (!QLIST_EMPTY(&blkdev->freelist)) {
>          ioreq = QLIST_FIRST(&blkdev->freelist);
>          QLIST_REMOVE(ioreq, list);
> -- 
> 1.7.7.5 (Apple Git-26)
> 
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend
  2013-01-04 16:42   ` Stefano Stabellini
@ 2013-01-04 17:37     ` Roger Pau Monné
  2013-01-04 18:02       ` Stefano Stabellini
  0 siblings, 1 reply; 14+ messages in thread
From: Roger Pau Monné @ 2013-01-04 17:37 UTC (permalink / raw)
  To: Stefano Stabellini
  Cc: Anthony Perard, Konrad Rzeszutek Wilk, qemu-devel@nongnu.org,
	xen-devel@lists.xen.org

On 04/01/13 17:42, Stefano Stabellini wrote:
> On Mon, 31 Dec 2012, Roger Pau Monne wrote:
>> This protocol extension reuses the same set of grant pages for all
>> transactions between the front/back drivers, avoiding expensive tlb
>> flushes, grant table lock contention and switches between userspace
>> and kernel space. The full description of the protocol can be found in
>> the public blkif.h header.
> 
> it would be great if you could add a link to a webpage too

Sure, thanks for the review.

> 
>> Speed improvement with 15 guests performing I/O is ~450%.
>>
>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>> Cc: xen-devel@lists.xen.org
>> Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
>> Cc: Anthony PERARD <anthony.perard@citrix.com>
>> ---
>> Performance comparison with the previous implementation can be seen in
>> the following graph:
>>
>> http://xenbits.xen.org/people/royger/persistent_read_qemu.png
> 
> this is what I like to see!
> 
> 
>>  hw/xen_disk.c |  155 ++++++++++++++++++++++++++++++++++++++++++++++++++------
>>  1 files changed, 138 insertions(+), 17 deletions(-)
>>
>> diff --git a/hw/xen_disk.c b/hw/xen_disk.c
>> index 1eb485a..bafeceb 100644
>> --- a/hw/xen_disk.c
>> +++ b/hw/xen_disk.c
>> @@ -52,6 +52,11 @@ static int max_requests = 32;
>>  #define BLOCK_SIZE  512
>>  #define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
>>
>> +struct persistent_gnt {
>> +    void *page;
>> +    struct XenBlkDev *blkdev;
>> +};
>> +
>>  struct ioreq {
>>      blkif_request_t     req;
>>      int16_t             status;
>> @@ -69,6 +74,7 @@ struct ioreq {
>>      int                 prot;
>>      void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
>>      void                *pages;
>> +    int                 num_unmap;
>>
>>      /* aio status */
>>      int                 aio_inflight;
>> @@ -105,6 +111,12 @@ struct XenBlkDev {
>>      int                 requests_inflight;
>>      int                 requests_finished;
>>
>> +    /* Persistent grants extension */
>> +    gboolean            feature_persistent;
>> +    GTree               *persistent_gnts;
>> +    unsigned int        persistent_gnt_c;
> 
> can you come up with a better name for this variable?

persistent_gnt_num or persistent_gnt_count?

> 
>> +    unsigned int        max_grants;
>> +
>>      /* qemu block driver */
>>      DriveInfo           *dinfo;
>>      BlockDriverState    *bs;
>> @@ -138,6 +150,29 @@ static void ioreq_reset(struct ioreq *ioreq)
>>      qemu_iovec_reset(&ioreq->v);
>>  }
>>
>> +static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
>> +{
>> +    uint ua = GPOINTER_TO_UINT(a);
>> +    uint ub = GPOINTER_TO_UINT(b);
>> +    return (ua > ub) - (ua < ub);
>> +}
>> +
>> +static void destroy_grant(gpointer pgnt)
>> +{
>> +    struct persistent_gnt *grant = pgnt;
>> +    XenGnttab gnt = grant->blkdev->xendev.gnttabdev;
>> +
>> +    if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
>> +        xen_be_printf(&grant->blkdev->xendev, 0,
>> +                      "xc_gnttab_munmap failed: %s\n",
>> +                      strerror(errno));
>> +    }
>> +    grant->blkdev->persistent_gnt_c--;
>> +    xen_be_printf(&grant->blkdev->xendev, 3,
>> +                  "unmapped grant %p\n", grant->page);
>> +    g_free(grant);
>> +}
>> +
>>  static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
>>  {
>>      struct ioreq *ioreq = NULL;
>> @@ -266,21 +301,21 @@ static void ioreq_unmap(struct ioreq *ioreq)
>>      XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
>>      int i;
>>
>> -    if (ioreq->v.niov == 0 || ioreq->mapped == 0) {
>> +    if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
>>          return;
>>      }
>>      if (batch_maps) {
>>          if (!ioreq->pages) {
>>              return;
>>          }
>> -        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
>> +        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
>>              xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
>>                            strerror(errno));
>>          }
> 
> Could you please add a comment saying that only the first num_unmap
> pages are going to be unmapped here?

Sure

> The others are going to be unmapped one by one by destroy_grant.
> 
> 
>> -        ioreq->blkdev->cnt_map -= ioreq->v.niov;
>> +        ioreq->blkdev->cnt_map -= ioreq->num_unmap;
>>          ioreq->pages = NULL;
>>      } else {
>> -        for (i = 0; i < ioreq->v.niov; i++) {
>> +        for (i = 0; i < ioreq->num_unmap; i++) {
>>              if (!ioreq->page[i]) {
>>                  continue;
>>              }
> 
> I would be tempted to remove persistent grants support for the
> !batch_maps case, if it is going to make the code simpler

It is probably going to make the code simpler.

> 
>> @@ -298,41 +333,107 @@ static void ioreq_unmap(struct ioreq *ioreq)
>>  static int ioreq_map(struct ioreq *ioreq)
>>  {
>>      XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
>> -    int i;
>> +    uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
>> +    uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
>> +    void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
>> +    int i, j, new_maps = 0;
>> +    struct persistent_gnt *grant;
>>
>>      if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
>>          return 0;
>>      }
>> -    if (batch_maps) {
>> +    if (ioreq->blkdev->feature_persistent) {
>> +        for (i = 0; i < ioreq->v.niov; i++) {
>> +            grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
>> +                                    GUINT_TO_POINTER(ioreq->refs[i]));
>> +
>> +            if (grant != NULL) {
>> +                page[i] = grant->page;
>> +                xen_be_printf(&ioreq->blkdev->xendev, 3,
>> +                              "using persistent-grant %" PRIu32 "\n",
>> +                              ioreq->refs[i]);
>> +            } else {
>> +                    /* Add the grant to the list of grants that
>> +                     * should be mapped
>> +                     */
>> +                    domids[new_maps] = ioreq->domids[i];
>> +                    refs[new_maps] = ioreq->refs[i];
>> +                    page[i] = NULL;
>> +                    new_maps++;
>> +            }
>> +        }
>> +        /* Set the protection to RW, since grants may be reused later
>> +         * with a different protection than the one needed for this request
>> +         */
>> +        ioreq->prot = PROT_WRITE | PROT_READ;
>> +    } else {
>> +        /* All grants in the request should be mapped */
>> +        memcpy(refs, ioreq->refs, sizeof(refs));
>> +        memcpy(domids, ioreq->domids, sizeof(domids));
>> +        memset(page, 0, sizeof(page));
>> +        new_maps = ioreq->v.niov;
>> +    }
>> +
>> +    if (batch_maps && new_maps) {
>>          ioreq->pages = xc_gnttab_map_grant_refs
>> -            (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
>> +            (gnt, new_maps, domids, refs, ioreq->prot);
>>          if (ioreq->pages == NULL) {
>>              xen_be_printf(&ioreq->blkdev->xendev, 0,
>>                            "can't map %d grant refs (%s, %d maps)\n",
>> -                          ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
>> +                          new_maps, strerror(errno), ioreq->blkdev->cnt_map);
>>              return -1;
>>          }
>> -        for (i = 0; i < ioreq->v.niov; i++) {
>> -            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
>> -                (uintptr_t)ioreq->v.iov[i].iov_base;
>> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
>> +            if (page[i] == NULL)
>> +                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
>>          }
> 
> Code style.
> 
> Is it correct to assume that the refs to map are always the first set?

"refs" only contains grant references that should be mapped. Note that
refs is not ioreq->refs; it is the subset of references in ioreq->refs
that are not yet mapped.

> What if the first 10 refs are already mapped but 10-20 are not?

"refs" should only contain unmapped references, and it's size is "new_maps"

> It seems to me that we are going to try to map the first 10 again.
> Considering that this behaviour can be triggered by the frontend, it is
> a potential security issue too.

I think I should add a comment to ioreq_map, because its logic can be a
bit tricky. The basic approach is that "refs" and "domids" should only
contain the information about the grants that should be mapped, and the
number of grants to map is "new_maps".

After looking up all the persistent grants, and mapping the ones we
still need to complete the request, "page" contains the addresses of the
memory pages where data should be read or written. ioreq->page or ioreq->pages
will contain the memory addresses of the pages that have been newly
mapped for this request.

Then, if we still have room to store more persistent grants, we will
start from the end of ioreq->page or ioreq->pages and make those granted
pages persistent, decreasing new_maps as we go.

So at the end new_maps contains the number of newly mapped grants that
could not be made persistent, and the first new_maps entries of
ioreq->page (or the first new_maps pages of ioreq->pages) are the ones
that still need to be unmapped.
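
To make this a bit more concrete, this is roughly the picture once
ioreq_map() returns (a summary of the above, not code taken from the
patch):

    /*
     * page[i] (i < niov)   address backing segment i, taken either from
     *                      the persistent_gnts tree or from the newly
     *                      mapped pages
     * refs[], domids[]     only the grants that were not in the tree and
     *                      had to be mapped for this request, in the
     *                      same order as their NULL slots in page[]
     * ioreq->pages /       the newly mapped pages; the ones at the end
     * ioreq->page[]        are promoted to persistent grants, so only
     *                      the first num_unmap remain to be unmapped by
     *                      ioreq_unmap()
     */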

> This loop should be shared between the two cases batch_maps and !batch_maps.
> 
> 
>> -        ioreq->blkdev->cnt_map += ioreq->v.niov;
>> -    } else  {
>> -        for (i = 0; i < ioreq->v.niov; i++) {
>> +        ioreq->blkdev->cnt_map += new_maps;
>> +    } else if (new_maps)  {
>> +        for (i = 0; i < new_maps; i++) {
>>              ioreq->page[i] = xc_gnttab_map_grant_ref
>> -                (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
>> +                (gnt, domids[i], refs[i], ioreq->prot);
> 
> same assumption
> 
>>              if (ioreq->page[i] == NULL) {
>>                  xen_be_printf(&ioreq->blkdev->xendev, 0,
>>                                "can't map grant ref %d (%s, %d maps)\n",
>> -                              ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
>> +                              refs[i], strerror(errno), ioreq->blkdev->cnt_map);
>>                  ioreq_unmap(ioreq);
>>                  return -1;
>>              }
>> -            ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
>>              ioreq->blkdev->cnt_map++;
>>          }
>> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
>> +            if (page[i] == NULL)
>> +                page[i] = ioreq->page[j++];
>> +        }
> 
> same comments about this loop

Here we fill in the blanks in the page array; those blanks correspond to
grants that are not (yet) persistently mapped, so we set them to the
pages that have just been mapped by the call to xc_gnttab_map_grant_ref
above.
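
In other words, the intent of that loop is roughly the following (the
same loop as in the patch, with braces and a comment added):

    for (i = 0, j = 0; i < ioreq->v.niov; i++) {
        if (page[i] == NULL) {
            /* Segment i was not found in persistent_gnts; give it the
             * next newly mapped page. The NULL slots appear in the
             * same order as the entries of refs[], so the j-th NULL
             * slot matches ioreq->page[j].
             */
            page[i] = ioreq->page[j++];
        }
    }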

>> +    }
>> +    if (ioreq->blkdev->feature_persistent) {
>> +        while((ioreq->blkdev->persistent_gnt_c < ioreq->blkdev->max_grants) &&
>> +              new_maps) {
>> +            /* Go through the list of newly mapped grants and add as many
>> +             * as possible to the list of persistently mapped grants
>> +             */
>> +            grant = g_malloc0(sizeof(*grant));
>> +            new_maps--;
>> +            if (batch_maps)
>> +                grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
>> +            else
>> +                grant->page = ioreq->page[new_maps];
>> +            grant->blkdev = ioreq->blkdev;
>> +            xen_be_printf(&ioreq->blkdev->xendev, 3,
>> +                          "adding grant %" PRIu32 " page: %p\n",
>> +                          refs[new_maps], grant->page);
>> +            g_tree_insert(ioreq->blkdev->persistent_gnts,
>> +                          GUINT_TO_POINTER(refs[new_maps]),
>> +                          grant);
>> +            ioreq->blkdev->persistent_gnt_c++;
>> +        }
>> +    }
> 
> Please run this patch through checkpatch.pl.
> 
> You are still exploiting the same assumption that only the first new_maps
> pages are being mapped (page[new_maps] corresponds to refs[new_maps]).

This is not true: page[new_maps] doesn't correspond to refs[new_maps]
(that correspondence only holds between ioreq->page[new_maps] and
refs[new_maps]). page[x] contains the page addresses of all the granted
references needed to complete the request, while refs[x] only contains
the grant references of the grants that have been newly mapped for this
request. For example, with three segments where only the second one is
already persistent, page[] has three entries but refs[] (and
ioreq->page[]) only has two, so the indices don't line up.

> 
> I would like a comment saying that the first num_unmap pages of the
> array are left out and need to be unmapped separately.
> 
> Wouldn't it be clearer if you just did this inside the previous loop
> where you set page[i]?
> 
> 
>> +    for (i = 0; i < ioreq->v.niov; i++) {
>> +        ioreq->v.iov[i].iov_base = page[i] +
>> +                                   (uintptr_t)ioreq->v.iov[i].iov_base;
>>      }
> 
> I would also put this inside the same loop
> 
> 
>>      ioreq->mapped = 1;
>> +    ioreq->num_unmap = new_maps;
>>      return 0;
>>  }
>>
>> @@ -690,6 +791,7 @@ static int blk_init(struct XenDevice *xendev)
>>
>>      /* fill info */
>>      xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1);
>> +    xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1);
>>      xenstore_write_be_int(&blkdev->xendev, "info",            info);
>>      xenstore_write_be_int(&blkdev->xendev, "sector-size",     blkdev->file_blk);
>>      xenstore_write_be_int(&blkdev->xendev, "sectors",
>> @@ -713,6 +815,7 @@ out_error:
>>  static int blk_connect(struct XenDevice *xendev)
>>  {
>>      struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
>> +    int pers;
>>
>>      if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
>>          return -1;
>> @@ -721,6 +824,11 @@ static int blk_connect(struct XenDevice *xendev)
>>                               &blkdev->xendev.remote_port) == -1) {
>>          return -1;
>>      }
>> +    if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) {
>> +        blkdev->feature_persistent = FALSE;
>> +    } else {
>> +        blkdev->feature_persistent = !!pers;
>> +    }
>>
>>      blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
>>      if (blkdev->xendev.protocol) {
>> @@ -764,6 +872,15 @@ static int blk_connect(struct XenDevice *xendev)
>>      }
>>      }
>>
>> +    if (blkdev->feature_persistent) {
>> +        /* Init persistent grants */
>> +        blkdev->max_grants = max_requests * BLKIF_MAX_SEGMENTS_PER_REQUEST;
>> +        blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
>> +                                             NULL, NULL,
>> +                                             (GDestroyNotify)destroy_grant);
>> +        blkdev->persistent_gnt_c = 0;
>> +    }
>> +
>>      xen_be_bind_evtchn(&blkdev->xendev);
>>
>>      xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
>> @@ -804,6 +921,10 @@ static int blk_free(struct XenDevice *xendev)
>>          blk_disconnect(xendev);
>>      }
>>
>> +    /* Free persistent grants */
>> +    if (blkdev->feature_persistent)
>> +        g_tree_destroy(blkdev->persistent_gnts);
> 
> code style
> 
>>      while (!QLIST_EMPTY(&blkdev->freelist)) {
>>          ioreq = QLIST_FIRST(&blkdev->freelist);
>>          QLIST_REMOVE(ioreq, list);
>> --
>> 1.7.7.5 (Apple Git-26)
>>
>>

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend
  2013-01-04 17:37     ` Roger Pau Monné
@ 2013-01-04 18:02       ` Stefano Stabellini
  0 siblings, 0 replies; 14+ messages in thread
From: Stefano Stabellini @ 2013-01-04 18:02 UTC (permalink / raw)
  To: Roger Pau Monne
  Cc: Anthony Perard, Konrad Rzeszutek Wilk, xen-devel@lists.xen.org,
	qemu-devel@nongnu.org, Stefano Stabellini

[-- Attachment #1: Type: text/plain, Size: 5979 bytes --]

On Fri, 4 Jan 2013, Roger Pau Monne wrote:
> On 04/01/13 17:42, Stefano Stabellini wrote:
> > On Mon, 31 Dec 2012, Roger Pau Monne wrote:
> >> This protocol extension reuses the same set of grant pages for all
> >> transactions between the front/back drivers, avoiding expensive tlb
> >> flushes, grant table lock contention and switches between userspace
> >> and kernel space. The full description of the protocol can be found in
> >> the public blkif.h header.
> >
> > it would be great if you could add a link to a webpage too
> 
> Sure, thanks for the review.
> 
> >
> >> Speed improvement with 15 guests performing I/O is ~450%.
> >>
> >> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> >> Cc: xen-devel@lists.xen.org
> >> Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
> >> Cc: Anthony PERARD <anthony.perard@citrix.com>
> >> ---
> >> Performance comparison with the previous implementation can be seen in
> >> the following graph:
> >>
> >> http://xenbits.xen.org/people/royger/persistent_read_qemu.png
> >
> > this is what I like to see!
> >
> >
> >>  hw/xen_disk.c |  155 ++++++++++++++++++++++++++++++++++++++++++++++++++------
> >>  1 files changed, 138 insertions(+), 17 deletions(-)
> >>
> >> diff --git a/hw/xen_disk.c b/hw/xen_disk.c
> >> index 1eb485a..bafeceb 100644
> >> --- a/hw/xen_disk.c
> >> +++ b/hw/xen_disk.c
> >> @@ -52,6 +52,11 @@ static int max_requests = 32;
> >>  #define BLOCK_SIZE  512
> >>  #define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
> >>
> >> +struct persistent_gnt {
> >> +    void *page;
> >> +    struct XenBlkDev *blkdev;
> >> +};
> >> +
> >>  struct ioreq {
> >>      blkif_request_t     req;
> >>      int16_t             status;
> >> @@ -69,6 +74,7 @@ struct ioreq {
> >>      int                 prot;
> >>      void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> >>      void                *pages;
> >> +    int                 num_unmap;
> >>
> >>      /* aio status */
> >>      int                 aio_inflight;
> >> @@ -105,6 +111,12 @@ struct XenBlkDev {
> >>      int                 requests_inflight;
> >>      int                 requests_finished;
> >>
> >> +    /* Persistent grants extension */
> >> +    gboolean            feature_persistent;
> >> +    GTree               *persistent_gnts;
> >> +    unsigned int        persistent_gnt_c;
> >
> > can you come up with a better name for this variable?
> 
> persistent_gnt_num or persistent_gnt_count?

persistent_gnt_count



> >> @@ -298,41 +333,107 @@ static void ioreq_unmap(struct ioreq *ioreq)
> >>  static int ioreq_map(struct ioreq *ioreq)
> >>  {
> >>      XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
> >> -    int i;
> >> +    uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> >> +    uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> >> +    void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> >> +    int i, j, new_maps = 0;
> >> +    struct persistent_gnt *grant;
> >>
> >>      if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
> >>          return 0;
> >>      }
> >> -    if (batch_maps) {
> >> +    if (ioreq->blkdev->feature_persistent) {
> >> +        for (i = 0; i < ioreq->v.niov; i++) {
> >> +            grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
> >> +                                    GUINT_TO_POINTER(ioreq->refs[i]));
> >> +
> >> +            if (grant != NULL) {
> >> +                page[i] = grant->page;
> >> +                xen_be_printf(&ioreq->blkdev->xendev, 3,
> >> +                              "using persistent-grant %" PRIu32 "\n",
> >> +                              ioreq->refs[i]);
> >> +            } else {
> >> +                    /* Add the grant to the list of grants that
> >> +                     * should be mapped
> >> +                     */
> >> +                    domids[new_maps] = ioreq->domids[i];
> >> +                    refs[new_maps] = ioreq->refs[i];
> >> +                    page[i] = NULL;
> >> +                    new_maps++;
> >> +            }
> >> +        }
> >> +        /* Set the protection to RW, since grants may be reused later
> >> +         * with a different protection than the one needed for this request
> >> +         */
> >> +        ioreq->prot = PROT_WRITE | PROT_READ;
> >> +    } else {
> >> +        /* All grants in the request should be mapped */
> >> +        memcpy(refs, ioreq->refs, sizeof(refs));
> >> +        memcpy(domids, ioreq->domids, sizeof(domids));
> >> +        memset(page, 0, sizeof(page));
> >> +        new_maps = ioreq->v.niov;
> >> +    }
> >> +
> >> +    if (batch_maps && new_maps) {
> >>          ioreq->pages = xc_gnttab_map_grant_refs
> >> -            (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
> >> +            (gnt, new_maps, domids, refs, ioreq->prot);
> >>          if (ioreq->pages == NULL) {
> >>              xen_be_printf(&ioreq->blkdev->xendev, 0,
> >>                            "can't map %d grant refs (%s, %d maps)\n",
> >> -                          ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
> >> +                          new_maps, strerror(errno), ioreq->blkdev->cnt_map);
> >>              return -1;
> >>          }
> >> -        for (i = 0; i < ioreq->v.niov; i++) {
> >> -            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
> >> -                (uintptr_t)ioreq->v.iov[i].iov_base;
> >> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
> >> +            if (page[i] == NULL)
> >> +                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
> >>          }
> >
> > Code style.
> >
> > Is it correct to assume that the refs to map are always the first set?
> 
> "refs" only contains grant references that should be mapped. Note that
> refs is not ioreq->refs; it is the subset of references in ioreq->refs
> that are not yet mapped.

Oh right, I missed that

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend
  2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend Roger Pau Monne
  2013-01-04 16:42   ` Stefano Stabellini
@ 2013-01-04 21:01   ` Blue Swirl
  1 sibling, 0 replies; 14+ messages in thread
From: Blue Swirl @ 2013-01-04 21:01 UTC (permalink / raw)
  To: Roger Pau Monne; +Cc: Anthony PERARD, Stefano Stabellini, qemu-devel, xen-devel

On Mon, Dec 31, 2012 at 12:16 PM, Roger Pau Monne <roger.pau@citrix.com> wrote:
> This protocol extension reuses the same set of grant pages for all
> transactions between the front/back drivers, avoiding expensive tlb
> flushes, grant table lock contention and switches between userspace
> and kernel space. The full description of the protocol can be found in
> the public blkif.h header.
>
> Speed improvement with 15 guests performing I/O is ~450%.
>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> Cc: xen-devel@lists.xen.org
> Cc: Stefano Stabellini <Stefano.Stabellini@eu.citrix.com>
> Cc: Anthony PERARD <anthony.perard@citrix.com>
> ---
> Performance comparison with the previous implementation can be seen in
> the following graph:
>
> http://xenbits.xen.org/people/royger/persistent_read_qemu.png
> ---
>  hw/xen_disk.c |  155 ++++++++++++++++++++++++++++++++++++++++++++++++++------
>  1 files changed, 138 insertions(+), 17 deletions(-)
>
> diff --git a/hw/xen_disk.c b/hw/xen_disk.c
> index 1eb485a..bafeceb 100644
> --- a/hw/xen_disk.c
> +++ b/hw/xen_disk.c
> @@ -52,6 +52,11 @@ static int max_requests = 32;
>  #define BLOCK_SIZE  512
>  #define IOCB_COUNT  (BLKIF_MAX_SEGMENTS_PER_REQUEST + 2)
>
> +struct persistent_gnt {

This should be PersistentGrant and don't forget to add a typedef.
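
I.e. something along these lines (just a sketch, using the usual QEMU
CamelCase-plus-typedef convention):

    typedef struct PersistentGrant {
        void *page;
        struct XenBlkDev *blkdev;
    } PersistentGrant;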

> +    void *page;
> +    struct XenBlkDev *blkdev;
> +};
> +
>  struct ioreq {
>      blkif_request_t     req;
>      int16_t             status;
> @@ -69,6 +74,7 @@ struct ioreq {
>      int                 prot;
>      void                *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
>      void                *pages;
> +    int                 num_unmap;
>
>      /* aio status */
>      int                 aio_inflight;
> @@ -105,6 +111,12 @@ struct XenBlkDev {
>      int                 requests_inflight;
>      int                 requests_finished;
>
> +    /* Persistent grants extension */
> +    gboolean            feature_persistent;
> +    GTree               *persistent_gnts;
> +    unsigned int        persistent_gnt_c;
> +    unsigned int        max_grants;
> +
>      /* qemu block driver */
>      DriveInfo           *dinfo;
>      BlockDriverState    *bs;
> @@ -138,6 +150,29 @@ static void ioreq_reset(struct ioreq *ioreq)
>      qemu_iovec_reset(&ioreq->v);
>  }
>
> +static gint int_cmp(gconstpointer a, gconstpointer b, gpointer user_data)
> +{
> +    uint ua = GPOINTER_TO_UINT(a);
> +    uint ub = GPOINTER_TO_UINT(b);
> +    return (ua > ub) - (ua < ub);
> +}
> +
> +static void destroy_grant(gpointer pgnt)
> +{
> +    struct persistent_gnt *grant = pgnt;
> +    XenGnttab gnt = grant->blkdev->xendev.gnttabdev;
> +
> +    if (xc_gnttab_munmap(gnt, grant->page, 1) != 0) {
> +        xen_be_printf(&grant->blkdev->xendev, 0,
> +                      "xc_gnttab_munmap failed: %s\n",
> +                      strerror(errno));
> +    }
> +    grant->blkdev->persistent_gnt_c--;
> +    xen_be_printf(&grant->blkdev->xendev, 3,
> +                  "unmapped grant %p\n", grant->page);
> +    g_free(grant);
> +}
> +
>  static struct ioreq *ioreq_start(struct XenBlkDev *blkdev)
>  {
>      struct ioreq *ioreq = NULL;
> @@ -266,21 +301,21 @@ static void ioreq_unmap(struct ioreq *ioreq)
>      XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
>      int i;
>
> -    if (ioreq->v.niov == 0 || ioreq->mapped == 0) {
> +    if (ioreq->num_unmap == 0 || ioreq->mapped == 0) {
>          return;
>      }
>      if (batch_maps) {
>          if (!ioreq->pages) {
>              return;
>          }
> -        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->v.niov) != 0) {
> +        if (xc_gnttab_munmap(gnt, ioreq->pages, ioreq->num_unmap) != 0) {
>              xen_be_printf(&ioreq->blkdev->xendev, 0, "xc_gnttab_munmap failed: %s\n",
>                            strerror(errno));
>          }
> -        ioreq->blkdev->cnt_map -= ioreq->v.niov;
> +        ioreq->blkdev->cnt_map -= ioreq->num_unmap;
>          ioreq->pages = NULL;
>      } else {
> -        for (i = 0; i < ioreq->v.niov; i++) {
> +        for (i = 0; i < ioreq->num_unmap; i++) {
>              if (!ioreq->page[i]) {
>                  continue;
>              }
> @@ -298,41 +333,107 @@ static void ioreq_unmap(struct ioreq *ioreq)
>  static int ioreq_map(struct ioreq *ioreq)
>  {
>      XenGnttab gnt = ioreq->blkdev->xendev.gnttabdev;
> -    int i;
> +    uint32_t domids[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +    uint32_t refs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +    void *page[BLKIF_MAX_SEGMENTS_PER_REQUEST];
> +    int i, j, new_maps = 0;
> +    struct persistent_gnt *grant;
>
>      if (ioreq->v.niov == 0 || ioreq->mapped == 1) {
>          return 0;
>      }
> -    if (batch_maps) {
> +    if (ioreq->blkdev->feature_persistent) {
> +        for (i = 0; i < ioreq->v.niov; i++) {
> +            grant = g_tree_lookup(ioreq->blkdev->persistent_gnts,
> +                                    GUINT_TO_POINTER(ioreq->refs[i]));
> +
> +            if (grant != NULL) {
> +                page[i] = grant->page;
> +                xen_be_printf(&ioreq->blkdev->xendev, 3,
> +                              "using persistent-grant %" PRIu32 "\n",
> +                              ioreq->refs[i]);
> +            } else {
> +                    /* Add the grant to the list of grants that
> +                     * should be mapped
> +                     */
> +                    domids[new_maps] = ioreq->domids[i];
> +                    refs[new_maps] = ioreq->refs[i];
> +                    page[i] = NULL;
> +                    new_maps++;
> +            }
> +        }
> +        /* Set the protection to RW, since grants may be reused later
> +         * with a different protection than the one needed for this request
> +         */
> +        ioreq->prot = PROT_WRITE | PROT_READ;
> +    } else {
> +        /* All grants in the request should be mapped */
> +        memcpy(refs, ioreq->refs, sizeof(refs));
> +        memcpy(domids, ioreq->domids, sizeof(domids));
> +        memset(page, 0, sizeof(page));
> +        new_maps = ioreq->v.niov;
> +    }
> +
> +    if (batch_maps && new_maps) {
>          ioreq->pages = xc_gnttab_map_grant_refs
> -            (gnt, ioreq->v.niov, ioreq->domids, ioreq->refs, ioreq->prot);
> +            (gnt, new_maps, domids, refs, ioreq->prot);
>          if (ioreq->pages == NULL) {
>              xen_be_printf(&ioreq->blkdev->xendev, 0,
>                            "can't map %d grant refs (%s, %d maps)\n",
> -                          ioreq->v.niov, strerror(errno), ioreq->blkdev->cnt_map);
> +                          new_maps, strerror(errno), ioreq->blkdev->cnt_map);
>              return -1;
>          }
> -        for (i = 0; i < ioreq->v.niov; i++) {
> -            ioreq->v.iov[i].iov_base = ioreq->pages + i * XC_PAGE_SIZE +
> -                (uintptr_t)ioreq->v.iov[i].iov_base;
> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
> +            if (page[i] == NULL)
> +                page[i] = ioreq->pages + (j++) * XC_PAGE_SIZE;
>          }
> -        ioreq->blkdev->cnt_map += ioreq->v.niov;
> -    } else  {
> -        for (i = 0; i < ioreq->v.niov; i++) {
> +        ioreq->blkdev->cnt_map += new_maps;
> +    } else if (new_maps)  {
> +        for (i = 0; i < new_maps; i++) {
>              ioreq->page[i] = xc_gnttab_map_grant_ref
> -                (gnt, ioreq->domids[i], ioreq->refs[i], ioreq->prot);
> +                (gnt, domids[i], refs[i], ioreq->prot);
>              if (ioreq->page[i] == NULL) {
>                  xen_be_printf(&ioreq->blkdev->xendev, 0,
>                                "can't map grant ref %d (%s, %d maps)\n",
> -                              ioreq->refs[i], strerror(errno), ioreq->blkdev->cnt_map);
> +                              refs[i], strerror(errno), ioreq->blkdev->cnt_map);
>                  ioreq_unmap(ioreq);
>                  return -1;
>              }
> -            ioreq->v.iov[i].iov_base = ioreq->page[i] + (uintptr_t)ioreq->v.iov[i].iov_base;
>              ioreq->blkdev->cnt_map++;
>          }
> +        for (i = 0, j = 0; i < ioreq->v.niov; i++) {
> +            if (page[i] == NULL)
> +                page[i] = ioreq->page[j++];
> +        }
> +    }
> +    if (ioreq->blkdev->feature_persistent) {
> +        while((ioreq->blkdev->persistent_gnt_c < ioreq->blkdev->max_grants) &&
> +              new_maps) {
> +            /* Go through the list of newly mapped grants and add as many
> +             * as possible to the list of persistently mapped grants
> +             */
> +            grant = g_malloc0(sizeof(*grant));
> +            new_maps--;
> +            if (batch_maps)
> +                grant->page = ioreq->pages + (new_maps) * XC_PAGE_SIZE;
> +            else
> +                grant->page = ioreq->page[new_maps];
> +            grant->blkdev = ioreq->blkdev;
> +            xen_be_printf(&ioreq->blkdev->xendev, 3,
> +                          "adding grant %" PRIu32 " page: %p\n",
> +                          refs[new_maps], grant->page);
> +            g_tree_insert(ioreq->blkdev->persistent_gnts,
> +                          GUINT_TO_POINTER(refs[new_maps]),
> +                          grant);
> +            ioreq->blkdev->persistent_gnt_c++;
> +        }
> +    }
> +    for (i = 0; i < ioreq->v.niov; i++) {
> +        ioreq->v.iov[i].iov_base = page[i] +
> +                                   (uintptr_t)ioreq->v.iov[i].iov_base;
>      }
>      ioreq->mapped = 1;
> +    ioreq->num_unmap = new_maps;
>      return 0;
>  }
>
> @@ -690,6 +791,7 @@ static int blk_init(struct XenDevice *xendev)
>
>      /* fill info */
>      xenstore_write_be_int(&blkdev->xendev, "feature-barrier", 1);
> +    xenstore_write_be_int(&blkdev->xendev, "feature-persistent", 1);
>      xenstore_write_be_int(&blkdev->xendev, "info",            info);
>      xenstore_write_be_int(&blkdev->xendev, "sector-size",     blkdev->file_blk);
>      xenstore_write_be_int(&blkdev->xendev, "sectors",
> @@ -713,6 +815,7 @@ out_error:
>  static int blk_connect(struct XenDevice *xendev)
>  {
>      struct XenBlkDev *blkdev = container_of(xendev, struct XenBlkDev, xendev);
> +    int pers;
>
>      if (xenstore_read_fe_int(&blkdev->xendev, "ring-ref", &blkdev->ring_ref) == -1) {
>          return -1;
> @@ -721,6 +824,11 @@ static int blk_connect(struct XenDevice *xendev)
>                               &blkdev->xendev.remote_port) == -1) {
>          return -1;
>      }
> +    if (xenstore_read_fe_int(&blkdev->xendev, "feature-persistent", &pers)) {
> +        blkdev->feature_persistent = FALSE;
> +    } else {
> +        blkdev->feature_persistent = !!pers;
> +    }
>
>      blkdev->protocol = BLKIF_PROTOCOL_NATIVE;
>      if (blkdev->xendev.protocol) {
> @@ -764,6 +872,15 @@ static int blk_connect(struct XenDevice *xendev)
>      }
>      }
>
> +    if (blkdev->feature_persistent) {
> +        /* Init persistent grants */
> +        blkdev->max_grants = max_requests * BLKIF_MAX_SEGMENTS_PER_REQUEST;
> +        blkdev->persistent_gnts = g_tree_new_full((GCompareDataFunc)int_cmp,
> +                                             NULL, NULL,
> +                                             (GDestroyNotify)destroy_grant);
> +        blkdev->persistent_gnt_c = 0;
> +    }
> +
>      xen_be_bind_evtchn(&blkdev->xendev);
>
>      xen_be_printf(&blkdev->xendev, 1, "ok: proto %s, ring-ref %d, "
> @@ -804,6 +921,10 @@ static int blk_free(struct XenDevice *xendev)
>          blk_disconnect(xendev);
>      }
>
> +    /* Free persistent grants */
> +    if (blkdev->feature_persistent)
> +        g_tree_destroy(blkdev->persistent_gnts);
> +
>      while (!QLIST_EMPTY(&blkdev->freelist)) {
>          ioreq = QLIST_FIRST(&blkdev->freelist);
>          QLIST_REMOVE(ioreq, list);
> --
> 1.7.7.5 (Apple Git-26)
>
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2013-01-04 21:02 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-12-31 12:16 [Qemu-devel] [PATCH RFC 0/3] xen pv disk persistent grants implementation Roger Pau Monne
2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 1/3] xen_disk: handle disk files on ramfs/tmpfs Roger Pau Monne
2013-01-03 14:21   ` [Qemu-devel] [Xen-devel] " Konrad Rzeszutek Wilk
2013-01-03 14:28   ` Ian Campbell
2013-01-04 14:54     ` Stefano Stabellini
2013-01-04 15:05       ` Roger Pau Monné
2013-01-04 15:30         ` Stefano Stabellini
2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 2/3] xen_disk: fix memory leak Roger Pau Monne
2013-01-04 15:06   ` Stefano Stabellini
2012-12-31 12:16 ` [Qemu-devel] [PATCH RFC 3/3] xen_disk: add persistent grant support to xen_disk backend Roger Pau Monne
2013-01-04 16:42   ` Stefano Stabellini
2013-01-04 17:37     ` Roger Pau Monné
2013-01-04 18:02       ` Stefano Stabellini
2013-01-04 21:01   ` Blue Swirl
