[Qemu-devel] [RFC PATCH 0/3] virtio-blk: add multiread support

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] [RFC PATCH 0/3] virtio-blk: add multiread support
@ 2014-12-02 14:33 Peter Lieven
  2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 1/3] block: add accounting for merged requests Peter Lieven
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Peter Lieven @ 2014-12-02 14:33 UTC (permalink / raw)
  To: qemu-devel
  Cc: kwolf, famz, benoit, ming.lei, Peter Lieven, armbru, mreitz,
	stefanha, pbonzini

this series adds the long missing multiread support to virtio-blk.

some remarks:
 - i introduced rd_merged and wr_merged block accounting stats to
   blockstats as a generic interface which can be set from any
   driver that will introduce multirequst merging in the future.
 - the knob to disable request merging is not yet there. I would
   add it to the device properties also as a generic interface
   to have the same switch on for any driver that might introduce
   request merging in the future
 - there is cleanup and iotest adjustion missing.

Peter Lieven (3):
  block: add accounting for merged requests
  hw/virtio-blk: add a constant for max number of merged requests
  virtio-blk: introduce multiread

 block.c                         |    2 +
 block/accounting.c              |    7 ++
 block/qapi.c                    |    2 +
 hmp.c                           |    6 +-
 hw/block/dataplane/virtio-blk.c |   10 +-
 hw/block/virtio-blk.c           |  222 ++++++++++++++++++++++++---------------
 include/block/accounting.h      |    3 +
 include/hw/virtio/virtio-blk.h  |   21 ++--
 qapi/block-core.json            |   10 +-
 9 files changed, 184 insertions(+), 99 deletions(-)

-- 
1.7.9.5

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Qemu-devel] [RFC PATCH 1/3] block: add accounting for merged requests
  2014-12-02 14:33 [Qemu-devel] [RFC PATCH 0/3] virtio-blk: add multiread support Peter Lieven
@ 2014-12-02 14:33 ` Peter Lieven
  2014-12-03 17:39   ` Eric Blake
  2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 2/3] hw/virtio-blk: add a constant for max number of " Peter Lieven
  2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread Peter Lieven
  2 siblings, 1 reply; 11+ messages in thread
From: Peter Lieven @ 2014-12-02 14:33 UTC (permalink / raw)
  To: qemu-devel
  Cc: kwolf, famz, benoit, ming.lei, Peter Lieven, armbru, mreitz,
	stefanha, pbonzini

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 block.c                    |    2 ++
 block/accounting.c         |    7 +++++++
 block/qapi.c               |    2 ++
 hmp.c                      |    6 +++++-
 include/block/accounting.h |    3 +++
 qapi/block-core.json       |   10 +++++++++-
 6 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index a612594..75450e3 100644
--- a/block.c
+++ b/block.c
@@ -4508,6 +4508,8 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs,
         }
     }
 
+    block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1);
+
     return outidx + 1;
 }
 
diff --git a/block/accounting.c b/block/accounting.c
index edbb1cc..d1afcd9 100644
--- a/block/accounting.c
+++ b/block/accounting.c
@@ -52,3 +52,10 @@ void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num,
         stats->wr_highest_sector = sector_num + nb_sectors - 1;
     }
 }
+
+void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
+                      int num_requests)
+{
+    assert(type < BLOCK_MAX_IOTYPE);
+    stats->merged[type] += num_requests;
+}
diff --git a/block/qapi.c b/block/qapi.c
index a87a34a..ec28240 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -316,6 +316,8 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs)
     s->stats->wr_bytes = bs->stats.nr_bytes[BLOCK_ACCT_WRITE];
     s->stats->rd_operations = bs->stats.nr_ops[BLOCK_ACCT_READ];
     s->stats->wr_operations = bs->stats.nr_ops[BLOCK_ACCT_WRITE];
+    s->stats->rd_merged = bs->stats.merged[BLOCK_ACCT_READ];
+    s->stats->wr_merged = bs->stats.merged[BLOCK_ACCT_WRITE];
     s->stats->wr_highest_offset =
         bs->stats.wr_highest_sector * BDRV_SECTOR_SIZE;
     s->stats->flush_operations = bs->stats.nr_ops[BLOCK_ACCT_FLUSH];
diff --git a/hmp.c b/hmp.c
index 63d7686..baaa9e5 100644
--- a/hmp.c
+++ b/hmp.c
@@ -419,6 +419,8 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict)
                        " wr_total_time_ns=%" PRId64
                        " rd_total_time_ns=%" PRId64
                        " flush_total_time_ns=%" PRId64
+                       " rd_merged=%" PRId64
+                       " wr_merged=%" PRId64
                        "\n",
                        stats->value->stats->rd_bytes,
                        stats->value->stats->wr_bytes,
@@ -427,7 +429,9 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict)
                        stats->value->stats->flush_operations,
                        stats->value->stats->wr_total_time_ns,
                        stats->value->stats->rd_total_time_ns,
-                       stats->value->stats->flush_total_time_ns);
+                       stats->value->stats->flush_total_time_ns,
+                       stats->value->stats->rd_merged,
+                       stats->value->stats->wr_merged);
     }
 
     qapi_free_BlockStatsList(stats_list);
diff --git a/include/block/accounting.h b/include/block/accounting.h
index 50b42b3..4c406cf 100644
--- a/include/block/accounting.h
+++ b/include/block/accounting.h
@@ -39,6 +39,7 @@ typedef struct BlockAcctStats {
     uint64_t nr_bytes[BLOCK_MAX_IOTYPE];
     uint64_t nr_ops[BLOCK_MAX_IOTYPE];
     uint64_t total_time_ns[BLOCK_MAX_IOTYPE];
+    uint64_t merged[BLOCK_MAX_IOTYPE];
     uint64_t wr_highest_sector;
 } BlockAcctStats;
 
@@ -53,5 +54,7 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
 void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie);
 void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num,
                                unsigned int nb_sectors);
+void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
+                           int num_requests);
 
 #endif
diff --git a/qapi/block-core.json b/qapi/block-core.json
index a14e6ab..b64ffb6 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -389,13 +389,21 @@
 #                     growable sparse files (like qcow2) that are used on top
 #                     of a physical device.
 #
+# @rd_merged: Number of read requests that have been merged into another request
+#             (Since 2.3).
+#
+# @wr_merged: Number of write requests that have been merged into another request
+#             (Since 2.3).
+#
+#
 # Since: 0.14.0
 ##
 { 'type': 'BlockDeviceStats',
   'data': {'rd_bytes': 'int', 'wr_bytes': 'int', 'rd_operations': 'int',
            'wr_operations': 'int', 'flush_operations': 'int',
            'flush_total_time_ns': 'int', 'wr_total_time_ns': 'int',
-           'rd_total_time_ns': 'int', 'wr_highest_offset': 'int' } }
+           'rd_total_time_ns': 'int', 'wr_highest_offset': 'int',
+           'rd_merged': 'int', 'wr_merged': 'int' } }
 
 ##
 # @BlockStats:
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 1/3] block: add accounting for merged requests
  2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 1/3] block: add accounting for merged requests Peter Lieven
@ 2014-12-03 17:39   ` Eric Blake
  0 siblings, 0 replies; 11+ messages in thread
From: Eric Blake @ 2014-12-03 17:39 UTC (permalink / raw)
  To: Peter Lieven, qemu-devel
  Cc: kwolf, famz, benoit, ming.lei, armbru, mreitz, stefanha, pbonzini

[-- Attachment #1: Type: text/plain, Size: 827 bytes --]

On 12/02/2014 07:33 AM, Peter Lieven wrote:
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  block.c                    |    2 ++
>  block/accounting.c         |    7 +++++++
>  block/qapi.c               |    2 ++
>  hmp.c                      |    6 +++++-
>  include/block/accounting.h |    3 +++
>  qapi/block-core.json       |   10 +++++++++-
>  6 files changed, 28 insertions(+), 2 deletions(-)

Incomplete.  qmp-commands.hx includes a query-blockstats example that
should be updated to show the output of the new non-optional fields.

> +# @wr_merged: Number of write requests that have been merged into another request

Longer than 80 columns; please wrap.

Otherwise looks good to me.

-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org


[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 539 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Qemu-devel] [RFC PATCH 2/3] hw/virtio-blk: add a constant for max number of merged requests
  2014-12-02 14:33 [Qemu-devel] [RFC PATCH 0/3] virtio-blk: add multiread support Peter Lieven
  2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 1/3] block: add accounting for merged requests Peter Lieven
@ 2014-12-02 14:33 ` Peter Lieven
  2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread Peter Lieven
  2 siblings, 0 replies; 11+ messages in thread
From: Peter Lieven @ 2014-12-02 14:33 UTC (permalink / raw)
  To: qemu-devel
  Cc: kwolf, famz, benoit, ming.lei, Peter Lieven, armbru, mreitz,
	stefanha, pbonzini

As it was not obvious (at least for me) where the 32 comes from;
add a constant for it.

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 hw/block/virtio-blk.c          |    2 +-
 include/hw/virtio/virtio-blk.h |    4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index b19b102..490f961 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -326,7 +326,7 @@ static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
                      BLOCK_ACCT_WRITE);
 
-    if (mrb->num_writes == 32) {
+    if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) {
         virtio_submit_multiwrite(req->dev->blk, mrb);
     }
 
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 3979dc4..3f2652f 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -134,8 +134,10 @@ typedef struct VirtIOBlock {
     struct VirtIOBlockDataPlane *dataplane;
 } VirtIOBlock;
 
+#define VIRTIO_BLK_MAX_MERGE_REQS 32
+
 typedef struct MultiReqBuffer {
-    BlockRequest        blkreq[32];
+    BlockRequest        blkreq[VIRTIO_BLK_MAX_MERGE_REQS];
     unsigned int        num_writes;
 } MultiReqBuffer;
 
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread
  2014-12-02 14:33 [Qemu-devel] [RFC PATCH 0/3] virtio-blk: add multiread support Peter Lieven
  2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 1/3] block: add accounting for merged requests Peter Lieven
  2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 2/3] hw/virtio-blk: add a constant for max number of " Peter Lieven
@ 2014-12-02 14:33 ` Peter Lieven
  2014-12-03 17:25   ` Max Reitz
  2014-12-04 14:12   ` Kevin Wolf
  2 siblings, 2 replies; 11+ messages in thread
From: Peter Lieven @ 2014-12-02 14:33 UTC (permalink / raw)
  To: qemu-devel
  Cc: kwolf, famz, benoit, ming.lei, Peter Lieven, armbru, mreitz,
	stefanha, pbonzini

this patch finally introduce multiread support to virtio-blk while
multiwrite support was there for a long time read support was missing.

To achieve this the patch does serveral things which might need futher
explaination:

 - the whole merge and multireq logic is moved from block.c into
   virtio-blk. This is move is a preparation for directly creating a
   coroutine out of virtio-blk.

 - requests are only merged if they are strictly sequential and no
   longer sorted. This simplification decreases overhead and reduces
   latency. It will also merge some requests which were unmergable before.

   The old algorithm took up to 32 requests sorted them and tried to merge
   them. The outcome was anything between 1 and 32 requests. In case of
   32 requests there were 31 requests unnecessarily delayed.

   On the other hand lets imagine e.g. 16 unmergeable requests followed
   by 32 mergable requests. The latter 32 requests would have been split
   into two 16 byte requests.

   Last the simplified logic allows for a fast path if we have only a
   single request in the multirequest. In this case the request is sent as
   ordinary request without mulltireq callbacks.

As a first benchmark I installed Ubuntu 14.04.1 on a ramdisk. The number of
merged requests is in the same order while the latency is slightly decreased.
One should not stick to much to the numbers because the number of wr_requests
are highly fluctuant. I hope the numbers show that this patch is at least
not causing to big harm:

Cmdline:
qemu-system-x86_64 -m 1024 -smp 2 -enable-kvm -cdrom ubuntu-14.04.1-server-amd64.iso \
 -drive if=virtio,file=/tmp/ubuntu.raw -monitor stdio

Before:
virtio0: rd_bytes=150979072 wr_bytes=2591138816 rd_operations=18475 wr_operations=69216
         flush_operations=15343 wr_total_time_ns=26969283701 rd_total_time_ns=1018449432
         flush_total_time_ns=562596819 rd_merged=0 wr_merged=10453

After:
virtio0: rd_bytes=148112896 wr_bytes=2845834240 rd_operations=17535 wr_operations=72197
         flush_operations=15760 wr_total_time_ns=26104971623 rd_total_time_ns=1012926283
         flush_total_time_ns=564414752 rd_merged=517 wr_merged=9859

Some first numbers of improved read performance while booting:

The Ubuntu 14.04.1 vServer from above:
virtio0: rd_bytes=86158336 wr_bytes=688128 rd_operations=5851 wr_operations=70
         flush_operations=4 wr_total_time_ns=10886705 rd_total_time_ns=416688595
         flush_total_time_ns=288776 rd_merged=1297 wr_merged=2

Windows 2012R2:
virtio0: rd_bytes=176559104 wr_bytes=61859840 rd_operations=7200 wr_operations=360
         flush_operations=68 wr_total_time_ns=34344992718 rd_total_time_ns=134386844669
         flush_total_time_ns=18115517 rd_merged=641 wr_merged=216

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 hw/block/dataplane/virtio-blk.c |   10 +-
 hw/block/virtio-blk.c           |  222 ++++++++++++++++++++++++---------------
 include/hw/virtio/virtio-blk.h  |   23 ++--
 3 files changed, 156 insertions(+), 99 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index 1222a37..aa4ad91 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -96,9 +96,8 @@ static void handle_notify(EventNotifier *e)
     event_notifier_test_and_clear(&s->host_notifier);
     blk_io_plug(s->conf->conf.blk);
     for (;;) {
-        MultiReqBuffer mrb = {
-            .num_writes = 0,
-        };
+        MultiReqBuffer mrb_rd = {};
+        MultiReqBuffer mrb_wr = {.is_write = 1};
         int ret;
 
         /* Disable guest->host notifies to avoid unnecessary vmexits */
@@ -117,10 +116,11 @@ static void handle_notify(EventNotifier *e)
                                                         req->elem.in_num,
                                                         req->elem.index);
 
-            virtio_blk_handle_request(req, &mrb);
+            virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
         }
 
-        virtio_submit_multiwrite(s->conf->conf.blk, &mrb);
+        virtio_submit_multireq(s->conf->conf.blk, &mrb_wr);
+        virtio_submit_multireq(s->conf->conf.blk, &mrb_rd);
 
         if (likely(ret == -EAGAIN)) { /* vring emptied */
             /* Re-enable guest->host notifies and stop processing the vring.
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 490f961..f522bfc 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -22,12 +22,15 @@
 #include "dataplane/virtio-blk.h"
 #include "migration/migration.h"
 #include "block/scsi.h"
+#include "block/block_int.h"
 #ifdef __linux__
 # include <scsi/sg.h>
 #endif
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/virtio-access.h"
 
+/* #define DEBUG_MULTIREQ */
+
 VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
 {
     VirtIOBlockReq *req = g_slice_new(VirtIOBlockReq);
@@ -88,6 +91,11 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
 
     trace_virtio_blk_rw_complete(req, ret);
 
+#ifdef DEBUG_MULTIREQ
+    printf("virtio_blk_rw_complete p %p ret %d\n",
+           req, ret);
+#endif
+
     if (ret) {
         int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
         bool is_read = !(p & VIRTIO_BLK_T_OUT);
@@ -257,24 +265,63 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
     virtio_blk_free_request(req);
 }
 
-void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb)
+static void virtio_multireq_cb(void *opaque, int ret)
+{
+    MultiReqBuffer *mrb = opaque;
+    int i;
+#ifdef DEBUG_MULTIREQ
+    printf("virtio_multireq_cb: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
+           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
+#endif
+    for (i = 0; i < mrb->num_reqs; i++) {
+        virtio_blk_rw_complete(mrb->reqs[i], ret);
+    }
+
+    qemu_iovec_destroy(&mrb->qiov);
+    g_free(mrb);
+}
+
+void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb0)
 {
-    int i, ret;
+    MultiReqBuffer *mrb = NULL;
 
-    if (!mrb->num_writes) {
+    if (!mrb0->num_reqs) {
         return;
     }
 
-    ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes);
-    if (ret != 0) {
-        for (i = 0; i < mrb->num_writes; i++) {
-            if (mrb->blkreq[i].error) {
-                virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
-            }
+    if (mrb0->num_reqs == 1) {
+        if (mrb0->is_write) {
+            blk_aio_writev(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
+                           virtio_blk_rw_complete, mrb0->reqs[0]);
+        } else {
+            blk_aio_readv(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
+                          virtio_blk_rw_complete, mrb0->reqs[0]);
         }
+        qemu_iovec_destroy(&mrb0->qiov);
+        mrb0->num_reqs = 0;
+        return;
+    }
+
+    mrb = g_malloc(sizeof(MultiReqBuffer));
+    memcpy(mrb, mrb0, sizeof(MultiReqBuffer));
+    mrb0->num_reqs = 0;
+
+#ifdef DEBUG_MULTIREQ
+    printf("virtio_submit_multireq: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
+           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
+#endif
+
+    if (mrb->is_write) {
+        blk_aio_writev(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
+                       virtio_multireq_cb, mrb);
+    } else {
+        blk_aio_readv(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
+                      virtio_multireq_cb, mrb);
     }
 
-    mrb->num_writes = 0;
+    block_acct_merge_done(blk_get_stats(blk),
+                          mrb->is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
+                          mrb->num_reqs - 1);
 }
 
 static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
@@ -283,9 +330,9 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
                      BLOCK_ACCT_FLUSH);
 
     /*
-     * Make sure all outstanding writes are posted to the backing device.
+     * Make sure all outstanding requests are posted to the backing device.
      */
-    virtio_submit_multiwrite(req->dev->blk, mrb);
+    virtio_submit_multireq(req->dev->blk, mrb);
     blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
 }
 
@@ -308,61 +355,8 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
     return true;
 }
 
-static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
-{
-    BlockRequest *blkreq;
-    uint64_t sector;
-
-    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
-
-    trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
-
-    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
-        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-        virtio_blk_free_request(req);
-        return;
-    }
-
-    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
-                     BLOCK_ACCT_WRITE);
-
-    if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) {
-        virtio_submit_multiwrite(req->dev->blk, mrb);
-    }
-
-    blkreq = &mrb->blkreq[mrb->num_writes];
-    blkreq->sector = sector;
-    blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
-    blkreq->qiov = &req->qiov;
-    blkreq->cb = virtio_blk_rw_complete;
-    blkreq->opaque = req;
-    blkreq->error = 0;
-
-    mrb->num_writes++;
-}
-
-static void virtio_blk_handle_read(VirtIOBlockReq *req)
-{
-    uint64_t sector;
-
-    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
-
-    trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);
-
-    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
-        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-        virtio_blk_free_request(req);
-        return;
-    }
-
-    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
-                     BLOCK_ACCT_READ);
-    blk_aio_readv(req->dev->blk, sector, &req->qiov,
-                  req->qiov.size / BDRV_SECTOR_SIZE,
-                  virtio_blk_rw_complete, req);
-}
-
-void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
+void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb_wr,
+                               MultiReqBuffer *mrb_rd)
 {
     uint32_t type;
     struct iovec *in_iov = req->elem.in_sg;
@@ -397,7 +391,7 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
     type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
 
     if (type & VIRTIO_BLK_T_FLUSH) {
-        virtio_blk_handle_flush(req, mrb);
+        virtio_blk_handle_flush(req, mrb_wr);
     } else if (type & VIRTIO_BLK_T_SCSI_CMD) {
         virtio_blk_handle_scsi(req);
     } else if (type & VIRTIO_BLK_T_GET_ID) {
@@ -414,13 +408,71 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         iov_from_buf(in_iov, in_num, 0, serial, size);
         virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
         virtio_blk_free_request(req);
-    } else if (type & VIRTIO_BLK_T_OUT) {
-        qemu_iovec_init_external(&req->qiov, iov, out_num);
-        virtio_blk_handle_write(req, mrb);
-    } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
-        /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */
-        qemu_iovec_init_external(&req->qiov, in_iov, in_num);
-        virtio_blk_handle_read(req);
+    } else if (type & VIRTIO_BLK_T_OUT || type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
+        bool is_write = type & VIRTIO_BLK_T_OUT;
+        MultiReqBuffer *mrb = is_write ? mrb_wr : mrb_rd;
+        int64_t sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
+        BlockDriverState *bs = blk_bs(req->dev->blk);
+        int nb_sectors = 0;
+        int merge = 1;
+
+        if (!virtio_blk_sect_range_ok(req->dev, sector_num, req->qiov.size)) {
+            virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+            virtio_blk_free_request(req);
+            return;
+        }
+
+        if (is_write) {
+            qemu_iovec_init_external(&req->qiov, iov, out_num);
+            trace_virtio_blk_handle_write(req, sector_num, req->qiov.size / BDRV_SECTOR_SIZE);
+        } else {
+            qemu_iovec_init_external(&req->qiov, in_iov, in_num);
+            trace_virtio_blk_handle_read(req, sector_num, req->qiov.size / BDRV_SECTOR_SIZE);
+        }
+
+        nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
+#ifdef DEBUG_MULTIREQ
+        printf("virtio_blk_handle_request p %p sector_num %ld nb_sectors %d is_write %d\n",
+               req, sector_num, nb_sectors, is_write);
+#endif
+
+        block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
+                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+        /* merge would exceed maximum number of requests or IOVs */
+        if (mrb->num_reqs == MAX_MERGE_REQS ||
+            mrb->qiov.niov + req->qiov.niov + 1 > IOV_MAX) {
+            merge = 0;
+        }
+
+        /* merge would exceed maximum transfer length of backend device */
+        if (bs->bl.max_transfer_length &&
+            mrb->nb_sectors + nb_sectors > bs->bl.max_transfer_length) {
+            merge = 0;
+        }
+
+        /* requests are not sequential */
+        if (mrb->num_reqs && mrb->sector_num + mrb->nb_sectors != sector_num) {
+            merge = 0;
+        }
+
+        if (!merge) {
+            virtio_submit_multireq(req->dev->blk, mrb);
+        }
+
+        if (mrb->num_reqs == 0) {
+            qemu_iovec_init(&mrb->qiov, MAX_MERGE_REQS);
+            mrb->sector_num = sector_num;
+            mrb->nb_sectors = 0;
+        }
+
+        qemu_iovec_concat(&mrb->qiov, &req->qiov, 0, req->qiov.size);
+        mrb->nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
+
+        assert(mrb->nb_sectors == mrb->qiov.size / BDRV_SECTOR_SIZE);
+
+        mrb->reqs[mrb->num_reqs] = req;
+        mrb->num_reqs++;
     } else {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
         virtio_blk_free_request(req);
@@ -431,9 +483,8 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 {
     VirtIOBlock *s = VIRTIO_BLK(vdev);
     VirtIOBlockReq *req;
-    MultiReqBuffer mrb = {
-        .num_writes = 0,
-    };
+    MultiReqBuffer mrb_rd = {};
+    MultiReqBuffer mrb_wr = {.is_write = 1};
 
     /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
      * dataplane here instead of waiting for .set_status().
@@ -444,10 +495,11 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
     }
 
     while ((req = virtio_blk_get_request(s))) {
-        virtio_blk_handle_request(req, &mrb);
+        virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
     }
 
-    virtio_submit_multiwrite(s->blk, &mrb);
+    virtio_submit_multireq(s->blk, &mrb_wr);
+    virtio_submit_multireq(s->blk, &mrb_rd);
 
     /*
      * FIXME: Want to check for completions before returning to guest mode,
@@ -460,9 +512,8 @@ static void virtio_blk_dma_restart_bh(void *opaque)
 {
     VirtIOBlock *s = opaque;
     VirtIOBlockReq *req = s->rq;
-    MultiReqBuffer mrb = {
-        .num_writes = 0,
-    };
+    MultiReqBuffer mrb_rd = {};
+    MultiReqBuffer mrb_wr = {.is_write = 1};
 
     qemu_bh_delete(s->bh);
     s->bh = NULL;
@@ -471,11 +522,12 @@ static void virtio_blk_dma_restart_bh(void *opaque)
 
     while (req) {
         VirtIOBlockReq *next = req->next;
-        virtio_blk_handle_request(req, &mrb);
+        virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
         req = next;
     }
 
-    virtio_submit_multiwrite(s->blk, &mrb);
+    virtio_submit_multireq(s->blk, &mrb_wr);
+    virtio_submit_multireq(s->blk, &mrb_rd);
 }
 
 static void virtio_blk_dma_restart_cb(void *opaque, int running,
diff --git a/include/hw/virtio/virtio-blk.h b/include/hw/virtio/virtio-blk.h
index 3f2652f..edfac67 100644
--- a/include/hw/virtio/virtio-blk.h
+++ b/include/hw/virtio/virtio-blk.h
@@ -134,13 +134,6 @@ typedef struct VirtIOBlock {
     struct VirtIOBlockDataPlane *dataplane;
 } VirtIOBlock;
 
-#define VIRTIO_BLK_MAX_MERGE_REQS 32
-
-typedef struct MultiReqBuffer {
-    BlockRequest        blkreq[VIRTIO_BLK_MAX_MERGE_REQS];
-    unsigned int        num_writes;
-} MultiReqBuffer;
-
 typedef struct VirtIOBlockReq {
     VirtIOBlock *dev;
     VirtQueueElement elem;
@@ -151,6 +144,17 @@ typedef struct VirtIOBlockReq {
     BlockAcctCookie acct;
 } VirtIOBlockReq;
 
+#define MAX_MERGE_REQS 32
+
+typedef struct MultiReqBuffer {
+    VirtIOBlockReq *reqs[MAX_MERGE_REQS];
+    unsigned int   num_reqs;
+    bool           is_write;
+    QEMUIOVector   qiov;
+    int64_t        sector_num;
+    int            nb_sectors;
+} MultiReqBuffer;
+
 VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s);
 
 void virtio_blk_free_request(VirtIOBlockReq *req);
@@ -158,8 +162,9 @@ void virtio_blk_free_request(VirtIOBlockReq *req);
 int virtio_blk_handle_scsi_req(VirtIOBlock *blk,
                                VirtQueueElement *elem);
 
-void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb);
+void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb_wr,
+                               MultiReqBuffer *mrb_rd);
 
-void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb);
+void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb);
 
 #endif
-- 
1.7.9.5

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread
  2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread Peter Lieven
@ 2014-12-03 17:25   ` Max Reitz
  2014-12-04  7:16     ` Peter Lieven
  2014-12-04 14:12   ` Kevin Wolf
  1 sibling, 1 reply; 11+ messages in thread
From: Max Reitz @ 2014-12-03 17:25 UTC (permalink / raw)
  To: Peter Lieven, qemu-devel
  Cc: kwolf, famz, benoit, ming.lei, armbru, stefanha, pbonzini

On 2014-12-02 at 15:33, Peter Lieven wrote:
> this patch finally introduce multiread support to virtio-blk while
> multiwrite support was there for a long time read support was missing.
>
> To achieve this the patch does serveral things which might need futher
> explaination:
>
>   - the whole merge and multireq logic is moved from block.c into
>     virtio-blk. This is move is a preparation for directly creating a
>     coroutine out of virtio-blk.
>
>   - requests are only merged if they are strictly sequential and no
>     longer sorted. This simplification decreases overhead and reduces
>     latency. It will also merge some requests which were unmergable before.
>
>     The old algorithm took up to 32 requests sorted them and tried to merge
>     them. The outcome was anything between 1 and 32 requests. In case of
>     32 requests there were 31 requests unnecessarily delayed.
>
>     On the other hand lets imagine e.g. 16 unmergeable requests followed
>     by 32 mergable requests. The latter 32 requests would have been split
>     into two 16 byte requests.
>
>     Last the simplified logic allows for a fast path if we have only a
>     single request in the multirequest. In this case the request is sent as
>     ordinary request without mulltireq callbacks.
>
> As a first benchmark I installed Ubuntu 14.04.1 on a ramdisk. The number of
> merged requests is in the same order while the latency is slightly decreased.
> One should not stick to much to the numbers because the number of wr_requests
> are highly fluctuant. I hope the numbers show that this patch is at least
> not causing to big harm:
>
> Cmdline:
> qemu-system-x86_64 -m 1024 -smp 2 -enable-kvm -cdrom ubuntu-14.04.1-server-amd64.iso \
>   -drive if=virtio,file=/tmp/ubuntu.raw -monitor stdio
>
> Before:
> virtio0: rd_bytes=150979072 wr_bytes=2591138816 rd_operations=18475 wr_operations=69216
>           flush_operations=15343 wr_total_time_ns=26969283701 rd_total_time_ns=1018449432
>           flush_total_time_ns=562596819 rd_merged=0 wr_merged=10453
>
> After:
> virtio0: rd_bytes=148112896 wr_bytes=2845834240 rd_operations=17535 wr_operations=72197
>           flush_operations=15760 wr_total_time_ns=26104971623 rd_total_time_ns=1012926283
>           flush_total_time_ns=564414752 rd_merged=517 wr_merged=9859
>
> Some first numbers of improved read performance while booting:
>
> The Ubuntu 14.04.1 vServer from above:
> virtio0: rd_bytes=86158336 wr_bytes=688128 rd_operations=5851 wr_operations=70
>           flush_operations=4 wr_total_time_ns=10886705 rd_total_time_ns=416688595
>           flush_total_time_ns=288776 rd_merged=1297 wr_merged=2
>
> Windows 2012R2:
> virtio0: rd_bytes=176559104 wr_bytes=61859840 rd_operations=7200 wr_operations=360
>           flush_operations=68 wr_total_time_ns=34344992718 rd_total_time_ns=134386844669
>           flush_total_time_ns=18115517 rd_merged=641 wr_merged=216
>
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>   hw/block/dataplane/virtio-blk.c |   10 +-
>   hw/block/virtio-blk.c           |  222 ++++++++++++++++++++++++---------------
>   include/hw/virtio/virtio-blk.h  |   23 ++--
>   3 files changed, 156 insertions(+), 99 deletions(-)

So, since you CC-ed me, I guess you're expecting a reply from me. I feel 
like I'm saying this more often recently (about anything, not just about 
virtio), but I don't know anything about virtio, so you should take 
anything from me with a not just a grain but more of a pack of salt.

> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
> index 1222a37..aa4ad91 100644
> --- a/hw/block/dataplane/virtio-blk.c
> +++ b/hw/block/dataplane/virtio-blk.c
> @@ -96,9 +96,8 @@ static void handle_notify(EventNotifier *e)
>       event_notifier_test_and_clear(&s->host_notifier);
>       blk_io_plug(s->conf->conf.blk);
>       for (;;) {
> -        MultiReqBuffer mrb = {
> -            .num_writes = 0,
> -        };
> +        MultiReqBuffer mrb_rd = {};
> +        MultiReqBuffer mrb_wr = {.is_write = 1};
>           int ret;
>   
>           /* Disable guest->host notifies to avoid unnecessary vmexits */
> @@ -117,10 +116,11 @@ static void handle_notify(EventNotifier *e)
>                                                           req->elem.in_num,
>                                                           req->elem.index);
>   
> -            virtio_blk_handle_request(req, &mrb);
> +            virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
>           }
>   
> -        virtio_submit_multiwrite(s->conf->conf.blk, &mrb);
> +        virtio_submit_multireq(s->conf->conf.blk, &mrb_wr);
> +        virtio_submit_multireq(s->conf->conf.blk, &mrb_rd);
>   
>           if (likely(ret == -EAGAIN)) { /* vring emptied */
>               /* Re-enable guest->host notifies and stop processing the vring.
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 490f961..f522bfc 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -22,12 +22,15 @@
>   #include "dataplane/virtio-blk.h"
>   #include "migration/migration.h"
>   #include "block/scsi.h"
> +#include "block/block_int.h"
>   #ifdef __linux__
>   # include <scsi/sg.h>
>   #endif
>   #include "hw/virtio/virtio-bus.h"
>   #include "hw/virtio/virtio-access.h"
>   
> +/* #define DEBUG_MULTIREQ */
> +
>   VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
>   {
>       VirtIOBlockReq *req = g_slice_new(VirtIOBlockReq);
> @@ -88,6 +91,11 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
>   
>       trace_virtio_blk_rw_complete(req, ret);
>   
> +#ifdef DEBUG_MULTIREQ
> +    printf("virtio_blk_rw_complete p %p ret %d\n",
> +           req, ret);
> +#endif
> +
>       if (ret) {
>           int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
>           bool is_read = !(p & VIRTIO_BLK_T_OUT);
> @@ -257,24 +265,63 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
>       virtio_blk_free_request(req);
>   }
>   
> -void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb)
> +static void virtio_multireq_cb(void *opaque, int ret)
> +{
> +    MultiReqBuffer *mrb = opaque;
> +    int i;
> +#ifdef DEBUG_MULTIREQ
> +    printf("virtio_multireq_cb: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
> +           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
> +#endif
> +    for (i = 0; i < mrb->num_reqs; i++) {
> +        virtio_blk_rw_complete(mrb->reqs[i], ret);
> +    }
> +
> +    qemu_iovec_destroy(&mrb->qiov);
> +    g_free(mrb);
> +}
> +
> +void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb0)
>   {
> -    int i, ret;
> +    MultiReqBuffer *mrb = NULL;
>   
> -    if (!mrb->num_writes) {
> +    if (!mrb0->num_reqs) {
>           return;
>       }
>   
> -    ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes);
> -    if (ret != 0) {
> -        for (i = 0; i < mrb->num_writes; i++) {
> -            if (mrb->blkreq[i].error) {
> -                virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
> -            }
> +    if (mrb0->num_reqs == 1) {
> +        if (mrb0->is_write) {
> +            blk_aio_writev(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
> +                           virtio_blk_rw_complete, mrb0->reqs[0]);
> +        } else {
> +            blk_aio_readv(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
> +                          virtio_blk_rw_complete, mrb0->reqs[0]);
>           }
> +        qemu_iovec_destroy(&mrb0->qiov);
> +        mrb0->num_reqs = 0;
> +        return;
> +    }
> +
> +    mrb = g_malloc(sizeof(MultiReqBuffer));
> +    memcpy(mrb, mrb0, sizeof(MultiReqBuffer));
> +    mrb0->num_reqs = 0;

I guess you're making mrb0->num_reqs == 1 a special case so you don't 
have to duplicate mrb0 in that case. But why are you duplicating it at 
all? And you're just moving over the IO vector without duplicating 
mrb->qiov.iov, which seems risky to me considering the 
qemu_iovec_destroy() in virtio_multireq_cb().

I could imagine you're duplicating it so the request can run in the 
background while you're again filling up the MultiReqBuffer, but that 
doesn't fit together with not duplicating it in the case of 
mrb0->num_reqs == 1 which can run in the background just the same.

> +
> +#ifdef DEBUG_MULTIREQ
> +    printf("virtio_submit_multireq: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
> +           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
> +#endif
> +
> +    if (mrb->is_write) {
> +        blk_aio_writev(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
> +                       virtio_multireq_cb, mrb);
> +    } else {
> +        blk_aio_readv(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
> +                      virtio_multireq_cb, mrb);
>       }
>   
> -    mrb->num_writes = 0;
> +    block_acct_merge_done(blk_get_stats(blk),
> +                          mrb->is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
> +                          mrb->num_reqs - 1);
>   }
>   
>   static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> @@ -283,9 +330,9 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>                        BLOCK_ACCT_FLUSH);
>   
>       /*
> -     * Make sure all outstanding writes are posted to the backing device.
> +     * Make sure all outstanding requests are posted to the backing device.
>        */
> -    virtio_submit_multiwrite(req->dev->blk, mrb);
> +    virtio_submit_multireq(req->dev->blk, mrb);
>       blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
>   }
>   
> @@ -308,61 +355,8 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
>       return true;
>   }
>   
> -static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> -{
> -    BlockRequest *blkreq;
> -    uint64_t sector;
> -
> -    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
> -
> -    trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
> -
> -    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
> -        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
> -        virtio_blk_free_request(req);
> -        return;
> -    }
> -
> -    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
> -                     BLOCK_ACCT_WRITE);
> -
> -    if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) {
> -        virtio_submit_multiwrite(req->dev->blk, mrb);
> -    }
> -
> -    blkreq = &mrb->blkreq[mrb->num_writes];
> -    blkreq->sector = sector;
> -    blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
> -    blkreq->qiov = &req->qiov;
> -    blkreq->cb = virtio_blk_rw_complete;
> -    blkreq->opaque = req;
> -    blkreq->error = 0;
> -
> -    mrb->num_writes++;
> -}
> -
> -static void virtio_blk_handle_read(VirtIOBlockReq *req)
> -{
> -    uint64_t sector;
> -
> -    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
> -
> -    trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);
> -
> -    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
> -        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
> -        virtio_blk_free_request(req);
> -        return;
> -    }
> -
> -    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
> -                     BLOCK_ACCT_READ);
> -    blk_aio_readv(req->dev->blk, sector, &req->qiov,
> -                  req->qiov.size / BDRV_SECTOR_SIZE,
> -                  virtio_blk_rw_complete, req);
> -}
> -
> -void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> +void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb_wr,
> +                               MultiReqBuffer *mrb_rd)
>   {
>       uint32_t type;
>       struct iovec *in_iov = req->elem.in_sg;
> @@ -397,7 +391,7 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>       type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
>   
>       if (type & VIRTIO_BLK_T_FLUSH) {
> -        virtio_blk_handle_flush(req, mrb);
> +        virtio_blk_handle_flush(req, mrb_wr);
>       } else if (type & VIRTIO_BLK_T_SCSI_CMD) {
>           virtio_blk_handle_scsi(req);
>       } else if (type & VIRTIO_BLK_T_GET_ID) {
> @@ -414,13 +408,71 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>           iov_from_buf(in_iov, in_num, 0, serial, size);
>           virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
>           virtio_blk_free_request(req);
> -    } else if (type & VIRTIO_BLK_T_OUT) {
> -        qemu_iovec_init_external(&req->qiov, iov, out_num);
> -        virtio_blk_handle_write(req, mrb);
> -    } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
> -        /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */
> -        qemu_iovec_init_external(&req->qiov, in_iov, in_num);
> -        virtio_blk_handle_read(req);
> +    } else if (type & VIRTIO_BLK_T_OUT || type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {

Please don't omit the comment about VIRTIO_BLK_T_IN being 0, I spend 
quite a while thinking about why VIRTIO_BLK_T_BARRIER is seemingly 
handled just the same as VIRTIO_BLK_T_IN. I still don't know what 
VIRTIO_BLK_T_BARRIER is supposed to do (and I don't really want to grab 
the virtio spec right now, although I should do it some time...), but 
now it at least seems to me like it doesn't really matter.

> +        bool is_write = type & VIRTIO_BLK_T_OUT;
> +        MultiReqBuffer *mrb = is_write ? mrb_wr : mrb_rd;
> +        int64_t sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
> +        BlockDriverState *bs = blk_bs(req->dev->blk);
> +        int nb_sectors = 0;
> +        int merge = 1;

I shouldn't be criticizing such small issues in an RFC, but it's the 
most I can do for a virtio-blk series: I'd like this to be a bool 
instead of an int, please.

Judging from the not-so-quick-in-quantity-but-pretty-brief-in-quality 
scan through this RFC, it looks OK to me. There are some minor things 
(e.g. "too" instead of "to" in the commit message) which should be fixed 
eventually, but logic-wise, I didn't spot anything major.

Perhaps we could put the merge logic into an own function, though, just 
like bdrv_aio_multiwrite() did with multiwrite_merge().

I trust you that overlapping requests are not an issue, at least when 
not sorting the requests. Other than that, nothing bad from me. The 
patch looks good, but I don't want to be too optimistic, as I don't want 
to definitely judge anything related to virtio-blk. :-)

(Apart from the first two patches of this series, they were completely 
fine, but I seem to remember having them reviewed once already...)

Max

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread
  2014-12-03 17:25   ` Max Reitz
@ 2014-12-04  7:16     ` Peter Lieven
  0 siblings, 0 replies; 11+ messages in thread
From: Peter Lieven @ 2014-12-04  7:16 UTC (permalink / raw)
  To: Max Reitz, qemu-devel
  Cc: kwolf, famz, benoit, ming.lei, armbru, stefanha, pbonzini

On 03.12.2014 18:25, Max Reitz wrote:
> On 2014-12-02 at 15:33, Peter Lieven wrote:
>> this patch finally introduce multiread support to virtio-blk while
>> multiwrite support was there for a long time read support was missing.
>>
>> To achieve this the patch does serveral things which might need futher
>> explaination:
>>
>>   - the whole merge and multireq logic is moved from block.c into
>>     virtio-blk. This is move is a preparation for directly creating a
>>     coroutine out of virtio-blk.
>>
>>   - requests are only merged if they are strictly sequential and no
>>     longer sorted. This simplification decreases overhead and reduces
>>     latency. It will also merge some requests which were unmergable before.
>>
>>     The old algorithm took up to 32 requests sorted them and tried to merge
>>     them. The outcome was anything between 1 and 32 requests. In case of
>>     32 requests there were 31 requests unnecessarily delayed.
>>
>>     On the other hand lets imagine e.g. 16 unmergeable requests followed
>>     by 32 mergable requests. The latter 32 requests would have been split
>>     into two 16 byte requests.
>>
>>     Last the simplified logic allows for a fast path if we have only a
>>     single request in the multirequest. In this case the request is sent as
>>     ordinary request without mulltireq callbacks.
>>
>> As a first benchmark I installed Ubuntu 14.04.1 on a ramdisk. The number of
>> merged requests is in the same order while the latency is slightly decreased.
>> One should not stick to much to the numbers because the number of wr_requests
>> are highly fluctuant. I hope the numbers show that this patch is at least
>> not causing to big harm:
>>
>> Cmdline:
>> qemu-system-x86_64 -m 1024 -smp 2 -enable-kvm -cdrom ubuntu-14.04.1-server-amd64.iso \
>>   -drive if=virtio,file=/tmp/ubuntu.raw -monitor stdio
>>
>> Before:
>> virtio0: rd_bytes=150979072 wr_bytes=2591138816 rd_operations=18475 wr_operations=69216
>>           flush_operations=15343 wr_total_time_ns=26969283701 rd_total_time_ns=1018449432
>>           flush_total_time_ns=562596819 rd_merged=0 wr_merged=10453
>>
>> After:
>> virtio0: rd_bytes=148112896 wr_bytes=2845834240 rd_operations=17535 wr_operations=72197
>>           flush_operations=15760 wr_total_time_ns=26104971623 rd_total_time_ns=1012926283
>>           flush_total_time_ns=564414752 rd_merged=517 wr_merged=9859
>>
>> Some first numbers of improved read performance while booting:
>>
>> The Ubuntu 14.04.1 vServer from above:
>> virtio0: rd_bytes=86158336 wr_bytes=688128 rd_operations=5851 wr_operations=70
>>           flush_operations=4 wr_total_time_ns=10886705 rd_total_time_ns=416688595
>>           flush_total_time_ns=288776 rd_merged=1297 wr_merged=2
>>
>> Windows 2012R2:
>> virtio0: rd_bytes=176559104 wr_bytes=61859840 rd_operations=7200 wr_operations=360
>>           flush_operations=68 wr_total_time_ns=34344992718 rd_total_time_ns=134386844669
>>           flush_total_time_ns=18115517 rd_merged=641 wr_merged=216
>>
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>>   hw/block/dataplane/virtio-blk.c |   10 +-
>>   hw/block/virtio-blk.c           |  222 ++++++++++++++++++++++++---------------
>>   include/hw/virtio/virtio-blk.h  |   23 ++--
>>   3 files changed, 156 insertions(+), 99 deletions(-)
>
> So, since you CC-ed me, I guess you're expecting a reply from me. I feel like I'm saying this more often recently (about anything, not just about virtio), but I don't know anything about virtio, so you should take anything from me with a not just a 
> grain but more of a pack of salt.

Thats fine with me, I appreciate your feedback nevertheless.

>
>> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
>> index 1222a37..aa4ad91 100644
>> --- a/hw/block/dataplane/virtio-blk.c
>> +++ b/hw/block/dataplane/virtio-blk.c
>> @@ -96,9 +96,8 @@ static void handle_notify(EventNotifier *e)
>>       event_notifier_test_and_clear(&s->host_notifier);
>>       blk_io_plug(s->conf->conf.blk);
>>       for (;;) {
>> -        MultiReqBuffer mrb = {
>> -            .num_writes = 0,
>> -        };
>> +        MultiReqBuffer mrb_rd = {};
>> +        MultiReqBuffer mrb_wr = {.is_write = 1};
>>           int ret;
>>             /* Disable guest->host notifies to avoid unnecessary vmexits */
>> @@ -117,10 +116,11 @@ static void handle_notify(EventNotifier *e)
>> req->elem.in_num,
>> req->elem.index);
>>   -            virtio_blk_handle_request(req, &mrb);
>> +            virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
>>           }
>>   -        virtio_submit_multiwrite(s->conf->conf.blk, &mrb);
>> +        virtio_submit_multireq(s->conf->conf.blk, &mrb_wr);
>> +        virtio_submit_multireq(s->conf->conf.blk, &mrb_rd);
>>             if (likely(ret == -EAGAIN)) { /* vring emptied */
>>               /* Re-enable guest->host notifies and stop processing the vring.
>> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
>> index 490f961..f522bfc 100644
>> --- a/hw/block/virtio-blk.c
>> +++ b/hw/block/virtio-blk.c
>> @@ -22,12 +22,15 @@
>>   #include "dataplane/virtio-blk.h"
>>   #include "migration/migration.h"
>>   #include "block/scsi.h"
>> +#include "block/block_int.h"
>>   #ifdef __linux__
>>   # include <scsi/sg.h>
>>   #endif
>>   #include "hw/virtio/virtio-bus.h"
>>   #include "hw/virtio/virtio-access.h"
>>   +/* #define DEBUG_MULTIREQ */
>> +
>>   VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
>>   {
>>       VirtIOBlockReq *req = g_slice_new(VirtIOBlockReq);
>> @@ -88,6 +91,11 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
>>         trace_virtio_blk_rw_complete(req, ret);
>>   +#ifdef DEBUG_MULTIREQ
>> +    printf("virtio_blk_rw_complete p %p ret %d\n",
>> +           req, ret);
>> +#endif
>> +
>>       if (ret) {
>>           int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
>>           bool is_read = !(p & VIRTIO_BLK_T_OUT);
>> @@ -257,24 +265,63 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
>>       virtio_blk_free_request(req);
>>   }
>>   -void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb)
>> +static void virtio_multireq_cb(void *opaque, int ret)
>> +{
>> +    MultiReqBuffer *mrb = opaque;
>> +    int i;
>> +#ifdef DEBUG_MULTIREQ
>> +    printf("virtio_multireq_cb: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
>> +           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
>> +#endif
>> +    for (i = 0; i < mrb->num_reqs; i++) {
>> +        virtio_blk_rw_complete(mrb->reqs[i], ret);
>> +    }
>> +
>> +    qemu_iovec_destroy(&mrb->qiov);
>> +    g_free(mrb);
>> +}
>> +
>> +void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb0)
>>   {
>> -    int i, ret;
>> +    MultiReqBuffer *mrb = NULL;
>>   -    if (!mrb->num_writes) {
>> +    if (!mrb0->num_reqs) {
>>           return;
>>       }
>>   -    ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes);
>> -    if (ret != 0) {
>> -        for (i = 0; i < mrb->num_writes; i++) {
>> -            if (mrb->blkreq[i].error) {
>> - virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
>> -            }
>> +    if (mrb0->num_reqs == 1) {
>> +        if (mrb0->is_write) {
>> +            blk_aio_writev(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
>> +                           virtio_blk_rw_complete, mrb0->reqs[0]);
>> +        } else {
>> +            blk_aio_readv(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
>> +                          virtio_blk_rw_complete, mrb0->reqs[0]);
>>           }
>> +        qemu_iovec_destroy(&mrb0->qiov);
>> +        mrb0->num_reqs = 0;
>> +        return;
>> +    }
>> +
>> +    mrb = g_malloc(sizeof(MultiReqBuffer));
>> +    memcpy(mrb, mrb0, sizeof(MultiReqBuffer));
>> +    mrb0->num_reqs = 0;
>
> I guess you're making mrb0->num_reqs == 1 a special case so you don't have to duplicate mrb0 in that case. But why are you duplicating it at all? And you're just moving over the IO vector without duplicating mrb->qiov.iov, which seems risky to me 
> considering the qemu_iovec_destroy() in virtio_multireq_cb().
>
> I could imagine you're duplicating it so the request can run in the background while you're again filling up the MultiReqBuffer, but that doesn't fit together with not duplicating it in the case of mrb0->num_reqs == 1 which can run in the background 
> just the same.

I need to duplicate it in the > 1 case because MultiReqBuffer might be recycled before the virtio_multireq_cb callback is fired. In the == 1 case this can't happen because I pass the qiov from req[0] and use
the normal callback.

However, your concern reminds me of an optimization I had in mind, but that I forgot in this RFC patch.
In case of == 1 there is a useless init and destroy of mrb->qiov and 1 qemu_iovec_concat. I will adjust this
the next version by creating mrb->qiov from the req[]->qiov array only I enter the part of virtio_submit_multireq
for mrb->num_reqs > 1.

>
>> +
>> +#ifdef DEBUG_MULTIREQ
>> +    printf("virtio_submit_multireq: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
>> +           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
>> +#endif
>> +
>> +    if (mrb->is_write) {
>> +        blk_aio_writev(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
>> +                       virtio_multireq_cb, mrb);
>> +    } else {
>> +        blk_aio_readv(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
>> +                      virtio_multireq_cb, mrb);
>>       }
>>   -    mrb->num_writes = 0;
>> +    block_acct_merge_done(blk_get_stats(blk),
>> +                          mrb->is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
>> +                          mrb->num_reqs - 1);
>>   }
>>     static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>> @@ -283,9 +330,9 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>>                        BLOCK_ACCT_FLUSH);
>>         /*
>> -     * Make sure all outstanding writes are posted to the backing device.
>> +     * Make sure all outstanding requests are posted to the backing device.
>>        */
>> -    virtio_submit_multiwrite(req->dev->blk, mrb);
>> +    virtio_submit_multireq(req->dev->blk, mrb);
>>       blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
>>   }
>>   @@ -308,61 +355,8 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
>>       return true;
>>   }
>>   -static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>> -{
>> -    BlockRequest *blkreq;
>> -    uint64_t sector;
>> -
>> -    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
>> -
>> -    trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
>> -
>> -    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
>> -        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
>> -        virtio_blk_free_request(req);
>> -        return;
>> -    }
>> -
>> -    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
>> -                     BLOCK_ACCT_WRITE);
>> -
>> -    if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) {
>> -        virtio_submit_multiwrite(req->dev->blk, mrb);
>> -    }
>> -
>> -    blkreq = &mrb->blkreq[mrb->num_writes];
>> -    blkreq->sector = sector;
>> -    blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
>> -    blkreq->qiov = &req->qiov;
>> -    blkreq->cb = virtio_blk_rw_complete;
>> -    blkreq->opaque = req;
>> -    blkreq->error = 0;
>> -
>> -    mrb->num_writes++;
>> -}
>> -
>> -static void virtio_blk_handle_read(VirtIOBlockReq *req)
>> -{
>> -    uint64_t sector;
>> -
>> -    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
>> -
>> -    trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);
>> -
>> -    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
>> -        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
>> -        virtio_blk_free_request(req);
>> -        return;
>> -    }
>> -
>> -    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
>> -                     BLOCK_ACCT_READ);
>> -    blk_aio_readv(req->dev->blk, sector, &req->qiov,
>> -                  req->qiov.size / BDRV_SECTOR_SIZE,
>> -                  virtio_blk_rw_complete, req);
>> -}
>> -
>> -void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>> +void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb_wr,
>> +                               MultiReqBuffer *mrb_rd)
>>   {
>>       uint32_t type;
>>       struct iovec *in_iov = req->elem.in_sg;
>> @@ -397,7 +391,7 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>>       type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
>>         if (type & VIRTIO_BLK_T_FLUSH) {
>> -        virtio_blk_handle_flush(req, mrb);
>> +        virtio_blk_handle_flush(req, mrb_wr);
>>       } else if (type & VIRTIO_BLK_T_SCSI_CMD) {
>>           virtio_blk_handle_scsi(req);
>>       } else if (type & VIRTIO_BLK_T_GET_ID) {
>> @@ -414,13 +408,71 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>>           iov_from_buf(in_iov, in_num, 0, serial, size);
>>           virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
>>           virtio_blk_free_request(req);
>> -    } else if (type & VIRTIO_BLK_T_OUT) {
>> -        qemu_iovec_init_external(&req->qiov, iov, out_num);
>> -        virtio_blk_handle_write(req, mrb);
>> -    } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
>> -        /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */
>> -        qemu_iovec_init_external(&req->qiov, in_iov, in_num);
>> -        virtio_blk_handle_read(req);
>> +    } else if (type & VIRTIO_BLK_T_OUT || type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
>
> Please don't omit the comment about VIRTIO_BLK_T_IN being 0, I spend quite a while thinking about why VIRTIO_BLK_T_BARRIER is seemingly handled just the same as VIRTIO_BLK_T_IN. I still don't know what VIRTIO_BLK_T_BARRIER is supposed to do (and I don't 
> really want to grab the virtio spec right now, although I should do it some time...), but now it at least seems to me like it doesn't really matter.

Actually I don't know as well. I just copied the behaviour. But I will try to find out what VIRTIO_BLK_BARRIER is supposed to do.

>
>> +        bool is_write = type & VIRTIO_BLK_T_OUT;
>> +        MultiReqBuffer *mrb = is_write ? mrb_wr : mrb_rd;
>> +        int64_t sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
>> +        BlockDriverState *bs = blk_bs(req->dev->blk);
>> +        int nb_sectors = 0;
>> +        int merge = 1;
>
> I shouldn't be criticizing such small issues in an RFC, but it's the most I can do for a virtio-blk series: I'd like this to be a bool instead of an int, please.
>
> Judging from the not-so-quick-in-quantity-but-pretty-brief-in-quality scan through this RFC, it looks OK to me. There are some minor things (e.g. "too" instead of "to" in the commit message) which should be fixed eventually, but logic-wise, I didn't 
> spot anything major.
>
> Perhaps we could put the merge logic into an own function, though, just like bdrv_aio_multiwrite() did with multiwrite_merge().

I also though about that, but the merge logic itself is pretty compact now and deals with Virtio specific structs. I think there will be much more code
and more complicated code if I add a generic version. I don't know if an driver will adopt the multiread stuff. No driver has done since 2009 actually.

>
> I trust you that overlapping requests are not an issue, at least when not sorting the requests. Other than that, nothing bad from me. The patch looks good, but I don't want to be too optimistic, as I don't want to definitely judge anything related to 
> virtio-blk. :-)
>
> (Apart from the first two patches of this series, they were completely fine, but I seem to remember having them reviewed once already...)

Mostly, but I had to change a few things. So I couldn't copy the Reviewed-by.

Thank you,
Peter

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread
  2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread Peter Lieven
  2014-12-03 17:25   ` Max Reitz
@ 2014-12-04 14:12   ` Kevin Wolf
  2014-12-04 14:42     ` Peter Lieven
  1 sibling, 1 reply; 11+ messages in thread
From: Kevin Wolf @ 2014-12-04 14:12 UTC (permalink / raw)
  To: Peter Lieven
  Cc: famz, benoit, ming.lei, armbru, qemu-devel, stefanha, pbonzini,
	mreitz

Am 02.12.2014 um 15:33 hat Peter Lieven geschrieben:
> this patch finally introduce multiread support to virtio-blk while
> multiwrite support was there for a long time read support was missing.
> 
> To achieve this the patch does serveral things which might need futher
> explaination:
> 
>  - the whole merge and multireq logic is moved from block.c into
>    virtio-blk. This is move is a preparation for directly creating a
>    coroutine out of virtio-blk.
> 
>  - requests are only merged if they are strictly sequential and no
>    longer sorted. This simplification decreases overhead and reduces
>    latency. It will also merge some requests which were unmergable before.
> 
>    The old algorithm took up to 32 requests sorted them and tried to merge
>    them. The outcome was anything between 1 and 32 requests. In case of
>    32 requests there were 31 requests unnecessarily delayed.
> 
>    On the other hand lets imagine e.g. 16 unmergeable requests followed
>    by 32 mergable requests. The latter 32 requests would have been split
>    into two 16 byte requests.
> 
>    Last the simplified logic allows for a fast path if we have only a
>    single request in the multirequest. In this case the request is sent as
>    ordinary request without mulltireq callbacks.
> 
> As a first benchmark I installed Ubuntu 14.04.1 on a ramdisk. The number of
> merged requests is in the same order while the latency is slightly decreased.
> One should not stick to much to the numbers because the number of wr_requests
> are highly fluctuant. I hope the numbers show that this patch is at least
> not causing to big harm:
> 
> Cmdline:
> qemu-system-x86_64 -m 1024 -smp 2 -enable-kvm -cdrom ubuntu-14.04.1-server-amd64.iso \
>  -drive if=virtio,file=/tmp/ubuntu.raw -monitor stdio
> 
> Before:
> virtio0: rd_bytes=150979072 wr_bytes=2591138816 rd_operations=18475 wr_operations=69216
>          flush_operations=15343 wr_total_time_ns=26969283701 rd_total_time_ns=1018449432
>          flush_total_time_ns=562596819 rd_merged=0 wr_merged=10453
> 
> After:
> virtio0: rd_bytes=148112896 wr_bytes=2845834240 rd_operations=17535 wr_operations=72197
>          flush_operations=15760 wr_total_time_ns=26104971623 rd_total_time_ns=1012926283
>          flush_total_time_ns=564414752 rd_merged=517 wr_merged=9859
> 
> Some first numbers of improved read performance while booting:
> 
> The Ubuntu 14.04.1 vServer from above:
> virtio0: rd_bytes=86158336 wr_bytes=688128 rd_operations=5851 wr_operations=70
>          flush_operations=4 wr_total_time_ns=10886705 rd_total_time_ns=416688595
>          flush_total_time_ns=288776 rd_merged=1297 wr_merged=2
> 
> Windows 2012R2:
> virtio0: rd_bytes=176559104 wr_bytes=61859840 rd_operations=7200 wr_operations=360
>          flush_operations=68 wr_total_time_ns=34344992718 rd_total_time_ns=134386844669
>          flush_total_time_ns=18115517 rd_merged=641 wr_merged=216
> 
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  hw/block/dataplane/virtio-blk.c |   10 +-
>  hw/block/virtio-blk.c           |  222 ++++++++++++++++++++++++---------------
>  include/hw/virtio/virtio-blk.h  |   23 ++--
>  3 files changed, 156 insertions(+), 99 deletions(-)
> 
> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
> index 1222a37..aa4ad91 100644
> --- a/hw/block/dataplane/virtio-blk.c
> +++ b/hw/block/dataplane/virtio-blk.c
> @@ -96,9 +96,8 @@ static void handle_notify(EventNotifier *e)
>      event_notifier_test_and_clear(&s->host_notifier);
>      blk_io_plug(s->conf->conf.blk);
>      for (;;) {
> -        MultiReqBuffer mrb = {
> -            .num_writes = 0,
> -        };
> +        MultiReqBuffer mrb_rd = {};
> +        MultiReqBuffer mrb_wr = {.is_write = 1};
>          int ret;
>  
>          /* Disable guest->host notifies to avoid unnecessary vmexits */
> @@ -117,10 +116,11 @@ static void handle_notify(EventNotifier *e)
>                                                          req->elem.in_num,
>                                                          req->elem.index);
>  
> -            virtio_blk_handle_request(req, &mrb);
> +            virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
>          }
>  
> -        virtio_submit_multiwrite(s->conf->conf.blk, &mrb);
> +        virtio_submit_multireq(s->conf->conf.blk, &mrb_wr);
> +        virtio_submit_multireq(s->conf->conf.blk, &mrb_rd);
>  
>          if (likely(ret == -EAGAIN)) { /* vring emptied */
>              /* Re-enable guest->host notifies and stop processing the vring.
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 490f961..f522bfc 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -22,12 +22,15 @@
>  #include "dataplane/virtio-blk.h"
>  #include "migration/migration.h"
>  #include "block/scsi.h"
> +#include "block/block_int.h"

No. :-)

We could either expose the information that you need through
BlockBackend, or wait until your automatic request splitting logic is
implemented in block.c and the information isn't needed here any more.

>  #ifdef __linux__
>  # include <scsi/sg.h>
>  #endif
>  #include "hw/virtio/virtio-bus.h"
>  #include "hw/virtio/virtio-access.h"
>  
> +/* #define DEBUG_MULTIREQ */
> +
>  VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
>  {
>      VirtIOBlockReq *req = g_slice_new(VirtIOBlockReq);
> @@ -88,6 +91,11 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
>  
>      trace_virtio_blk_rw_complete(req, ret);
>  
> +#ifdef DEBUG_MULTIREQ
> +    printf("virtio_blk_rw_complete p %p ret %d\n",
> +           req, ret);
> +#endif

In the final version, please use tracepoints instead of printfs.

>      if (ret) {
>          int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
>          bool is_read = !(p & VIRTIO_BLK_T_OUT);
> @@ -257,24 +265,63 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
>      virtio_blk_free_request(req);
>  }
>  
> -void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb)
> +static void virtio_multireq_cb(void *opaque, int ret)
> +{
> +    MultiReqBuffer *mrb = opaque;
> +    int i;
> +#ifdef DEBUG_MULTIREQ
> +    printf("virtio_multireq_cb: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
> +           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
> +#endif
> +    for (i = 0; i < mrb->num_reqs; i++) {
> +        virtio_blk_rw_complete(mrb->reqs[i], ret);
> +    }
> +
> +    qemu_iovec_destroy(&mrb->qiov);
> +    g_free(mrb);
> +}
> +
> +void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb0)
>  {
> -    int i, ret;
> +    MultiReqBuffer *mrb = NULL;
>  
> -    if (!mrb->num_writes) {
> +    if (!mrb0->num_reqs) {
>          return;
>      }
>  
> -    ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes);
> -    if (ret != 0) {
> -        for (i = 0; i < mrb->num_writes; i++) {
> -            if (mrb->blkreq[i].error) {
> -                virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
> -            }
> +    if (mrb0->num_reqs == 1) {
> +        if (mrb0->is_write) {
> +            blk_aio_writev(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
> +                           virtio_blk_rw_complete, mrb0->reqs[0]);
> +        } else {
> +            blk_aio_readv(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
> +                          virtio_blk_rw_complete, mrb0->reqs[0]);
>          }
> +        qemu_iovec_destroy(&mrb0->qiov);
> +        mrb0->num_reqs = 0;
> +        return;
> +    }
> +
> +    mrb = g_malloc(sizeof(MultiReqBuffer));
> +    memcpy(mrb, mrb0, sizeof(MultiReqBuffer));
> +    mrb0->num_reqs = 0;

We probably want to avoid allocations as much as we can. If at all
possible, we don't want to memcpy() either.

Most parts of MultiReqBuffer don't seem to be actually needed during the
request and in the callback any more. Essentially, all that is needed is
the qiov and a way to find all the requests that belong to the same
MultiReqBuffer, so that they can all be completed.

We could create a linked list by having a VirtIOBlockReq *next in the
request struct. For the qiov we can probably just use the field in the
first request (i.e. we extend the qiov of the first request).

With these requirements removed, MultiReqBuffer can live on the stack
and doesn't need to be copied to the heap.


With the additional patch that you send me, the next few lines read:

+    qemu_iovec_init(&mrb->qiov, mrb->num_reqs);
+
+    for (i = 0; i < mrb->num_reqs; i++) {
+        qemu_iovec_concat(&mrb->qiov, &mrb->reqs[i]->qiov, 0, mrb->reqs[i]->qiov.size);
+    }
+    assert(mrb->nb_sectors == mrb->qiov.size / BDRV_SECTOR_SIZE);

The long line is more than 80 characters, but probably more importantly,
mrb->num_reqs is a really bad estimation for niov. If the guest uses any
vectored I/O, we're sure to get reallocations. What you really want to
use here is the old mrb->qiov.niov (which is an abuse to store in the
qiov, see below).

(I also wonder whether it would make sense to keep qiovs allocated when
putting VirtIOBlockReqs into the pool, but that's not for this series.)

> +
> +#ifdef DEBUG_MULTIREQ
> +    printf("virtio_submit_multireq: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
> +           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
> +#endif
> +
> +    if (mrb->is_write) {
> +        blk_aio_writev(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
> +                       virtio_multireq_cb, mrb);
> +    } else {
> +        blk_aio_readv(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
> +                      virtio_multireq_cb, mrb);
>      }
>  
> -    mrb->num_writes = 0;
> +    block_acct_merge_done(blk_get_stats(blk),
> +                          mrb->is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
> +                          mrb->num_reqs - 1);
>  }
>  
>  static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> @@ -283,9 +330,9 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>                       BLOCK_ACCT_FLUSH);
>  
>      /*
> -     * Make sure all outstanding writes are posted to the backing device.
> +     * Make sure all outstanding requests are posted to the backing device.
>       */
> -    virtio_submit_multiwrite(req->dev->blk, mrb);
> +    virtio_submit_multireq(req->dev->blk, mrb);
>      blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
>  }
>  
> @@ -308,61 +355,8 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
>      return true;
>  }
>  
> -static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> -{
> -    BlockRequest *blkreq;
> -    uint64_t sector;
> -
> -    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
> -
> -    trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
> -
> -    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
> -        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
> -        virtio_blk_free_request(req);
> -        return;
> -    }
> -
> -    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
> -                     BLOCK_ACCT_WRITE);
> -
> -    if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) {
> -        virtio_submit_multiwrite(req->dev->blk, mrb);
> -    }
> -
> -    blkreq = &mrb->blkreq[mrb->num_writes];
> -    blkreq->sector = sector;
> -    blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
> -    blkreq->qiov = &req->qiov;
> -    blkreq->cb = virtio_blk_rw_complete;
> -    blkreq->opaque = req;
> -    blkreq->error = 0;
> -
> -    mrb->num_writes++;
> -}
> -
> -static void virtio_blk_handle_read(VirtIOBlockReq *req)
> -{
> -    uint64_t sector;
> -
> -    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
> -
> -    trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);
> -
> -    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
> -        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
> -        virtio_blk_free_request(req);
> -        return;
> -    }
> -
> -    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
> -                     BLOCK_ACCT_READ);
> -    blk_aio_readv(req->dev->blk, sector, &req->qiov,
> -                  req->qiov.size / BDRV_SECTOR_SIZE,
> -                  virtio_blk_rw_complete, req);
> -}

Handling reads and writes in a common place makes sense, but the code is
long enough that I would still keep it in a separate function, like
virtio_blk_handle_rw().

> -void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> +void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb_wr,
> +                               MultiReqBuffer *mrb_rd)
>  {
>      uint32_t type;
>      struct iovec *in_iov = req->elem.in_sg;
> @@ -397,7 +391,7 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>      type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
>  
>      if (type & VIRTIO_BLK_T_FLUSH) {
> -        virtio_blk_handle_flush(req, mrb);
> +        virtio_blk_handle_flush(req, mrb_wr);
>      } else if (type & VIRTIO_BLK_T_SCSI_CMD) {
>          virtio_blk_handle_scsi(req);
>      } else if (type & VIRTIO_BLK_T_GET_ID) {
> @@ -414,13 +408,71 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>          iov_from_buf(in_iov, in_num, 0, serial, size);
>          virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
>          virtio_blk_free_request(req);
> -    } else if (type & VIRTIO_BLK_T_OUT) {
> -        qemu_iovec_init_external(&req->qiov, iov, out_num);
> -        virtio_blk_handle_write(req, mrb);
> -    } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
> -        /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */
> -        qemu_iovec_init_external(&req->qiov, in_iov, in_num);
> -        virtio_blk_handle_read(req);
> +    } else if (type & VIRTIO_BLK_T_OUT || type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
> +        bool is_write = type & VIRTIO_BLK_T_OUT;
> +        MultiReqBuffer *mrb = is_write ? mrb_wr : mrb_rd;
> +        int64_t sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
> +        BlockDriverState *bs = blk_bs(req->dev->blk);
> +        int nb_sectors = 0;
> +        int merge = 1;
> +
> +        if (!virtio_blk_sect_range_ok(req->dev, sector_num, req->qiov.size)) {
> +            virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
> +            virtio_blk_free_request(req);
> +            return;
> +        }
> +
> +        if (is_write) {
> +            qemu_iovec_init_external(&req->qiov, iov, out_num);
> +            trace_virtio_blk_handle_write(req, sector_num, req->qiov.size / BDRV_SECTOR_SIZE);
> +        } else {
> +            qemu_iovec_init_external(&req->qiov, in_iov, in_num);
> +            trace_virtio_blk_handle_read(req, sector_num, req->qiov.size / BDRV_SECTOR_SIZE);
> +        }
> +
> +        nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
> +#ifdef DEBUG_MULTIREQ
> +        printf("virtio_blk_handle_request p %p sector_num %ld nb_sectors %d is_write %d\n",
> +               req, sector_num, nb_sectors, is_write);
> +#endif
> +
> +        block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
> +                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
> +
> +        /* merge would exceed maximum number of requests or IOVs */
> +        if (mrb->num_reqs == MAX_MERGE_REQS ||
> +            mrb->qiov.niov + req->qiov.niov + 1 > IOV_MAX) {
> +            merge = 0;
> +        }

Again with your latest patch applied on top:

You're abusing mrb->qiov.niov while gathering requests. mrb->qiov is
never initialised and you don't actually add iovs anywhere, but you just
increase qiov.niov without the associated data.

Why don't you make it in simple int instead of QEMUIOVector?

> +
> +        /* merge would exceed maximum transfer length of backend device */
> +        if (bs->bl.max_transfer_length &&
> +            mrb->nb_sectors + nb_sectors > bs->bl.max_transfer_length) {
> +            merge = 0;
> +        }
> +
> +        /* requests are not sequential */
> +        if (mrb->num_reqs && mrb->sector_num + mrb->nb_sectors != sector_num) {
> +            merge = 0;
> +        }
> +
> +        if (!merge) {
> +            virtio_submit_multireq(req->dev->blk, mrb);
> +        }
> +
> +        if (mrb->num_reqs == 0) {
> +            qemu_iovec_init(&mrb->qiov, MAX_MERGE_REQS);
> +            mrb->sector_num = sector_num;
> +            mrb->nb_sectors = 0;
> +        }
> +
> +        qemu_iovec_concat(&mrb->qiov, &req->qiov, 0, req->qiov.size);
> +        mrb->nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
> +
> +        assert(mrb->nb_sectors == mrb->qiov.size / BDRV_SECTOR_SIZE);
> +
> +        mrb->reqs[mrb->num_reqs] = req;
> +        mrb->num_reqs++;
>      } else {
>          virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
>          virtio_blk_free_request(req);

Otherwise it looks as if it could work.

Kevin

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread
  2014-12-04 14:12   ` Kevin Wolf
@ 2014-12-04 14:42     ` Peter Lieven
  2014-12-04 15:03       ` Kevin Wolf
  0 siblings, 1 reply; 11+ messages in thread
From: Peter Lieven @ 2014-12-04 14:42 UTC (permalink / raw)
  To: Kevin Wolf
  Cc: famz, benoit, ming.lei, armbru, qemu-devel, stefanha, pbonzini,
	mreitz

On 04.12.2014 15:12, Kevin Wolf wrote:
> Am 02.12.2014 um 15:33 hat Peter Lieven geschrieben:
>> this patch finally introduce multiread support to virtio-blk while
>> multiwrite support was there for a long time read support was missing.
>>
>> To achieve this the patch does serveral things which might need futher
>> explaination:
>>
>>   - the whole merge and multireq logic is moved from block.c into
>>     virtio-blk. This is move is a preparation for directly creating a
>>     coroutine out of virtio-blk.
>>
>>   - requests are only merged if they are strictly sequential and no
>>     longer sorted. This simplification decreases overhead and reduces
>>     latency. It will also merge some requests which were unmergable before.
>>
>>     The old algorithm took up to 32 requests sorted them and tried to merge
>>     them. The outcome was anything between 1 and 32 requests. In case of
>>     32 requests there were 31 requests unnecessarily delayed.
>>
>>     On the other hand lets imagine e.g. 16 unmergeable requests followed
>>     by 32 mergable requests. The latter 32 requests would have been split
>>     into two 16 byte requests.
>>
>>     Last the simplified logic allows for a fast path if we have only a
>>     single request in the multirequest. In this case the request is sent as
>>     ordinary request without mulltireq callbacks.
>>
>> As a first benchmark I installed Ubuntu 14.04.1 on a ramdisk. The number of
>> merged requests is in the same order while the latency is slightly decreased.
>> One should not stick to much to the numbers because the number of wr_requests
>> are highly fluctuant. I hope the numbers show that this patch is at least
>> not causing to big harm:
>>
>> Cmdline:
>> qemu-system-x86_64 -m 1024 -smp 2 -enable-kvm -cdrom ubuntu-14.04.1-server-amd64.iso \
>>   -drive if=virtio,file=/tmp/ubuntu.raw -monitor stdio
>>
>> Before:
>> virtio0: rd_bytes=150979072 wr_bytes=2591138816 rd_operations=18475 wr_operations=69216
>>           flush_operations=15343 wr_total_time_ns=26969283701 rd_total_time_ns=1018449432
>>           flush_total_time_ns=562596819 rd_merged=0 wr_merged=10453
>>
>> After:
>> virtio0: rd_bytes=148112896 wr_bytes=2845834240 rd_operations=17535 wr_operations=72197
>>           flush_operations=15760 wr_total_time_ns=26104971623 rd_total_time_ns=1012926283
>>           flush_total_time_ns=564414752 rd_merged=517 wr_merged=9859
>>
>> Some first numbers of improved read performance while booting:
>>
>> The Ubuntu 14.04.1 vServer from above:
>> virtio0: rd_bytes=86158336 wr_bytes=688128 rd_operations=5851 wr_operations=70
>>           flush_operations=4 wr_total_time_ns=10886705 rd_total_time_ns=416688595
>>           flush_total_time_ns=288776 rd_merged=1297 wr_merged=2
>>
>> Windows 2012R2:
>> virtio0: rd_bytes=176559104 wr_bytes=61859840 rd_operations=7200 wr_operations=360
>>           flush_operations=68 wr_total_time_ns=34344992718 rd_total_time_ns=134386844669
>>           flush_total_time_ns=18115517 rd_merged=641 wr_merged=216
>>
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>>   hw/block/dataplane/virtio-blk.c |   10 +-
>>   hw/block/virtio-blk.c           |  222 ++++++++++++++++++++++++---------------
>>   include/hw/virtio/virtio-blk.h  |   23 ++--
>>   3 files changed, 156 insertions(+), 99 deletions(-)
>>
>> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
>> index 1222a37..aa4ad91 100644
>> --- a/hw/block/dataplane/virtio-blk.c
>> +++ b/hw/block/dataplane/virtio-blk.c
>> @@ -96,9 +96,8 @@ static void handle_notify(EventNotifier *e)
>>       event_notifier_test_and_clear(&s->host_notifier);
>>       blk_io_plug(s->conf->conf.blk);
>>       for (;;) {
>> -        MultiReqBuffer mrb = {
>> -            .num_writes = 0,
>> -        };
>> +        MultiReqBuffer mrb_rd = {};
>> +        MultiReqBuffer mrb_wr = {.is_write = 1};
>>           int ret;
>>   
>>           /* Disable guest->host notifies to avoid unnecessary vmexits */
>> @@ -117,10 +116,11 @@ static void handle_notify(EventNotifier *e)
>>                                                           req->elem.in_num,
>>                                                           req->elem.index);
>>   
>> -            virtio_blk_handle_request(req, &mrb);
>> +            virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
>>           }
>>   
>> -        virtio_submit_multiwrite(s->conf->conf.blk, &mrb);
>> +        virtio_submit_multireq(s->conf->conf.blk, &mrb_wr);
>> +        virtio_submit_multireq(s->conf->conf.blk, &mrb_rd);
>>   
>>           if (likely(ret == -EAGAIN)) { /* vring emptied */
>>               /* Re-enable guest->host notifies and stop processing the vring.
>> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
>> index 490f961..f522bfc 100644
>> --- a/hw/block/virtio-blk.c
>> +++ b/hw/block/virtio-blk.c
>> @@ -22,12 +22,15 @@
>>   #include "dataplane/virtio-blk.h"
>>   #include "migration/migration.h"
>>   #include "block/scsi.h"
>> +#include "block/block_int.h"
> No. :-)
>
> We could either expose the information that you need through
> BlockBackend, or wait until your automatic request splitting logic is
> implemented in block.c and the information isn't needed here any more.

I will add sth to block.c and follow-up with the request splitting logic
later. It will still make sense to now the limit here. Otherwise we
merge something that we split again afterwards.

>
>>   #ifdef __linux__
>>   # include <scsi/sg.h>
>>   #endif
>>   #include "hw/virtio/virtio-bus.h"
>>   #include "hw/virtio/virtio-access.h"
>>   
>> +/* #define DEBUG_MULTIREQ */
>> +
>>   VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
>>   {
>>       VirtIOBlockReq *req = g_slice_new(VirtIOBlockReq);
>> @@ -88,6 +91,11 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
>>   
>>       trace_virtio_blk_rw_complete(req, ret);
>>   
>> +#ifdef DEBUG_MULTIREQ
>> +    printf("virtio_blk_rw_complete p %p ret %d\n",
>> +           req, ret);
>> +#endif
> In the final version, please use tracepoints instead of printfs.

of course. This is just my debug code.

>
>>       if (ret) {
>>           int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
>>           bool is_read = !(p & VIRTIO_BLK_T_OUT);
>> @@ -257,24 +265,63 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
>>       virtio_blk_free_request(req);
>>   }
>>   
>> -void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb)
>> +static void virtio_multireq_cb(void *opaque, int ret)
>> +{
>> +    MultiReqBuffer *mrb = opaque;
>> +    int i;
>> +#ifdef DEBUG_MULTIREQ
>> +    printf("virtio_multireq_cb: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
>> +           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
>> +#endif
>> +    for (i = 0; i < mrb->num_reqs; i++) {
>> +        virtio_blk_rw_complete(mrb->reqs[i], ret);
>> +    }
>> +
>> +    qemu_iovec_destroy(&mrb->qiov);
>> +    g_free(mrb);
>> +}
>> +
>> +void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb0)
>>   {
>> -    int i, ret;
>> +    MultiReqBuffer *mrb = NULL;
>>   
>> -    if (!mrb->num_writes) {
>> +    if (!mrb0->num_reqs) {
>>           return;
>>       }
>>   
>> -    ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes);
>> -    if (ret != 0) {
>> -        for (i = 0; i < mrb->num_writes; i++) {
>> -            if (mrb->blkreq[i].error) {
>> -                virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
>> -            }
>> +    if (mrb0->num_reqs == 1) {
>> +        if (mrb0->is_write) {
>> +            blk_aio_writev(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
>> +                           virtio_blk_rw_complete, mrb0->reqs[0]);
>> +        } else {
>> +            blk_aio_readv(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
>> +                          virtio_blk_rw_complete, mrb0->reqs[0]);
>>           }
>> +        qemu_iovec_destroy(&mrb0->qiov);
>> +        mrb0->num_reqs = 0;
>> +        return;
>> +    }
>> +
>> +    mrb = g_malloc(sizeof(MultiReqBuffer));
>> +    memcpy(mrb, mrb0, sizeof(MultiReqBuffer));
>> +    mrb0->num_reqs = 0;
> We probably want to avoid allocations as much as we can. If at all
> possible, we don't want to memcpy() either.
>
> Most parts of MultiReqBuffer don't seem to be actually needed during the
> request and in the callback any more. Essentially, all that is needed is
> the qiov and a way to find all the requests that belong to the same
> MultiReqBuffer, so that they can all be completed.
>
> We could create a linked list by having a VirtIOBlockReq *next in the
> request struct. For the qiov we can probably just use the field in the
> first request (i.e. we extend the qiov of the first request).

Thats a good idea and should work. I will look into this.
But where would you store the merge qiov?

It looks like I would need a small struct which holds the qiov and the pointer to req[0].
This can then be used as opaque value to the callback.


>
> With these requirements removed, MultiReqBuffer can live on the stack
> and doesn't need to be copied to the heap.
>
>
> With the additional patch that you send me, the next few lines read:
>
> +    qemu_iovec_init(&mrb->qiov, mrb->num_reqs);
> +
> +    for (i = 0; i < mrb->num_reqs; i++) {
> +        qemu_iovec_concat(&mrb->qiov, &mrb->reqs[i]->qiov, 0, mrb->reqs[i]->qiov.size);
> +    }
> +    assert(mrb->nb_sectors == mrb->qiov.size / BDRV_SECTOR_SIZE);
>
> The long line is more than 80 characters, but probably more importantly,
> mrb->num_reqs is a really bad estimation for niov. If the guest uses any
> vectored I/O, we're sure to get reallocations. What you really want to
> use here is the old mrb->qiov.niov (which is an abuse to store in the
> qiov, see below).

Of course, my fault. I was so happy to have an exact value for the number
of niov that I put in the wrong variable ;-)

>
> (I also wonder whether it would make sense to keep qiovs allocated when
> putting VirtIOBlockReqs into the pool, but that's not for this series.)
>
>> +
>> +#ifdef DEBUG_MULTIREQ
>> +    printf("virtio_submit_multireq: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
>> +           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
>> +#endif
>> +
>> +    if (mrb->is_write) {
>> +        blk_aio_writev(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
>> +                       virtio_multireq_cb, mrb);
>> +    } else {
>> +        blk_aio_readv(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
>> +                      virtio_multireq_cb, mrb);
>>       }
>>   
>> -    mrb->num_writes = 0;
>> +    block_acct_merge_done(blk_get_stats(blk),
>> +                          mrb->is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
>> +                          mrb->num_reqs - 1);
>>   }
>>   
>>   static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>> @@ -283,9 +330,9 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>>                        BLOCK_ACCT_FLUSH);
>>   
>>       /*
>> -     * Make sure all outstanding writes are posted to the backing device.
>> +     * Make sure all outstanding requests are posted to the backing device.
>>        */
>> -    virtio_submit_multiwrite(req->dev->blk, mrb);
>> +    virtio_submit_multireq(req->dev->blk, mrb);
>>       blk_aio_flush(req->dev->blk, virtio_blk_flush_complete, req);
>>   }
>>   
>> @@ -308,61 +355,8 @@ static bool virtio_blk_sect_range_ok(VirtIOBlock *dev,
>>       return true;
>>   }
>>   
>> -static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>> -{
>> -    BlockRequest *blkreq;
>> -    uint64_t sector;
>> -
>> -    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
>> -
>> -    trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512);
>> -
>> -    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
>> -        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
>> -        virtio_blk_free_request(req);
>> -        return;
>> -    }
>> -
>> -    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
>> -                     BLOCK_ACCT_WRITE);
>> -
>> -    if (mrb->num_writes == VIRTIO_BLK_MAX_MERGE_REQS) {
>> -        virtio_submit_multiwrite(req->dev->blk, mrb);
>> -    }
>> -
>> -    blkreq = &mrb->blkreq[mrb->num_writes];
>> -    blkreq->sector = sector;
>> -    blkreq->nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
>> -    blkreq->qiov = &req->qiov;
>> -    blkreq->cb = virtio_blk_rw_complete;
>> -    blkreq->opaque = req;
>> -    blkreq->error = 0;
>> -
>> -    mrb->num_writes++;
>> -}
>> -
>> -static void virtio_blk_handle_read(VirtIOBlockReq *req)
>> -{
>> -    uint64_t sector;
>> -
>> -    sector = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
>> -
>> -    trace_virtio_blk_handle_read(req, sector, req->qiov.size / 512);
>> -
>> -    if (!virtio_blk_sect_range_ok(req->dev, sector, req->qiov.size)) {
>> -        virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
>> -        virtio_blk_free_request(req);
>> -        return;
>> -    }
>> -
>> -    block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
>> -                     BLOCK_ACCT_READ);
>> -    blk_aio_readv(req->dev->blk, sector, &req->qiov,
>> -                  req->qiov.size / BDRV_SECTOR_SIZE,
>> -                  virtio_blk_rw_complete, req);
>> -}
> Handling reads and writes in a common place makes sense, but the code is
> long enough that I would still keep it in a separate function, like
> virtio_blk_handle_rw().

Sure

>
>> -void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>> +void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb_wr,
>> +                               MultiReqBuffer *mrb_rd)
>>   {
>>       uint32_t type;
>>       struct iovec *in_iov = req->elem.in_sg;
>> @@ -397,7 +391,7 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>>       type = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
>>   
>>       if (type & VIRTIO_BLK_T_FLUSH) {
>> -        virtio_blk_handle_flush(req, mrb);
>> +        virtio_blk_handle_flush(req, mrb_wr);
>>       } else if (type & VIRTIO_BLK_T_SCSI_CMD) {
>>           virtio_blk_handle_scsi(req);
>>       } else if (type & VIRTIO_BLK_T_GET_ID) {
>> @@ -414,13 +408,71 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
>>           iov_from_buf(in_iov, in_num, 0, serial, size);
>>           virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
>>           virtio_blk_free_request(req);
>> -    } else if (type & VIRTIO_BLK_T_OUT) {
>> -        qemu_iovec_init_external(&req->qiov, iov, out_num);
>> -        virtio_blk_handle_write(req, mrb);
>> -    } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
>> -        /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */
>> -        qemu_iovec_init_external(&req->qiov, in_iov, in_num);
>> -        virtio_blk_handle_read(req);
>> +    } else if (type & VIRTIO_BLK_T_OUT || type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
>> +        bool is_write = type & VIRTIO_BLK_T_OUT;
>> +        MultiReqBuffer *mrb = is_write ? mrb_wr : mrb_rd;
>> +        int64_t sector_num = virtio_ldq_p(VIRTIO_DEVICE(req->dev), &req->out.sector);
>> +        BlockDriverState *bs = blk_bs(req->dev->blk);
>> +        int nb_sectors = 0;
>> +        int merge = 1;
>> +
>> +        if (!virtio_blk_sect_range_ok(req->dev, sector_num, req->qiov.size)) {
>> +            virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
>> +            virtio_blk_free_request(req);
>> +            return;
>> +        }
>> +
>> +        if (is_write) {
>> +            qemu_iovec_init_external(&req->qiov, iov, out_num);
>> +            trace_virtio_blk_handle_write(req, sector_num, req->qiov.size / BDRV_SECTOR_SIZE);
>> +        } else {
>> +            qemu_iovec_init_external(&req->qiov, in_iov, in_num);
>> +            trace_virtio_blk_handle_read(req, sector_num, req->qiov.size / BDRV_SECTOR_SIZE);
>> +        }
>> +
>> +        nb_sectors = req->qiov.size / BDRV_SECTOR_SIZE;
>> +#ifdef DEBUG_MULTIREQ
>> +        printf("virtio_blk_handle_request p %p sector_num %ld nb_sectors %d is_write %d\n",
>> +               req, sector_num, nb_sectors, is_write);
>> +#endif
>> +
>> +        block_acct_start(blk_get_stats(req->dev->blk), &req->acct, req->qiov.size,
>> +                         is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
>> +
>> +        /* merge would exceed maximum number of requests or IOVs */
>> +        if (mrb->num_reqs == MAX_MERGE_REQS ||
>> +            mrb->qiov.niov + req->qiov.niov + 1 > IOV_MAX) {
>> +            merge = 0;
>> +        }
> Again with your latest patch applied on top:
>
> You're abusing mrb->qiov.niov while gathering requests. mrb->qiov is
> never initialised and you don't actually add iovs anywhere, but you just
> increase qiov.niov without the associated data.
>
> Why don't you make it in simple int instead of QEMUIOVector?

I was trying to use same struct for collecting the requests and building the
callback opaque. I will split that.

>
>> +
>> +        /* merge would exceed maximum transfer length of backend device */
>> +        if (bs->bl.max_transfer_length &&
>> +            mrb->nb_sectors + nb_sectors > bs->bl.max_transfer_length) {
>> +            merge = 0;
>> +        }
>> +
>> +        /* requests are not sequential */
>> +        if (mrb->num_reqs && mrb->sector_num + mrb->nb_sectors != sector_num) {
>> +            merge = 0;
>> +        }
>> +
>> +        if (!merge) {
>> +            virtio_submit_multireq(req->dev->blk, mrb);
>> +        }
>> +
>> +        if (mrb->num_reqs == 0) {
>> +            qemu_iovec_init(&mrb->qiov, MAX_MERGE_REQS);
>> +            mrb->sector_num = sector_num;
>> +            mrb->nb_sectors = 0;
>> +        }
>> +
>> +        qemu_iovec_concat(&mrb->qiov, &req->qiov, 0, req->qiov.size);
>> +        mrb->nb_sectors += req->qiov.size / BDRV_SECTOR_SIZE;
>> +
>> +        assert(mrb->nb_sectors == mrb->qiov.size / BDRV_SECTOR_SIZE);
>> +
>> +        mrb->reqs[mrb->num_reqs] = req;
>> +        mrb->num_reqs++;
>>       } else {
>>           virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
>>           virtio_blk_free_request(req);
> Otherwise it looks as if it could work.

Thanks for your comments,
Peter

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread
  2014-12-04 14:42     ` Peter Lieven
@ 2014-12-04 15:03       ` Kevin Wolf
  2014-12-04 15:11         ` Peter Lieven
  0 siblings, 1 reply; 11+ messages in thread
From: Kevin Wolf @ 2014-12-04 15:03 UTC (permalink / raw)
  To: Peter Lieven
  Cc: famz, benoit, ming.lei, armbru, qemu-devel, stefanha, pbonzini,
	mreitz

Am 04.12.2014 um 15:42 hat Peter Lieven geschrieben:
> On 04.12.2014 15:12, Kevin Wolf wrote:
> >Am 02.12.2014 um 15:33 hat Peter Lieven geschrieben:
> >>this patch finally introduce multiread support to virtio-blk while
> >>multiwrite support was there for a long time read support was missing.
> >>
> >>To achieve this the patch does serveral things which might need futher
> >>explaination:
> >>
> >>  - the whole merge and multireq logic is moved from block.c into
> >>    virtio-blk. This is move is a preparation for directly creating a
> >>    coroutine out of virtio-blk.
> >>
> >>  - requests are only merged if they are strictly sequential and no
> >>    longer sorted. This simplification decreases overhead and reduces
> >>    latency. It will also merge some requests which were unmergable before.
> >>
> >>    The old algorithm took up to 32 requests sorted them and tried to merge
> >>    them. The outcome was anything between 1 and 32 requests. In case of
> >>    32 requests there were 31 requests unnecessarily delayed.
> >>
> >>    On the other hand lets imagine e.g. 16 unmergeable requests followed
> >>    by 32 mergable requests. The latter 32 requests would have been split
> >>    into two 16 byte requests.
> >>
> >>    Last the simplified logic allows for a fast path if we have only a
> >>    single request in the multirequest. In this case the request is sent as
> >>    ordinary request without mulltireq callbacks.
> >>
> >>As a first benchmark I installed Ubuntu 14.04.1 on a ramdisk. The number of
> >>merged requests is in the same order while the latency is slightly decreased.
> >>One should not stick to much to the numbers because the number of wr_requests
> >>are highly fluctuant. I hope the numbers show that this patch is at least
> >>not causing to big harm:
> >>
> >>Cmdline:
> >>qemu-system-x86_64 -m 1024 -smp 2 -enable-kvm -cdrom ubuntu-14.04.1-server-amd64.iso \
> >>  -drive if=virtio,file=/tmp/ubuntu.raw -monitor stdio
> >>
> >>Before:
> >>virtio0: rd_bytes=150979072 wr_bytes=2591138816 rd_operations=18475 wr_operations=69216
> >>          flush_operations=15343 wr_total_time_ns=26969283701 rd_total_time_ns=1018449432
> >>          flush_total_time_ns=562596819 rd_merged=0 wr_merged=10453
> >>
> >>After:
> >>virtio0: rd_bytes=148112896 wr_bytes=2845834240 rd_operations=17535 wr_operations=72197
> >>          flush_operations=15760 wr_total_time_ns=26104971623 rd_total_time_ns=1012926283
> >>          flush_total_time_ns=564414752 rd_merged=517 wr_merged=9859
> >>
> >>Some first numbers of improved read performance while booting:
> >>
> >>The Ubuntu 14.04.1 vServer from above:
> >>virtio0: rd_bytes=86158336 wr_bytes=688128 rd_operations=5851 wr_operations=70
> >>          flush_operations=4 wr_total_time_ns=10886705 rd_total_time_ns=416688595
> >>          flush_total_time_ns=288776 rd_merged=1297 wr_merged=2
> >>
> >>Windows 2012R2:
> >>virtio0: rd_bytes=176559104 wr_bytes=61859840 rd_operations=7200 wr_operations=360
> >>          flush_operations=68 wr_total_time_ns=34344992718 rd_total_time_ns=134386844669
> >>          flush_total_time_ns=18115517 rd_merged=641 wr_merged=216
> >>
> >>Signed-off-by: Peter Lieven <pl@kamp.de>
> >>---
> >>  hw/block/dataplane/virtio-blk.c |   10 +-
> >>  hw/block/virtio-blk.c           |  222 ++++++++++++++++++++++++---------------
> >>  include/hw/virtio/virtio-blk.h  |   23 ++--
> >>  3 files changed, 156 insertions(+), 99 deletions(-)
> >>
> >>diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
> >>index 1222a37..aa4ad91 100644
> >>--- a/hw/block/dataplane/virtio-blk.c
> >>+++ b/hw/block/dataplane/virtio-blk.c
> >>@@ -96,9 +96,8 @@ static void handle_notify(EventNotifier *e)
> >>      event_notifier_test_and_clear(&s->host_notifier);
> >>      blk_io_plug(s->conf->conf.blk);
> >>      for (;;) {
> >>-        MultiReqBuffer mrb = {
> >>-            .num_writes = 0,
> >>-        };
> >>+        MultiReqBuffer mrb_rd = {};
> >>+        MultiReqBuffer mrb_wr = {.is_write = 1};
> >>          int ret;
> >>          /* Disable guest->host notifies to avoid unnecessary vmexits */
> >>@@ -117,10 +116,11 @@ static void handle_notify(EventNotifier *e)
> >>                                                          req->elem.in_num,
> >>                                                          req->elem.index);
> >>-            virtio_blk_handle_request(req, &mrb);
> >>+            virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
> >>          }
> >>-        virtio_submit_multiwrite(s->conf->conf.blk, &mrb);
> >>+        virtio_submit_multireq(s->conf->conf.blk, &mrb_wr);
> >>+        virtio_submit_multireq(s->conf->conf.blk, &mrb_rd);
> >>          if (likely(ret == -EAGAIN)) { /* vring emptied */
> >>              /* Re-enable guest->host notifies and stop processing the vring.
> >>diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> >>index 490f961..f522bfc 100644
> >>--- a/hw/block/virtio-blk.c
> >>+++ b/hw/block/virtio-blk.c
> >>@@ -22,12 +22,15 @@
> >>  #include "dataplane/virtio-blk.h"
> >>  #include "migration/migration.h"
> >>  #include "block/scsi.h"
> >>+#include "block/block_int.h"
> >No. :-)
> >
> >We could either expose the information that you need through
> >BlockBackend, or wait until your automatic request splitting logic is
> >implemented in block.c and the information isn't needed here any more.
> 
> I will add sth to block.c and follow-up with the request splitting logic
> later. It will still make sense to now the limit here. Otherwise we
> merge something that we split again afterwards.

Yup, but then make it an official block layer interface instead of
including block_int.h in the device emulation.

> >
> >>  #ifdef __linux__
> >>  # include <scsi/sg.h>
> >>  #endif
> >>  #include "hw/virtio/virtio-bus.h"
> >>  #include "hw/virtio/virtio-access.h"
> >>+/* #define DEBUG_MULTIREQ */
> >>+
> >>  VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
> >>  {
> >>      VirtIOBlockReq *req = g_slice_new(VirtIOBlockReq);
> >>@@ -88,6 +91,11 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
> >>      trace_virtio_blk_rw_complete(req, ret);
> >>+#ifdef DEBUG_MULTIREQ
> >>+    printf("virtio_blk_rw_complete p %p ret %d\n",
> >>+           req, ret);
> >>+#endif
> >In the final version, please use tracepoints instead of printfs.
> 
> of course. This is just my debug code.
> 
> >
> >>      if (ret) {
> >>          int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
> >>          bool is_read = !(p & VIRTIO_BLK_T_OUT);
> >>@@ -257,24 +265,63 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
> >>      virtio_blk_free_request(req);
> >>  }
> >>-void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb)
> >>+static void virtio_multireq_cb(void *opaque, int ret)
> >>+{
> >>+    MultiReqBuffer *mrb = opaque;
> >>+    int i;
> >>+#ifdef DEBUG_MULTIREQ
> >>+    printf("virtio_multireq_cb: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
> >>+           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
> >>+#endif
> >>+    for (i = 0; i < mrb->num_reqs; i++) {
> >>+        virtio_blk_rw_complete(mrb->reqs[i], ret);
> >>+    }
> >>+
> >>+    qemu_iovec_destroy(&mrb->qiov);
> >>+    g_free(mrb);
> >>+}
> >>+
> >>+void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb0)
> >>  {
> >>-    int i, ret;
> >>+    MultiReqBuffer *mrb = NULL;
> >>-    if (!mrb->num_writes) {
> >>+    if (!mrb0->num_reqs) {
> >>          return;
> >>      }
> >>-    ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes);
> >>-    if (ret != 0) {
> >>-        for (i = 0; i < mrb->num_writes; i++) {
> >>-            if (mrb->blkreq[i].error) {
> >>-                virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
> >>-            }
> >>+    if (mrb0->num_reqs == 1) {
> >>+        if (mrb0->is_write) {
> >>+            blk_aio_writev(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
> >>+                           virtio_blk_rw_complete, mrb0->reqs[0]);
> >>+        } else {
> >>+            blk_aio_readv(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
> >>+                          virtio_blk_rw_complete, mrb0->reqs[0]);
> >>          }
> >>+        qemu_iovec_destroy(&mrb0->qiov);
> >>+        mrb0->num_reqs = 0;
> >>+        return;
> >>+    }
> >>+
> >>+    mrb = g_malloc(sizeof(MultiReqBuffer));
> >>+    memcpy(mrb, mrb0, sizeof(MultiReqBuffer));
> >>+    mrb0->num_reqs = 0;
> >We probably want to avoid allocations as much as we can. If at all
> >possible, we don't want to memcpy() either.
> >
> >Most parts of MultiReqBuffer don't seem to be actually needed during the
> >request and in the callback any more. Essentially, all that is needed is
> >the qiov and a way to find all the requests that belong to the same
> >MultiReqBuffer, so that they can all be completed.
> >
> >We could create a linked list by having a VirtIOBlockReq *next in the
> >request struct. For the qiov we can probably just use the field in the
> >first request (i.e. we extend the qiov of the first request).
> 
> Thats a good idea and should work. I will look into this.
> But where would you store the merge qiov?
> 
> It looks like I would need a small struct which holds the qiov and the pointer to req[0].
> This can then be used as opaque value to the callback.

My thought was that you replace the original qiov of the first request
with the merged one. I don't think you need the original one any more
after merging the requests.  Alternatively, you could use a separate
field in VirtIOBlockReq which stays unused in all but the first request.

In any case the goal is to get rid of a dynamically allocated struct, no
matter how small, so opaque must be directly pointing to a request in
the end.

> >With these requirements removed, MultiReqBuffer can live on the stack
> >and doesn't need to be copied to the heap.
> >
> >
> >With the additional patch that you send me, the next few lines read:
> >
> >+    qemu_iovec_init(&mrb->qiov, mrb->num_reqs);
> >+
> >+    for (i = 0; i < mrb->num_reqs; i++) {
> >+        qemu_iovec_concat(&mrb->qiov, &mrb->reqs[i]->qiov, 0, mrb->reqs[i]->qiov.size);
> >+    }
> >+    assert(mrb->nb_sectors == mrb->qiov.size / BDRV_SECTOR_SIZE);
> >
> >The long line is more than 80 characters, but probably more importantly,
> >mrb->num_reqs is a really bad estimation for niov. If the guest uses any
> >vectored I/O, we're sure to get reallocations. What you really want to
> >use here is the old mrb->qiov.niov (which is an abuse to store in the
> >qiov, see below).
> 
> Of course, my fault. I was so happy to have an exact value for the number
> of niov that I put in the wrong variable ;-)

Happens. :-)

> >(I also wonder whether it would make sense to keep qiovs allocated when
> >putting VirtIOBlockReqs into the pool, but that's not for this series.)
> >
> >>+
> >>+#ifdef DEBUG_MULTIREQ
> >>+    printf("virtio_submit_multireq: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
> >>+           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
> >>+#endif
> >>+
> >>+    if (mrb->is_write) {
> >>+        blk_aio_writev(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
> >>+                       virtio_multireq_cb, mrb);
> >>+    } else {
> >>+        blk_aio_readv(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
> >>+                      virtio_multireq_cb, mrb);
> >>      }
> >>-    mrb->num_writes = 0;
> >>+    block_acct_merge_done(blk_get_stats(blk),
> >>+                          mrb->is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
> >>+                          mrb->num_reqs - 1);
> >>  }
> >>  static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)

Kevin

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread
  2014-12-04 15:03       ` Kevin Wolf
@ 2014-12-04 15:11         ` Peter Lieven
  0 siblings, 0 replies; 11+ messages in thread
From: Peter Lieven @ 2014-12-04 15:11 UTC (permalink / raw)
  To: Kevin Wolf
  Cc: famz, benoit, ming.lei, armbru, qemu-devel, stefanha, pbonzini,
	mreitz

On 04.12.2014 16:03, Kevin Wolf wrote:
> Am 04.12.2014 um 15:42 hat Peter Lieven geschrieben:
>> On 04.12.2014 15:12, Kevin Wolf wrote:
>>> Am 02.12.2014 um 15:33 hat Peter Lieven geschrieben:
>>>> this patch finally introduce multiread support to virtio-blk while
>>>> multiwrite support was there for a long time read support was missing.
>>>>
>>>> To achieve this the patch does serveral things which might need futher
>>>> explaination:
>>>>
>>>>   - the whole merge and multireq logic is moved from block.c into
>>>>     virtio-blk. This is move is a preparation for directly creating a
>>>>     coroutine out of virtio-blk.
>>>>
>>>>   - requests are only merged if they are strictly sequential and no
>>>>     longer sorted. This simplification decreases overhead and reduces
>>>>     latency. It will also merge some requests which were unmergable before.
>>>>
>>>>     The old algorithm took up to 32 requests sorted them and tried to merge
>>>>     them. The outcome was anything between 1 and 32 requests. In case of
>>>>     32 requests there were 31 requests unnecessarily delayed.
>>>>
>>>>     On the other hand lets imagine e.g. 16 unmergeable requests followed
>>>>     by 32 mergable requests. The latter 32 requests would have been split
>>>>     into two 16 byte requests.
>>>>
>>>>     Last the simplified logic allows for a fast path if we have only a
>>>>     single request in the multirequest. In this case the request is sent as
>>>>     ordinary request without mulltireq callbacks.
>>>>
>>>> As a first benchmark I installed Ubuntu 14.04.1 on a ramdisk. The number of
>>>> merged requests is in the same order while the latency is slightly decreased.
>>>> One should not stick to much to the numbers because the number of wr_requests
>>>> are highly fluctuant. I hope the numbers show that this patch is at least
>>>> not causing to big harm:
>>>>
>>>> Cmdline:
>>>> qemu-system-x86_64 -m 1024 -smp 2 -enable-kvm -cdrom ubuntu-14.04.1-server-amd64.iso \
>>>>   -drive if=virtio,file=/tmp/ubuntu.raw -monitor stdio
>>>>
>>>> Before:
>>>> virtio0: rd_bytes=150979072 wr_bytes=2591138816 rd_operations=18475 wr_operations=69216
>>>>           flush_operations=15343 wr_total_time_ns=26969283701 rd_total_time_ns=1018449432
>>>>           flush_total_time_ns=562596819 rd_merged=0 wr_merged=10453
>>>>
>>>> After:
>>>> virtio0: rd_bytes=148112896 wr_bytes=2845834240 rd_operations=17535 wr_operations=72197
>>>>           flush_operations=15760 wr_total_time_ns=26104971623 rd_total_time_ns=1012926283
>>>>           flush_total_time_ns=564414752 rd_merged=517 wr_merged=9859
>>>>
>>>> Some first numbers of improved read performance while booting:
>>>>
>>>> The Ubuntu 14.04.1 vServer from above:
>>>> virtio0: rd_bytes=86158336 wr_bytes=688128 rd_operations=5851 wr_operations=70
>>>>           flush_operations=4 wr_total_time_ns=10886705 rd_total_time_ns=416688595
>>>>           flush_total_time_ns=288776 rd_merged=1297 wr_merged=2
>>>>
>>>> Windows 2012R2:
>>>> virtio0: rd_bytes=176559104 wr_bytes=61859840 rd_operations=7200 wr_operations=360
>>>>           flush_operations=68 wr_total_time_ns=34344992718 rd_total_time_ns=134386844669
>>>>           flush_total_time_ns=18115517 rd_merged=641 wr_merged=216
>>>>
>>>> Signed-off-by: Peter Lieven <pl@kamp.de>
>>>> ---
>>>>   hw/block/dataplane/virtio-blk.c |   10 +-
>>>>   hw/block/virtio-blk.c           |  222 ++++++++++++++++++++++++---------------
>>>>   include/hw/virtio/virtio-blk.h  |   23 ++--
>>>>   3 files changed, 156 insertions(+), 99 deletions(-)
>>>>
>>>> diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
>>>> index 1222a37..aa4ad91 100644
>>>> --- a/hw/block/dataplane/virtio-blk.c
>>>> +++ b/hw/block/dataplane/virtio-blk.c
>>>> @@ -96,9 +96,8 @@ static void handle_notify(EventNotifier *e)
>>>>       event_notifier_test_and_clear(&s->host_notifier);
>>>>       blk_io_plug(s->conf->conf.blk);
>>>>       for (;;) {
>>>> -        MultiReqBuffer mrb = {
>>>> -            .num_writes = 0,
>>>> -        };
>>>> +        MultiReqBuffer mrb_rd = {};
>>>> +        MultiReqBuffer mrb_wr = {.is_write = 1};
>>>>           int ret;
>>>>           /* Disable guest->host notifies to avoid unnecessary vmexits */
>>>> @@ -117,10 +116,11 @@ static void handle_notify(EventNotifier *e)
>>>>                                                           req->elem.in_num,
>>>>                                                           req->elem.index);
>>>> -            virtio_blk_handle_request(req, &mrb);
>>>> +            virtio_blk_handle_request(req, &mrb_wr, &mrb_rd);
>>>>           }
>>>> -        virtio_submit_multiwrite(s->conf->conf.blk, &mrb);
>>>> +        virtio_submit_multireq(s->conf->conf.blk, &mrb_wr);
>>>> +        virtio_submit_multireq(s->conf->conf.blk, &mrb_rd);
>>>>           if (likely(ret == -EAGAIN)) { /* vring emptied */
>>>>               /* Re-enable guest->host notifies and stop processing the vring.
>>>> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
>>>> index 490f961..f522bfc 100644
>>>> --- a/hw/block/virtio-blk.c
>>>> +++ b/hw/block/virtio-blk.c
>>>> @@ -22,12 +22,15 @@
>>>>   #include "dataplane/virtio-blk.h"
>>>>   #include "migration/migration.h"
>>>>   #include "block/scsi.h"
>>>> +#include "block/block_int.h"
>>> No. :-)
>>>
>>> We could either expose the information that you need through
>>> BlockBackend, or wait until your automatic request splitting logic is
>>> implemented in block.c and the information isn't needed here any more.
>> I will add sth to block.c and follow-up with the request splitting logic
>> later. It will still make sense to now the limit here. Otherwise we
>> merge something that we split again afterwards.
> Yup, but then make it an official block layer interface instead of
> including block_int.h in the device emulation.
>
>>>>   #ifdef __linux__
>>>>   # include <scsi/sg.h>
>>>>   #endif
>>>>   #include "hw/virtio/virtio-bus.h"
>>>>   #include "hw/virtio/virtio-access.h"
>>>> +/* #define DEBUG_MULTIREQ */
>>>> +
>>>>   VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s)
>>>>   {
>>>>       VirtIOBlockReq *req = g_slice_new(VirtIOBlockReq);
>>>> @@ -88,6 +91,11 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
>>>>       trace_virtio_blk_rw_complete(req, ret);
>>>> +#ifdef DEBUG_MULTIREQ
>>>> +    printf("virtio_blk_rw_complete p %p ret %d\n",
>>>> +           req, ret);
>>>> +#endif
>>> In the final version, please use tracepoints instead of printfs.
>> of course. This is just my debug code.
>>
>>>>       if (ret) {
>>>>           int p = virtio_ldl_p(VIRTIO_DEVICE(req->dev), &req->out.type);
>>>>           bool is_read = !(p & VIRTIO_BLK_T_OUT);
>>>> @@ -257,24 +265,63 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req)
>>>>       virtio_blk_free_request(req);
>>>>   }
>>>> -void virtio_submit_multiwrite(BlockBackend *blk, MultiReqBuffer *mrb)
>>>> +static void virtio_multireq_cb(void *opaque, int ret)
>>>> +{
>>>> +    MultiReqBuffer *mrb = opaque;
>>>> +    int i;
>>>> +#ifdef DEBUG_MULTIREQ
>>>> +    printf("virtio_multireq_cb: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
>>>> +           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
>>>> +#endif
>>>> +    for (i = 0; i < mrb->num_reqs; i++) {
>>>> +        virtio_blk_rw_complete(mrb->reqs[i], ret);
>>>> +    }
>>>> +
>>>> +    qemu_iovec_destroy(&mrb->qiov);
>>>> +    g_free(mrb);
>>>> +}
>>>> +
>>>> +void virtio_submit_multireq(BlockBackend *blk, MultiReqBuffer *mrb0)
>>>>   {
>>>> -    int i, ret;
>>>> +    MultiReqBuffer *mrb = NULL;
>>>> -    if (!mrb->num_writes) {
>>>> +    if (!mrb0->num_reqs) {
>>>>           return;
>>>>       }
>>>> -    ret = blk_aio_multiwrite(blk, mrb->blkreq, mrb->num_writes);
>>>> -    if (ret != 0) {
>>>> -        for (i = 0; i < mrb->num_writes; i++) {
>>>> -            if (mrb->blkreq[i].error) {
>>>> -                virtio_blk_rw_complete(mrb->blkreq[i].opaque, -EIO);
>>>> -            }
>>>> +    if (mrb0->num_reqs == 1) {
>>>> +        if (mrb0->is_write) {
>>>> +            blk_aio_writev(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
>>>> +                           virtio_blk_rw_complete, mrb0->reqs[0]);
>>>> +        } else {
>>>> +            blk_aio_readv(blk, mrb0->sector_num, &mrb0->reqs[0]->qiov, mrb0->nb_sectors,
>>>> +                          virtio_blk_rw_complete, mrb0->reqs[0]);
>>>>           }
>>>> +        qemu_iovec_destroy(&mrb0->qiov);
>>>> +        mrb0->num_reqs = 0;
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    mrb = g_malloc(sizeof(MultiReqBuffer));
>>>> +    memcpy(mrb, mrb0, sizeof(MultiReqBuffer));
>>>> +    mrb0->num_reqs = 0;
>>> We probably want to avoid allocations as much as we can. If at all
>>> possible, we don't want to memcpy() either.
>>>
>>> Most parts of MultiReqBuffer don't seem to be actually needed during the
>>> request and in the callback any more. Essentially, all that is needed is
>>> the qiov and a way to find all the requests that belong to the same
>>> MultiReqBuffer, so that they can all be completed.
>>>
>>> We could create a linked list by having a VirtIOBlockReq *next in the
>>> request struct. For the qiov we can probably just use the field in the
>>> first request (i.e. we extend the qiov of the first request).
>> Thats a good idea and should work. I will look into this.
>> But where would you store the merge qiov?
>>
>> It looks like I would need a small struct which holds the qiov and the pointer to req[0].
>> This can then be used as opaque value to the callback.
> My thought was that you replace the original qiov of the first request
> with the merged one. I don't think you need the original one any more
> after merging the requests.  Alternatively, you could use a separate
> field in VirtIOBlockReq which stays unused in all but the first request.
>
> In any case the goal is to get rid of a dynamically allocated struct, no
> matter how small, so opaque must be directly pointing to a request in
> the end.

Okay, that sounds doable. The old interface does this for any write
even if it is not mergable at all.

Peter

>
>>> With these requirements removed, MultiReqBuffer can live on the stack
>>> and doesn't need to be copied to the heap.
>>>
>>>
>>> With the additional patch that you send me, the next few lines read:
>>>
>>> +    qemu_iovec_init(&mrb->qiov, mrb->num_reqs);
>>> +
>>> +    for (i = 0; i < mrb->num_reqs; i++) {
>>> +        qemu_iovec_concat(&mrb->qiov, &mrb->reqs[i]->qiov, 0, mrb->reqs[i]->qiov.size);
>>> +    }
>>> +    assert(mrb->nb_sectors == mrb->qiov.size / BDRV_SECTOR_SIZE);
>>>
>>> The long line is more than 80 characters, but probably more importantly,
>>> mrb->num_reqs is a really bad estimation for niov. If the guest uses any
>>> vectored I/O, we're sure to get reallocations. What you really want to
>>> use here is the old mrb->qiov.niov (which is an abuse to store in the
>>> qiov, see below).
>> Of course, my fault. I was so happy to have an exact value for the number
>> of niov that I put in the wrong variable ;-)
> Happens. :-)
>
>>> (I also wonder whether it would make sense to keep qiovs allocated when
>>> putting VirtIOBlockReqs into the pool, but that's not for this series.)
>>>
>>>> +
>>>> +#ifdef DEBUG_MULTIREQ
>>>> +    printf("virtio_submit_multireq: p %p sector_num %ld nb_sectors %d is_write %d num_reqs %d\n",
>>>> +           mrb, mrb->sector_num, mrb->nb_sectors, mrb->is_write, mrb->num_reqs);
>>>> +#endif
>>>> +
>>>> +    if (mrb->is_write) {
>>>> +        blk_aio_writev(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
>>>> +                       virtio_multireq_cb, mrb);
>>>> +    } else {
>>>> +        blk_aio_readv(blk, mrb->sector_num, &mrb->qiov, mrb->nb_sectors,
>>>> +                      virtio_multireq_cb, mrb);
>>>>       }
>>>> -    mrb->num_writes = 0;
>>>> +    block_acct_merge_done(blk_get_stats(blk),
>>>> +                          mrb->is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ,
>>>> +                          mrb->num_reqs - 1);
>>>>   }
>>>>   static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb)
> Kevin


-- 

Mit freundlichen Grüßen

Peter Lieven

...........................................................

   KAMP Netzwerkdienste GmbH
   Vestische Str. 89-91 | 46117 Oberhausen
   Tel: +49 (0) 208.89 402-50 | Fax: +49 (0) 208.89 402-40
   pl@kamp.de | http://www.kamp.de

   Geschäftsführer: Heiner Lante | Michael Lante
   Amtsgericht Duisburg | HRB Nr. 12154
   USt-Id-Nr.: DE 120607556

...........................................................

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2014-12-04 15:11 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-12-02 14:33 [Qemu-devel] [RFC PATCH 0/3] virtio-blk: add multiread support Peter Lieven
2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 1/3] block: add accounting for merged requests Peter Lieven
2014-12-03 17:39   ` Eric Blake
2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 2/3] hw/virtio-blk: add a constant for max number of " Peter Lieven
2014-12-02 14:33 ` [Qemu-devel] [RFC PATCH 3/3] virtio-blk: introduce multiread Peter Lieven
2014-12-03 17:25   ` Max Reitz
2014-12-04  7:16     ` Peter Lieven
2014-12-04 14:12   ` Kevin Wolf
2014-12-04 14:42     ` Peter Lieven
2014-12-04 15:03       ` Kevin Wolf
2014-12-04 15:11         ` Peter Lieven

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).