qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH 0/2] virtio-scsi: Optimizing request allocation
@ 2014-09-11 10:16 Fam Zheng
  2014-09-11 10:16 ` [Qemu-devel] [PATCH 1/2] scsi: Optimize scsi_req_alloc Fam Zheng
  2014-09-11 10:16 ` [Qemu-devel] [PATCH 2/2] virtio-scsi: Optimize virtio_scsi_init_req Fam Zheng
  0 siblings, 2 replies; 5+ messages in thread
From: Fam Zheng @ 2014-09-11 10:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini

Zeroing is relatively expensive since we have big request structures.
VirtQueueElement (> 4k) and sense_buf (256 bytes) are two points to look at.

This visibly reduces overhead of request handling when testing with the
unmerged "null" driver and virtio-scsi dataplane. Before, the issue is very
obvious with perf top:

perf top -G -p `pidof qemu-system-x86_64`
-----------------------------------------
+  16.50%  libc-2.17.so             [.] __memset_sse2
+   2.28%  libc-2.17.so             [.] _int_malloc
+   2.25%  [vdso]                   [.] 0x0000000000000cd1
+   2.02%  [kernel]                 [k] _raw_spin_lock_irqsave
+   1.97%  libpthread-2.17.so       [.] pthread_mutex_lock
+   1.87%  libpthread-2.17.so       [.] pthread_mutex_unlock
+   1.81%  [kernel]                 [k] fget_light
+   1.70%  libc-2.17.so             [.] malloc

After, the high __memset_sse2 and _int_malloc are gone:

perf top -G -p `pidof qemu-system-x86_64`
-----------------------------------------
+   4.20%  [kernel]                 [k] vcpu_enter_guest
+   3.97%  [kernel]                 [k] vmx_vcpu_run
+   2.63%  [kernel]                 [k] _raw_spin_lock_irqsave
+   1.72%  [kernel]                 [k] native_read_msr_safe
+   1.65%  [kernel]                 [k] __srcu_read_lock
+   1.64%  [kernel]                 [k] _raw_spin_unlock_irqrestore
+   1.57%  [vdso]                   [.] 0x00000000000008d8
+   1.49%  libc-2.17.so             [.] _int_malloc
+   1.29%  libpthread-2.17.so       [.] pthread_mutex_unlock
+   1.26%  [kernel]                 [k] native_write_msr_safe

See the commit message of patch 2 for some fio test data.

Thanks,
Fam

Fam Zheng (2):
  scsi: Optimize scsi_req_alloc
  virtio-scsi: Optimize virtio_scsi_init_req

 hw/scsi/scsi-bus.c              |  7 +++++--
 hw/scsi/virtio-scsi.c           | 17 ++++++++++-------
 include/hw/scsi/scsi.h          | 21 ++++++++++++++-------
 include/hw/virtio/virtio-scsi.h |  1 +
 4 files changed, 30 insertions(+), 16 deletions(-)

-- 
1.9.3

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [Qemu-devel] [PATCH 1/2] scsi: Optimize scsi_req_alloc
  2014-09-11 10:16 [Qemu-devel] [PATCH 0/2] virtio-scsi: Optimizing request allocation Fam Zheng
@ 2014-09-11 10:16 ` Fam Zheng
  2014-09-11 10:52   ` Paolo Bonzini
  2014-09-11 10:16 ` [Qemu-devel] [PATCH 2/2] virtio-scsi: Optimize virtio_scsi_init_req Fam Zheng
  1 sibling, 1 reply; 5+ messages in thread
From: Fam Zheng @ 2014-09-11 10:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini

Zeroing sense buffer for each scsi request is not efficient, we can just
leave it uninitialized because sense_len is set to 0.

Move the implicitly zeroed fields to the end of the structure and use a
partial memset.

Also change g_malloc0 to g_slice_alloc.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 hw/scsi/scsi-bus.c     |  7 +++++--
 include/hw/scsi/scsi.h | 21 ++++++++++++++-------
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 954c607..71b45c0 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -551,8 +551,11 @@ SCSIRequest *scsi_req_alloc(const SCSIReqOps *reqops, SCSIDevice *d,
     SCSIRequest *req;
     SCSIBus *bus = scsi_bus_from_device(d);
     BusState *qbus = BUS(bus);
+    const int memset_off = offsetof(SCSIRequest, sense)
+                           + sizeof(req->sense);
 
-    req = g_malloc0(reqops->size);
+    req = g_slice_alloc(reqops->size);
+    memset((uint8_t *)req + memset_off, 0, reqops->size - memset_off);
     req->refcount = 1;
     req->bus = bus;
     req->dev = d;
@@ -1603,7 +1606,7 @@ void scsi_req_unref(SCSIRequest *req)
         }
         object_unref(OBJECT(req->dev));
         object_unref(OBJECT(qbus->parent));
-        g_free(req);
+        g_slice_free1(req->ops->size, req);
     }
 }
 
diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index 2e3a8f9..889f659 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -50,17 +50,24 @@ struct SCSIRequest {
     uint32_t          tag;
     uint32_t          lun;
     uint32_t          status;
+    uint32_t          sense_len;
+
+    /* Note:
+     * - fields before sense are initialized by scsi_req_alloc;
+     * - sense[] is uninitialized;
+     * - fields after sense are memset to 0 by scsi_req_alloc.
+     * */
+
+    uint8_t           sense[SCSI_SENSE_BUF_SIZE];
+    bool              enqueued;
+    bool              io_canceled;
+    bool              retry;
+    bool              dma_started;
+    void              *hba_private;
     size_t            resid;
     SCSICommand       cmd;
     BlockDriverAIOCB  *aiocb;
     QEMUSGList        *sg;
-    bool              dma_started;
-    uint8_t sense[SCSI_SENSE_BUF_SIZE];
-    uint32_t sense_len;
-    bool enqueued;
-    bool io_canceled;
-    bool retry;
-    void *hba_private;
     QTAILQ_ENTRY(SCSIRequest) next;
 };
 
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [Qemu-devel] [PATCH 2/2] virtio-scsi: Optimize virtio_scsi_init_req
  2014-09-11 10:16 [Qemu-devel] [PATCH 0/2] virtio-scsi: Optimizing request allocation Fam Zheng
  2014-09-11 10:16 ` [Qemu-devel] [PATCH 1/2] scsi: Optimize scsi_req_alloc Fam Zheng
@ 2014-09-11 10:16 ` Fam Zheng
  2014-09-11 10:55   ` Paolo Bonzini
  1 sibling, 1 reply; 5+ messages in thread
From: Fam Zheng @ 2014-09-11 10:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Paolo Bonzini

The VirtQueueElement is a very big structure (> 4k); since it will be
initialized by virtqueue_pop, we can save the expensive zeroing here.

This saves a few nanoseconds per request in my test:

[fio-test]      rw         bs         iodepth    jobs       bw         iops       latency
--------------------------------------------------------------------------------------------
Before          read       4k         1          1          110        28269      34
After           read       4k         1          1          131        33745      28

virtio-blk      read       4k         1          1          217        55673      16

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 hw/scsi/virtio-scsi.c           | 17 ++++++++++-------
 include/hw/virtio/virtio-scsi.h |  1 +
 2 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 86aba88..0792529 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -24,12 +24,12 @@
 typedef struct VirtIOSCSIReq {
     VirtIOSCSI *dev;
     VirtQueue *vq;
-    VirtQueueElement elem;
     QEMUSGList qsgl;
+    QEMUIOVector resp_iov;
+    VirtQueueElement elem;
     SCSIRequest *sreq;
     size_t resp_size;
     enum SCSIXferMode mode;
-    QEMUIOVector resp_iov;
     union {
         VirtIOSCSICmdResp     cmd;
         VirtIOSCSICtrlTMFResp tmf;
@@ -44,6 +44,7 @@ typedef struct VirtIOSCSIReq {
         VirtIOSCSICtrlTMFReq  tmf;
         VirtIOSCSICtrlANReq   an;
     } req;
+    uint8_t cdb[VIRTIO_SCSI_CDB_SIZE_MAX];
 } VirtIOSCSIReq;
 
 QEMU_BUILD_BUG_ON(offsetof(VirtIOSCSIReq, req.cdb) !=
@@ -68,15 +69,16 @@ static inline SCSIDevice *virtio_scsi_device_find(VirtIOSCSI *s, uint8_t *lun)
 static VirtIOSCSIReq *virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq)
 {
     VirtIOSCSIReq *req;
-    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
-
-    req = g_malloc0(sizeof(*req) + vs->cdb_size);
+    const size_t zero_skip = offsetof(VirtIOSCSIReq, elem)
+                             + sizeof(VirtQueueElement);
 
+    req = g_slice_new(VirtIOSCSIReq);
     req->vq = vq;
     req->dev = s;
     req->sreq = NULL;
     qemu_sglist_init(&req->qsgl, DEVICE(s), 8, &address_space_memory);
     qemu_iovec_init(&req->resp_iov, 1);
+    memset((uint8_t *)req + zero_skip, 0, sizeof(*req) - zero_skip);
     return req;
 }
 
@@ -84,7 +86,7 @@ static void virtio_scsi_free_req(VirtIOSCSIReq *req)
 {
     qemu_iovec_destroy(&req->resp_iov);
     qemu_sglist_destroy(&req->qsgl);
-    g_free(req);
+    g_slice_free(VirtIOSCSIReq, req);
 }
 
 static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
@@ -532,7 +534,8 @@ static void virtio_scsi_set_config(VirtIODevice *vdev,
     VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
 
     if ((uint32_t) virtio_ldl_p(vdev, &scsiconf->sense_size) >= 65536 ||
-        (uint32_t) virtio_ldl_p(vdev, &scsiconf->cdb_size) >= 256) {
+        (uint32_t) virtio_ldl_p(vdev, &scsiconf->cdb_size)
+                   >= VIRTIO_SCSI_CDB_SIZE_MAX) {
         error_report("bad data written to virtio-scsi configuration space");
         exit(1);
     }
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 188a2d9..6e876f4 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -37,6 +37,7 @@
 
 #define VIRTIO_SCSI_VQ_SIZE     128
 #define VIRTIO_SCSI_CDB_SIZE    32
+#define VIRTIO_SCSI_CDB_SIZE_MAX 256
 #define VIRTIO_SCSI_SENSE_SIZE  96
 #define VIRTIO_SCSI_MAX_CHANNEL 0
 #define VIRTIO_SCSI_MAX_TARGET  255
-- 
1.9.3

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [Qemu-devel] [PATCH 1/2] scsi: Optimize scsi_req_alloc
  2014-09-11 10:16 ` [Qemu-devel] [PATCH 1/2] scsi: Optimize scsi_req_alloc Fam Zheng
@ 2014-09-11 10:52   ` Paolo Bonzini
  0 siblings, 0 replies; 5+ messages in thread
From: Paolo Bonzini @ 2014-09-11 10:52 UTC (permalink / raw)
  To: Fam Zheng, qemu-devel

Just a few nits...

Il 11/09/2014 12:16, Fam Zheng ha scritto:
> +    uint32_t          sense_len;

sense_len is initialized to zero, might as well do that via memset.

> +    uint8_t           sense[SCSI_SENSE_BUF_SIZE];
> +    bool              enqueued;
> +    bool              io_canceled;
> +    bool              retry;
> +    bool              dma_started;
> +    void              *hba_private;

hba_private is always initialized by scsi_req_alloc.

>      size_t            resid;
>      SCSICommand       cmd;

resid and cmd are initialized by scsi_req_new (all calls to
scsi_req_alloc happen in scsi_req_new, possibly via scsi_device_alloc_req).

>      BlockDriverAIOCB  *aiocb;
>      QEMUSGList        *sg;
> -    bool              dma_started;
> -    uint8_t sense[SCSI_SENSE_BUF_SIZE];
> -    uint32_t sense_len;
> -    bool enqueued;
> -    bool io_canceled;
> -    bool retry;
> -    void *hba_private;
>      QTAILQ_ENTRY(SCSIRequest) next;

Not sure if next needs to be initialized for the QTAILQ functions to
work, but it's definitely safer this way.

Thanks,

Paolo

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2] virtio-scsi: Optimize virtio_scsi_init_req
  2014-09-11 10:16 ` [Qemu-devel] [PATCH 2/2] virtio-scsi: Optimize virtio_scsi_init_req Fam Zheng
@ 2014-09-11 10:55   ` Paolo Bonzini
  0 siblings, 0 replies; 5+ messages in thread
From: Paolo Bonzini @ 2014-09-11 10:55 UTC (permalink / raw)
  To: Fam Zheng, qemu-devel

Similar nits to patch 1, but a good patch nevertheless!

Il 11/09/2014 12:16, Fam Zheng ha scritto:
> The VirtQueueElement is a very big structure (> 4k); since it will be
> initialized by virtqueue_pop, we can save the expensive zeroing here.
> 
> This saves a few nanoseconds per request in my test:
> 
> [fio-test]      rw         bs         iodepth    jobs       bw         iops       latency
> --------------------------------------------------------------------------------------------
> Before          read       4k         1          1          110        28269      34
> After           read       4k         1          1          131        33745      28
> 
> virtio-blk      read       4k         1          1          217        55673      16
> 
> Signed-off-by: Fam Zheng <famz@redhat.com>
> ---
>  hw/scsi/virtio-scsi.c           | 17 ++++++++++-------
>  include/hw/virtio/virtio-scsi.h |  1 +
>  2 files changed, 11 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
> index 86aba88..0792529 100644
> --- a/hw/scsi/virtio-scsi.c
> +++ b/hw/scsi/virtio-scsi.c
> @@ -24,12 +24,12 @@
>  typedef struct VirtIOSCSIReq {
>      VirtIOSCSI *dev;
>      VirtQueue *vq;
> -    VirtQueueElement elem;
>      QEMUSGList qsgl;
> +    QEMUIOVector resp_iov;
> +    VirtQueueElement elem;

Needs a comment where the zeroed section begins.

>      SCSIRequest *sreq;
>      size_t resp_size;
>      enum SCSIXferMode mode;
> -    QEMUIOVector resp_iov;
>      union {
>          VirtIOSCSICmdResp     cmd;
>          VirtIOSCSICtrlTMFResp tmf;
> @@ -44,6 +44,7 @@ typedef struct VirtIOSCSIReq {
>          VirtIOSCSICtrlTMFReq  tmf;
>          VirtIOSCSICtrlANReq   an;
>      } req;
> +    uint8_t cdb[VIRTIO_SCSI_CDB_SIZE_MAX];
>  } VirtIOSCSIReq;
>  
>  QEMU_BUILD_BUG_ON(offsetof(VirtIOSCSIReq, req.cdb) !=
> @@ -68,15 +69,16 @@ static inline SCSIDevice *virtio_scsi_device_find(VirtIOSCSI *s, uint8_t *lun)
>  static VirtIOSCSIReq *virtio_scsi_init_req(VirtIOSCSI *s, VirtQueue *vq)
>  {
>      VirtIOSCSIReq *req;
> -    VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
> -
> -    req = g_malloc0(sizeof(*req) + vs->cdb_size);
> +    const size_t zero_skip = offsetof(VirtIOSCSIReq, elem)
> +                             + sizeof(VirtQueueElement);
>  
> +    req = g_slice_new(VirtIOSCSIReq);

I would use g_slice_alloc here, and avoid zeroing the largeish cdb field.

>      req->vq = vq;
>      req->dev = s;
>      req->sreq = NULL;

This NULL initialization can be removed.

Paolo

>      qemu_sglist_init(&req->qsgl, DEVICE(s), 8, &address_space_memory);
>      qemu_iovec_init(&req->resp_iov, 1);
> +    memset((uint8_t *)req + zero_skip, 0, sizeof(*req) - zero_skip);
>      return req;
>  }
>  
> @@ -84,7 +86,7 @@ static void virtio_scsi_free_req(VirtIOSCSIReq *req)
>  {
>      qemu_iovec_destroy(&req->resp_iov);
>      qemu_sglist_destroy(&req->qsgl);
> -    g_free(req);
> +    g_slice_free(VirtIOSCSIReq, req);
>  }
>  
>  static void virtio_scsi_complete_req(VirtIOSCSIReq *req)
> @@ -532,7 +534,8 @@ static void virtio_scsi_set_config(VirtIODevice *vdev,
>      VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(vdev);
>  
>      if ((uint32_t) virtio_ldl_p(vdev, &scsiconf->sense_size) >= 65536 ||
> -        (uint32_t) virtio_ldl_p(vdev, &scsiconf->cdb_size) >= 256) {
> +        (uint32_t) virtio_ldl_p(vdev, &scsiconf->cdb_size)
> +                   >= VIRTIO_SCSI_CDB_SIZE_MAX) {
>          error_report("bad data written to virtio-scsi configuration space");
>          exit(1);
>      }
> diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
> index 188a2d9..6e876f4 100644
> --- a/include/hw/virtio/virtio-scsi.h
> +++ b/include/hw/virtio/virtio-scsi.h
> @@ -37,6 +37,7 @@
>  
>  #define VIRTIO_SCSI_VQ_SIZE     128
>  #define VIRTIO_SCSI_CDB_SIZE    32
> +#define VIRTIO_SCSI_CDB_SIZE_MAX 256
>  #define VIRTIO_SCSI_SENSE_SIZE  96
>  #define VIRTIO_SCSI_MAX_CHANNEL 0
>  #define VIRTIO_SCSI_MAX_TARGET  255
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2014-09-11 10:56 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2014-09-11 10:16 [Qemu-devel] [PATCH 0/2] virtio-scsi: Optimizing request allocation Fam Zheng
2014-09-11 10:16 ` [Qemu-devel] [PATCH 1/2] scsi: Optimize scsi_req_alloc Fam Zheng
2014-09-11 10:52   ` Paolo Bonzini
2014-09-11 10:16 ` [Qemu-devel] [PATCH 2/2] virtio-scsi: Optimize virtio_scsi_init_req Fam Zheng
2014-09-11 10:55   ` Paolo Bonzini

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).