[Qemu-devel] [PULL 26/31] scsi-block: always use SG_IO

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Paolo Bonzini <pbonzini@redhat.com>
To: qemu-devel@nongnu.org
Subject: [Qemu-devel] [PULL 26/31] scsi-block: always use SG_IO
Date: Fri, 27 May 2016 12:06:39 +0200	[thread overview]
Message-ID: <1464343604-517-27-git-send-email-pbonzini@redhat.com> (raw)
In-Reply-To: <1464343604-517-1-git-send-email-pbonzini@redhat.com>

Using pread/pwrite or io_submit has the advantage of eliminating the
bounce buffer, but drops the SCSI status.  This keeps the guest from
seeing unit attention codes, as well as statuses such as RESERVATION
CONFLICT.  Because we know scsi-block operates on an SBC device we can
still use the DMA helpers with SG_IO; just remember to patch the CDBs
if the transfer is split into multiple segments.

This means that scsi-block will always use the thread-pool unfortunately,
instead of respecting aio=native.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 hw/scsi/scsi-disk.c | 214 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 196 insertions(+), 18 deletions(-)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 1095263..df49164 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -83,6 +83,7 @@ typedef struct SCSIDiskReq {
     struct iovec iov;
     QEMUIOVector qiov;
     BlockAcctCookie acct;
+    unsigned char *status;
 } SCSIDiskReq;
 
 #define SCSI_DISK_F_REMOVABLE             0
@@ -190,6 +191,15 @@ static bool scsi_disk_req_check_error(SCSIDiskReq *r, int ret, bool acct_failed)
         return scsi_handle_rw_error(r, -ret, acct_failed);
     }
 
+    if (r->status && *r->status) {
+        if (acct_failed) {
+            SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+            block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
+        }
+        scsi_req_complete(&r->req, *r->status);
+        return true;
+    }
+
     return false;
 }
 
@@ -2555,16 +2565,145 @@ static void scsi_block_realize(SCSIDevice *dev, Error **errp)
     scsi_generic_read_device_identification(&s->qdev);
 }
 
+typedef struct SCSIBlockReq {
+    SCSIDiskReq req;
+    sg_io_hdr_t io_header;
+
+    /* Selected bytes of the original CDB, copied into our own CDB.  */
+    uint8_t cmd, cdb1, group_number;
+
+    /* CDB passed to SG_IO.  */
+    uint8_t cdb[16];
+} SCSIBlockReq;
+
+static BlockAIOCB *scsi_block_do_sgio(SCSIBlockReq *req,
+                                      int64_t offset, QEMUIOVector *iov,
+                                      int direction,
+                                      BlockCompletionFunc *cb, void *opaque)
+{
+    sg_io_hdr_t *io_header = &req->io_header;
+    SCSIDiskReq *r = &req->req;
+    SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+    int nb_logical_blocks;
+    uint64_t lba;
+    BlockAIOCB *aiocb;
+
+    /* This is not supported yet.  It can only happen if the guest does
+     * reads and writes that are not aligned to one logical sectors
+     * _and_ cover multiple MemoryRegions.
+     */
+    assert(offset % s->qdev.blocksize == 0);
+    assert(iov->size % s->qdev.blocksize == 0);
+
+    io_header->interface_id = 'S';
+
+    /* The data transfer comes from the QEMUIOVector.  */
+    io_header->dxfer_direction = direction;
+    io_header->dxfer_len = iov->size;
+    io_header->dxferp = (void *)iov->iov;
+    io_header->iovec_count = iov->niov;
+    assert(io_header->iovec_count == iov->niov); /* no overflow! */
+
+    /* Build a new CDB with the LBA and length patched in, in case
+     * DMA helpers split the transfer in multiple segments.  Do not
+     * build a CDB smaller than what the guest wanted, and only build
+     * a larger one if strictly necessary.
+     */
+    io_header->cmdp = req->cdb;
+    lba = offset / s->qdev.blocksize;
+    nb_logical_blocks = io_header->dxfer_len / s->qdev.blocksize;
+
+    if ((req->cmd >> 5) == 0 && lba <= 0x1ffff) {
+        /* 6-byte CDB */
+        stl_be_p(&req->cdb[0], lba | (req->cmd << 24));
+        req->cdb[4] = nb_logical_blocks;
+        req->cdb[5] = 0;
+        io_header->cmd_len = 6;
+    } else if ((req->cmd >> 5) <= 1 && lba <= 0xffffffffULL) {
+        /* 10-byte CDB */
+        req->cdb[0] = (req->cmd & 0x1f) | 0x20;
+        req->cdb[1] = req->cdb1;
+        stl_be_p(&req->cdb[2], lba);
+        req->cdb[6] = req->group_number;
+        stw_be_p(&req->cdb[7], nb_logical_blocks);
+        req->cdb[9] = 0;
+        io_header->cmd_len = 10;
+    } else if ((req->cmd >> 5) != 4 && lba <= 0xffffffffULL) {
+        /* 12-byte CDB */
+        req->cdb[0] = (req->cmd & 0x1f) | 0xA0;
+        req->cdb[1] = req->cdb1;
+        stl_be_p(&req->cdb[2], lba);
+        stl_be_p(&req->cdb[6], nb_logical_blocks);
+        req->cdb[10] = req->group_number;
+        req->cdb[11] = 0;
+        io_header->cmd_len = 12;
+    } else {
+        /* 16-byte CDB */
+        req->cdb[0] = (req->cmd & 0x1f) | 0x80;
+        req->cdb[1] = req->cdb1;
+        stq_be_p(&req->cdb[2], lba);
+        stl_be_p(&req->cdb[10], nb_logical_blocks);
+        req->cdb[14] = req->group_number;
+        req->cdb[15] = 0;
+        io_header->cmd_len = 16;
+    }
+
+    /* The rest is as in scsi-generic.c.  */
+    io_header->mx_sb_len = sizeof(r->req.sense);
+    io_header->sbp = r->req.sense;
+    io_header->timeout = UINT_MAX;
+    io_header->usr_ptr = r;
+    io_header->flags |= SG_FLAG_DIRECT_IO;
+
+    aiocb = blk_aio_ioctl(s->qdev.conf.blk, SG_IO, io_header, cb, opaque);
+    assert(aiocb != NULL);
+    return aiocb;
+}
+
+static bool scsi_block_no_fua(SCSICommand *cmd)
+{
+    return false;
+}
+
+static BlockAIOCB *scsi_block_dma_readv(int64_t offset,
+                                        QEMUIOVector *iov,
+                                        BlockCompletionFunc *cb, void *cb_opaque,
+                                        void *opaque)
+{
+    SCSIBlockReq *r = opaque;
+    return scsi_block_do_sgio(r, offset, iov,
+                              SG_DXFER_FROM_DEV, cb, cb_opaque);
+}
+
+static BlockAIOCB *scsi_block_dma_writev(int64_t offset,
+                                         QEMUIOVector *iov,
+                                         BlockCompletionFunc *cb, void *cb_opaque,
+                                         void *opaque)
+{
+    SCSIBlockReq *r = opaque;
+    return scsi_block_do_sgio(r, offset, iov,
+                              SG_DXFER_TO_DEV, cb, cb_opaque);
+}
+
 static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
 {
     switch (buf[0]) {
+    case VERIFY_10:
+    case VERIFY_12:
+    case VERIFY_16:
+        /* Check if BYTCHK == 0x01 (data-out buffer contains data
+         * for the number of logical blocks specified in the length
+         * field).  For other modes, do not use scatter/gather operation.
+         */
+        if ((buf[1] & 6) != 2) {
+            return false;
+        }
+        break;
+
     case READ_6:
     case READ_10:
     case READ_12:
     case READ_16:
-    case VERIFY_10:
-    case VERIFY_12:
-    case VERIFY_16:
     case WRITE_6:
     case WRITE_10:
     case WRITE_12:
@@ -2572,21 +2711,8 @@ static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
     case WRITE_VERIFY_10:
     case WRITE_VERIFY_12:
     case WRITE_VERIFY_16:
-        /* If we are not using O_DIRECT, we might read stale data from the
-         * host cache if writes were made using other commands than these
-         * ones (such as WRITE SAME or EXTENDED COPY, etc.).  So, without
-         * O_DIRECT everything must go through SG_IO.
-         */
-        if (!(blk_get_flags(s->qdev.conf.blk) & BDRV_O_NOCACHE)) {
-            break;
-        }
-
-        /* MMC writing cannot be done via pread/pwrite, because it sometimes
+        /* MMC writing cannot be done via DMA helpers, because it sometimes
          * involves writing beyond the maximum LBA or to negative LBA (lead-in).
-         * And once you do these writes, reading from the block device is
-         * unreliable, too.  It is even possible that reads deliver random data
-         * from the host page cache (this is probably a Linux bug).
-         *
          * We might use scsi_disk_dma_reqops as long as no writing commands are
          * seen, but performance usually isn't paramount on optical media.  So,
          * just make scsi-block operate the same as scsi-generic for them.
@@ -2604,6 +2730,54 @@ static bool scsi_block_is_passthrough(SCSIDiskState *s, uint8_t *buf)
 }
 
 
+static int32_t scsi_block_dma_command(SCSIRequest *req, uint8_t *buf)
+{
+    SCSIBlockReq *r = (SCSIBlockReq *)req;
+    r->cmd = req->cmd.buf[0];
+    switch (r->cmd >> 5) {
+    case 0:
+        /* 6-byte CDB.  */
+        r->cdb1 = r->group_number = 0;
+        break;
+    case 1:
+        /* 10-byte CDB.  */
+        r->cdb1 = req->cmd.buf[1];
+        r->group_number = req->cmd.buf[6];
+    case 4:
+        /* 12-byte CDB.  */
+        r->cdb1 = req->cmd.buf[1];
+        r->group_number = req->cmd.buf[10];
+        break;
+    case 5:
+        /* 16-byte CDB.  */
+        r->cdb1 = req->cmd.buf[1];
+        r->group_number = req->cmd.buf[14];
+        break;
+    default:
+        abort();
+    }
+
+    if (r->cdb1 & 0xe0) {
+        /* Protection information is not supported.  */
+        scsi_check_condition(&r->req, SENSE_CODE(INVALID_FIELD));
+        return 0;
+    }
+
+    r->req.status = &r->io_header.status;
+    return scsi_disk_dma_command(req, buf);
+}
+
+static const SCSIReqOps scsi_block_dma_reqops = {
+    .size         = sizeof(SCSIBlockReq),
+    .free_req     = scsi_free_request,
+    .send_command = scsi_block_dma_command,
+    .read_data    = scsi_read_data,
+    .write_data   = scsi_write_data,
+    .get_buf      = scsi_get_buf,
+    .load_request = scsi_disk_load_request,
+    .save_request = scsi_disk_save_request,
+};
+
 static SCSIRequest *scsi_block_new_request(SCSIDevice *d, uint32_t tag,
                                            uint32_t lun, uint8_t *buf,
                                            void *hba_private)
@@ -2614,7 +2788,7 @@ static SCSIRequest *scsi_block_new_request(SCSIDevice *d, uint32_t tag,
         return scsi_req_alloc(&scsi_generic_req_ops, &s->qdev, tag, lun,
                               hba_private);
     } else {
-        return scsi_req_alloc(&scsi_disk_dma_reqops, &s->qdev, tag, lun,
+        return scsi_req_alloc(&scsi_block_dma_reqops, &s->qdev, tag, lun,
                               hba_private);
     }
 }
@@ -2770,10 +2944,14 @@ static void scsi_block_class_initfn(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
     SCSIDeviceClass *sc = SCSI_DEVICE_CLASS(klass);
+    SCSIDiskClass *sdc = SCSI_DISK_BASE_CLASS(klass);
 
     sc->realize      = scsi_block_realize;
     sc->alloc_req    = scsi_block_new_request;
     sc->parse_cdb    = scsi_block_parse_cdb;
+    sdc->dma_readv   = scsi_block_dma_readv;
+    sdc->dma_writev  = scsi_block_dma_writev;
+    sdc->need_fua_emulation = scsi_block_no_fua;
     dc->desc = "SCSI block device passthrough";
     dc->props = scsi_block_properties;
     dc->vmsd  = &vmstate_scsi_disk_state;
-- 
2.5.5

next prev parent reply	other threads:[~2016-05-27 10:07 UTC|newest]

Thread overview: 44+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-05-27 10:06 [Qemu-devel] [PULL 00/31] Misc changes for 2016-05-27 Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 01/31] Add optionrom compatible with fw_cfg DMA version Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 02/31] Revert "memory: Drop FlatRange.romd_mode" Paolo Bonzini
2016-05-27 10:51   ` Laszlo Ersek
2016-05-27 10:06 ` [Qemu-devel] [PULL 03/31] hw/char: QOM'ify escc.c Paolo Bonzini
2016-05-31 22:13   ` Mark Cave-Ayland
2016-06-01  3:06     ` xiaoqiang zhao
2016-06-01  7:04       ` Mark Cave-Ayland
2016-06-01  7:08         ` xiaoqiang zhao
2016-05-27 10:06 ` [Qemu-devel] [PULL 04/31] hw/char: QOM'ify etraxfs_ser.c Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 05/31] hw/char: QOM'ify lm32_juart.c Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 06/31] hw/char: QOM'ify lm32_uart.c Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 07/31] hw/char: QOM'ify milkymist-uart.c Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 08/31] nbd: Don't trim unrequested bytes Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 09/31] kvm_stat: Remove Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 10/31] scsi: pvscsi: check command descriptor ring buffer size (CVE-2016-4952) Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 11/31] scsi: mptsas: infinite loop while fetching requests Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 12/31] scsi: megasas: use appropriate property buffer size Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 13/31] scsi: megasas: initialise local configuration data buffer Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 14/31] scsi: megasas: check 'read_queue_head' index value Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 15/31] block/iscsi: avoid potential overflow of acb->task->cdb Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 16/31] bt: rewrite csrhci_write to avoid out-of-bounds writes Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 17/31] docs/atomics: update atomic_read/set comparison with Linux Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 18/31] atomics: emit an smp_read_barrier_depends() barrier only for Alpha and Thread Sanitizer Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 19/31] atomics: do not emit consume barrier for atomic_rcu_read Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 20/31] docs/atomics: update comparison with Linux Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 21/31] xen-hvm: ignore background I/O sections Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 22/31] scsi-disk: introduce a common base class Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 23/31] scsi-disk: introduce dma_readv and dma_writev Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 24/31] scsi-disk: add need_fua_emulation to SCSIDiskClass Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 25/31] scsi-disk: introduce scsi_disk_req_check_error Paolo Bonzini
2016-05-27 10:06 ` Paolo Bonzini [this message]
2016-05-27 10:06 ` [Qemu-devel] [PULL 27/31] scsi-generic: Merge block max xfer len in INQUIRY response Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 28/31] memory: remove qemu_get_ram_fd, qemu_set_ram_fd, qemu_ram_block_host_ptr Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 29/31] exec: remove ram_addr argument from qemu_ram_block_from_host Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 30/31] memory: split memory_region_from_host from qemu_ram_addr_from_host Paolo Bonzini
2016-05-27 10:06 ` [Qemu-devel] [PULL 31/31] exec: hide mr->ram_addr from qemu_get_ram_ptr users Paolo Bonzini
2016-05-27 12:53 ` [Qemu-devel] [PULL 00/31] Misc changes for 2016-05-27 Peter Maydell
2016-05-27 13:04   ` Peter Maydell
2016-05-27 13:38     ` Richard W.M. Jones
2016-05-27 14:04       ` Peter Maydell
2016-05-27 14:07         ` Paolo Bonzini
2016-05-27 14:06       ` Paolo Bonzini
2016-05-27 14:11         ` Richard W.M. Jones

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:1095263 dfblob:df49164 )
 OR (
bs:"[Qemu-devel] [PULL 26/31] scsi-block: always use SG_IO" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1464343604-517-27-git-send-email-pbonzini@redhat.com \
    --to=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).