All of lore.kernel.org
 help / color / mirror / Atom feed
From: mlin@kernel.org (Ming Lin)
Subject: [PATCH -qemu] nvme: support Google vendor extension
Date: Tue, 17 Nov 2015 21:47:04 -0800	[thread overview]
Message-ID: <1447825624-17011-3-git-send-email-mlin@kernel.org> (raw)
In-Reply-To: <1447825624-17011-1-git-send-email-mlin@kernel.org>

From: Mihai Rusu <dizzy@google.com>

This implements the device side for an NVMe vendor extension that
reduces the number of MMIO writes, which can result in a very large
performance benefit in virtualized environments.

See the following link for a description of the mechanism and the
kernel NVMe driver changes to support this vendor extension:
http://lists.infradead.org/pipermail/linux-nvme/2014-July/001076.html

On my workstation (3.2 GHz Xeon E5-1650), running QEMU:
$ bin/opt/native/x86_64-softmmu/qemu-system-x86_64 \
    -enable-kvm -m 2048 -smp 4 \
    -drive if=virtio,file=debian.raw,cache=none \
    -drive file=nvme.raw,if=none,id=nvme-dev \
    -device nvme,drive=nvme-dev,serial=nvme-serial

Using "fio":
vm # fio -time_based --name=benchmark --ioengine=libaio --iodepth=32 \
    --numjobs=1 --runtime=30 --blocksize=4k --filename=/dev/nvme0n1 \
    --nrfiles=1 --invalidate=1 --verify=0 --direct=1 --rw=randread

I get about 20k IOPS with the original code and about 85k IOPS with
the vendor extension changes applied (and running a 3.14-based guest
kernel that supports the vendor extension).

Signed-off-by: Mihai Rusu <dizzy at google.com>
[fixed for merging into a different tree; added VID/DID params]
Signed-off-by: Keith Busch <keith.busch at intel.com>
[mlin: port for upstream]
Signed-off-by: Ming Lin <mlin at kernel.org>
---
 hw/block/nvme.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 hw/block/nvme.h | 18 +++++++++++
 2 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 169e4fa..3e1c38d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -20,6 +20,7 @@
  *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>
  */
 
+#include <exec/memory.h>
 #include <hw/block/block.h>
 #include <hw/hw.h>
 #include <hw/pci/msix.h>
@@ -158,6 +159,14 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     return NVME_SUCCESS;
 }
 
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+    if (cq->db_addr) {
+        pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr,
+                     &cq->head, sizeof(cq->head));
+    }
+}
+
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -168,6 +177,8 @@ static void nvme_post_cqes(void *opaque)
         NvmeSQueue *sq;
         hwaddr addr;
 
+        nvme_update_cq_head(cq);
+
         if (nvme_cq_full(cq)) {
             break;
         }
@@ -350,6 +361,8 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
     }
     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+    sq->db_addr = 0;
+    sq->eventidx_addr = 0;
 
     assert(n->cq[cqid]);
     cq = n->cq[cqid];
@@ -430,6 +443,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     cq->head = cq->tail = 0;
     QTAILQ_INIT(&cq->req_list);
     QTAILQ_INIT(&cq->sq_list);
+    cq->db_addr = 0;
+    cq->eventidx_addr = 0;
     msix_vector_use(&n->parent_obj, cq->vector);
     n->cq[cqid] = cq;
     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -528,6 +543,40 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
+{
+    uint64_t db_addr = le64_to_cpu(cmd->prp1);
+    uint64_t eventidx_addr = le64_to_cpu(cmd->prp2);
+    int i;
+
+    /* Addresses should not be NULL and should be page aligned. */
+    if (db_addr == 0 || db_addr & (n->page_size - 1) ||
+        eventidx_addr == 0 || eventidx_addr & (n->page_size - 1)) {
+        return NVME_INVALID_MEMORY_ADDRESS | NVME_DNR;
+    }
+
+    /* This assumes all I/O queues are created before this command is handled.
+     * We skip the admin queues. */
+    for (i = 1; i < n->num_queues; i++) {
+        NvmeSQueue *sq = n->sq[i];
+        NvmeCQueue *cq = n->cq[i];
+
+        if (sq != NULL) {
+            /* Submission queue tail pointer location, 2 * QID * stride. */
+            sq->db_addr = db_addr + 2 * i * 4;
+            sq->eventidx_addr = eventidx_addr + 2 * i * 4;
+        }
+
+        if (cq != NULL) {
+            /* Completion queue head pointer location, (2 * QID + 1) * stride.
+             */
+            cq->db_addr = db_addr + (2 * i + 1) * 4;
+            cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
+        }
+    }
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
     switch (cmd->opcode) {
@@ -545,11 +594,29 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         return nvme_set_feature(n, cmd, req);
     case NVME_ADM_CMD_GET_FEATURES:
         return nvme_get_feature(n, cmd, req);
+    case NVME_ADM_CMD_SET_DB_MEMORY:
+        return nvme_set_db_memory(n, cmd);
     default:
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
 
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+    if (sq->eventidx_addr) {
+        pci_dma_write(&sq->ctrl->parent_obj, sq->eventidx_addr,
+                      &sq->tail, sizeof(sq->tail));
+    }
+}
+
+static void nvme_update_sq_tail(NvmeSQueue *sq)
+{
+    if (sq->db_addr) {
+        pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr,
+                     &sq->tail, sizeof(sq->tail));
+    }
+}
+
 static void nvme_process_sq(void *opaque)
 {
     NvmeSQueue *sq = opaque;
@@ -561,6 +628,8 @@ static void nvme_process_sq(void *opaque)
     NvmeCmd cmd;
     NvmeRequest *req;
 
+    nvme_update_sq_tail(sq);
+
     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
         addr = sq->dma_addr + sq->head * n->sqe_size;
         pci_dma_read(&n->parent_obj, addr, (void *)&cmd, sizeof(cmd));
@@ -578,6 +647,9 @@ static void nvme_process_sq(void *opaque)
             req->status = status;
             nvme_enqueue_req_completion(cq, req);
         }
+
+        nvme_update_sq_eventidx(sq);
+        nvme_update_sq_tail(sq);
     }
 }
 
@@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
         }
 
         start_sqs = nvme_cq_full(cq) ? 1 : 0;
-        cq->head = new_head;
+        /* When the mapped pointer memory area is setup, we don't rely on
+         * the MMIO written values to update the head pointer. */
+        if (!cq->db_addr) {
+            cq->head = new_head;
+        }
         if (start_sqs) {
             NvmeSQueue *sq;
             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
@@ -752,7 +828,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
             return;
         }
 
-        sq->tail = new_tail;
+        /* When the mapped pointer memory area is setup, we don't rely on
+         * the MMIO written values to update the tail pointer. */
+        if (!sq->db_addr) {
+            sq->tail = new_tail;
+        }
         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
     }
 }
@@ -805,6 +885,8 @@ static int nvme_init(PCIDevice *pci_dev)
     pci_conf = pci_dev->config;
     pci_conf[PCI_INTERRUPT_PIN] = 1;
     pci_config_set_prog_interface(pci_dev->config, 0x2);
+    pci_config_set_vendor_id(pci_dev->config, n->vid);
+    pci_config_set_device_id(pci_dev->config, n->did);
     pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
     pcie_endpoint_cap_init(&n->parent_obj, 0x80);
 
@@ -885,9 +967,13 @@ static void nvme_exit(PCIDevice *pci_dev)
     msix_uninit_exclusive_bar(pci_dev);
 }
 
+#define PCI_VENDOR_ID_GOOGLE 0x1AE0
+
 static Property nvme_props[] = {
     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
     DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
+    DEFINE_PROP_UINT16("vid", NvmeCtrl, vid, PCI_VENDOR_ID_GOOGLE),
+    DEFINE_PROP_UINT16("did", NvmeCtrl, did, 0x5845),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -905,8 +991,6 @@ static void nvme_class_init(ObjectClass *oc, void *data)
     pc->exit = nvme_exit;
     pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
     pc->vendor_id = PCI_VENDOR_ID_INTEL;
-    pc->device_id = 0x5845;
-    pc->revision = 1;
     pc->is_express = 1;
 
     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index bf3a3cc..82aeab4 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -170,6 +170,7 @@ enum NvmeAdminCommands {
     NVME_ADM_CMD_FORMAT_NVM     = 0x80,
     NVME_ADM_CMD_SECURITY_SEND  = 0x81,
     NVME_ADM_CMD_SECURITY_RECV  = 0x82,
+    NVME_ADM_CMD_SET_DB_MEMORY  = 0xC0,  /* Vendor specific. */
 };
 
 enum NvmeIoCommands {
@@ -381,6 +382,7 @@ enum NvmeStatusCodes {
     NVME_CONFLICTING_ATTRS      = 0x0180,
     NVME_INVALID_PROT_INFO      = 0x0181,
     NVME_WRITE_TO_RO            = 0x0182,
+    NVME_INVALID_MEMORY_ADDRESS = 0x01C0,  /* Vendor extension. */
     NVME_WRITE_FAULT            = 0x0280,
     NVME_UNRECOVERED_READ       = 0x0281,
     NVME_E2E_GUARD_ERROR        = 0x0282,
@@ -658,6 +660,13 @@ typedef struct NvmeSQueue {
     QTAILQ_HEAD(sq_req_list, NvmeRequest) req_list;
     QTAILQ_HEAD(out_req_list, NvmeRequest) out_req_list;
     QTAILQ_ENTRY(NvmeSQueue) entry;
+    /* Mapped memory location where the tail pointer is stored by the guest
+     * without triggering MMIO exits. */
+    uint64_t    db_addr;
+    /* virtio-like eventidx pointer, guest updates to the tail pointer that
+     * do not go over this value will not result in MMIO writes (but will
+     * still write the tail pointer to the "db_addr" location above). */
+    uint64_t    eventidx_addr;
 } NvmeSQueue;
 
 typedef struct NvmeCQueue {
@@ -673,6 +682,13 @@ typedef struct NvmeCQueue {
     QEMUTimer   *timer;
     QTAILQ_HEAD(sq_list, NvmeSQueue) sq_list;
     QTAILQ_HEAD(cq_req_list, NvmeRequest) req_list;
+    /* Mapped memory location where the head pointer is stored by the guest
+     * without triggering MMIO exits. */
+    uint64_t    db_addr;
+    /* virtio-like eventidx pointer, guest updates to the head pointer that
+     * do not go over this value will not result in MMIO writes (but will
+     * still write the head pointer to the "db_addr" location above). */
+    uint64_t    eventidx_addr;
 } NvmeCQueue;
 
 typedef struct NvmeNamespace {
@@ -699,6 +715,8 @@ typedef struct NvmeCtrl {
     uint32_t    num_queues;
     uint32_t    max_q_ents;
     uint64_t    ns_size;
+    uint16_t    vid;
+    uint16_t    did;
 
     char            *serial;
     NvmeNamespace   *namespaces;
-- 
1.9.1

WARNING: multiple messages have this Message-ID (diff)
From: Ming Lin <mlin@kernel.org>
To: linux-nvme@lists.infradead.org, qemu-devel@nongnu.org
Cc: fes@google.com, keith.busch@intel.com, tytso@mit.edu,
	virtualization@lists.linux-foundation.org, axboe@fb.com,
	Rob Nelson <rlnelson@google.com>, Christoph Hellwig <hch@lst.de>,
	Mihai Rusu <dizzy@google.com>
Subject: [PATCH -qemu] nvme: support Google vendor extension
Date: Tue, 17 Nov 2015 21:47:04 -0800	[thread overview]
Message-ID: <1447825624-17011-3-git-send-email-mlin@kernel.org> (raw)
In-Reply-To: <1447825624-17011-1-git-send-email-mlin@kernel.org>

From: Mihai Rusu <dizzy@google.com>

This implements the device side for an NVMe vendor extension that
reduces the number of MMIO writes, which can result in a very large
performance benefit in virtualized environments.

See the following link for a description of the mechanism and the
kernel NVMe driver changes to support this vendor extension:
http://lists.infradead.org/pipermail/linux-nvme/2014-July/001076.html

On my workstation (3.2 GHz Xeon E5-1650), running QEMU:
$ bin/opt/native/x86_64-softmmu/qemu-system-x86_64 \
    -enable-kvm -m 2048 -smp 4 \
    -drive if=virtio,file=debian.raw,cache=none \
    -drive file=nvme.raw,if=none,id=nvme-dev \
    -device nvme,drive=nvme-dev,serial=nvme-serial

Using "fio":
vm # fio -time_based --name=benchmark --ioengine=libaio --iodepth=32 \
    --numjobs=1 --runtime=30 --blocksize=4k --filename=/dev/nvme0n1 \
    --nrfiles=1 --invalidate=1 --verify=0 --direct=1 --rw=randread

I get about 20k IOPS with the original code and about 85k IOPS with
the vendor extension changes applied (and running a 3.14-based guest
kernel that supports the vendor extension).

Signed-off-by: Mihai Rusu <dizzy@google.com>
[fixed for merging into a different tree; added VID/DID params]
Signed-off-by: Keith Busch <keith.busch@intel.com>
[mlin: port for upstream]
Signed-off-by: Ming Lin <mlin@kernel.org>
---
 hw/block/nvme.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 hw/block/nvme.h | 18 +++++++++++
 2 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 169e4fa..3e1c38d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -20,6 +20,7 @@
  *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>
  */
 
+#include <exec/memory.h>
 #include <hw/block/block.h>
 #include <hw/hw.h>
 #include <hw/pci/msix.h>
@@ -158,6 +159,14 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     return NVME_SUCCESS;
 }
 
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+    if (cq->db_addr) {
+        pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr,
+                     &cq->head, sizeof(cq->head));
+    }
+}
+
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -168,6 +177,8 @@ static void nvme_post_cqes(void *opaque)
         NvmeSQueue *sq;
         hwaddr addr;
 
+        nvme_update_cq_head(cq);
+
         if (nvme_cq_full(cq)) {
             break;
         }
@@ -350,6 +361,8 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
     }
     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+    sq->db_addr = 0;
+    sq->eventidx_addr = 0;
 
     assert(n->cq[cqid]);
     cq = n->cq[cqid];
@@ -430,6 +443,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     cq->head = cq->tail = 0;
     QTAILQ_INIT(&cq->req_list);
     QTAILQ_INIT(&cq->sq_list);
+    cq->db_addr = 0;
+    cq->eventidx_addr = 0;
     msix_vector_use(&n->parent_obj, cq->vector);
     n->cq[cqid] = cq;
     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -528,6 +543,40 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
+{
+    uint64_t db_addr = le64_to_cpu(cmd->prp1);
+    uint64_t eventidx_addr = le64_to_cpu(cmd->prp2);
+    int i;
+
+    /* Addresses should not be NULL and should be page aligned. */
+    if (db_addr == 0 || db_addr & (n->page_size - 1) ||
+        eventidx_addr == 0 || eventidx_addr & (n->page_size - 1)) {
+        return NVME_INVALID_MEMORY_ADDRESS | NVME_DNR;
+    }
+
+    /* This assumes all I/O queues are created before this command is handled.
+     * We skip the admin queues. */
+    for (i = 1; i < n->num_queues; i++) {
+        NvmeSQueue *sq = n->sq[i];
+        NvmeCQueue *cq = n->cq[i];
+
+        if (sq != NULL) {
+            /* Submission queue tail pointer location, 2 * QID * stride. */
+            sq->db_addr = db_addr + 2 * i * 4;
+            sq->eventidx_addr = eventidx_addr + 2 * i * 4;
+        }
+
+        if (cq != NULL) {
+            /* Completion queue head pointer location, (2 * QID + 1) * stride.
+             */
+            cq->db_addr = db_addr + (2 * i + 1) * 4;
+            cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
+        }
+    }
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
     switch (cmd->opcode) {
@@ -545,11 +594,29 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         return nvme_set_feature(n, cmd, req);
     case NVME_ADM_CMD_GET_FEATURES:
         return nvme_get_feature(n, cmd, req);
+    case NVME_ADM_CMD_SET_DB_MEMORY:
+        return nvme_set_db_memory(n, cmd);
     default:
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
 
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+    if (sq->eventidx_addr) {
+        pci_dma_write(&sq->ctrl->parent_obj, sq->eventidx_addr,
+                      &sq->tail, sizeof(sq->tail));
+    }
+}
+
+static void nvme_update_sq_tail(NvmeSQueue *sq)
+{
+    if (sq->db_addr) {
+        pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr,
+                     &sq->tail, sizeof(sq->tail));
+    }
+}
+
 static void nvme_process_sq(void *opaque)
 {
     NvmeSQueue *sq = opaque;
@@ -561,6 +628,8 @@ static void nvme_process_sq(void *opaque)
     NvmeCmd cmd;
     NvmeRequest *req;
 
+    nvme_update_sq_tail(sq);
+
     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
         addr = sq->dma_addr + sq->head * n->sqe_size;
         pci_dma_read(&n->parent_obj, addr, (void *)&cmd, sizeof(cmd));
@@ -578,6 +647,9 @@ static void nvme_process_sq(void *opaque)
             req->status = status;
             nvme_enqueue_req_completion(cq, req);
         }
+
+        nvme_update_sq_eventidx(sq);
+        nvme_update_sq_tail(sq);
     }
 }
 
@@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
         }
 
         start_sqs = nvme_cq_full(cq) ? 1 : 0;
-        cq->head = new_head;
+        /* When the mapped pointer memory area is setup, we don't rely on
+         * the MMIO written values to update the head pointer. */
+        if (!cq->db_addr) {
+            cq->head = new_head;
+        }
         if (start_sqs) {
             NvmeSQueue *sq;
             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
@@ -752,7 +828,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
             return;
         }
 
-        sq->tail = new_tail;
+        /* When the mapped pointer memory area is setup, we don't rely on
+         * the MMIO written values to update the tail pointer. */
+        if (!sq->db_addr) {
+            sq->tail = new_tail;
+        }
         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
     }
 }
@@ -805,6 +885,8 @@ static int nvme_init(PCIDevice *pci_dev)
     pci_conf = pci_dev->config;
     pci_conf[PCI_INTERRUPT_PIN] = 1;
     pci_config_set_prog_interface(pci_dev->config, 0x2);
+    pci_config_set_vendor_id(pci_dev->config, n->vid);
+    pci_config_set_device_id(pci_dev->config, n->did);
     pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
     pcie_endpoint_cap_init(&n->parent_obj, 0x80);
 
@@ -885,9 +967,13 @@ static void nvme_exit(PCIDevice *pci_dev)
     msix_uninit_exclusive_bar(pci_dev);
 }
 
+#define PCI_VENDOR_ID_GOOGLE 0x1AE0
+
 static Property nvme_props[] = {
     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
     DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
+    DEFINE_PROP_UINT16("vid", NvmeCtrl, vid, PCI_VENDOR_ID_GOOGLE),
+    DEFINE_PROP_UINT16("did", NvmeCtrl, did, 0x5845),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -905,8 +991,6 @@ static void nvme_class_init(ObjectClass *oc, void *data)
     pc->exit = nvme_exit;
     pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
     pc->vendor_id = PCI_VENDOR_ID_INTEL;
-    pc->device_id = 0x5845;
-    pc->revision = 1;
     pc->is_express = 1;
 
     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index bf3a3cc..82aeab4 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -170,6 +170,7 @@ enum NvmeAdminCommands {
     NVME_ADM_CMD_FORMAT_NVM     = 0x80,
     NVME_ADM_CMD_SECURITY_SEND  = 0x81,
     NVME_ADM_CMD_SECURITY_RECV  = 0x82,
+    NVME_ADM_CMD_SET_DB_MEMORY  = 0xC0,  /* Vendor specific. */
 };
 
 enum NvmeIoCommands {
@@ -381,6 +382,7 @@ enum NvmeStatusCodes {
     NVME_CONFLICTING_ATTRS      = 0x0180,
     NVME_INVALID_PROT_INFO      = 0x0181,
     NVME_WRITE_TO_RO            = 0x0182,
+    NVME_INVALID_MEMORY_ADDRESS = 0x01C0,  /* Vendor extension. */
     NVME_WRITE_FAULT            = 0x0280,
     NVME_UNRECOVERED_READ       = 0x0281,
     NVME_E2E_GUARD_ERROR        = 0x0282,
@@ -658,6 +660,13 @@ typedef struct NvmeSQueue {
     QTAILQ_HEAD(sq_req_list, NvmeRequest) req_list;
     QTAILQ_HEAD(out_req_list, NvmeRequest) out_req_list;
     QTAILQ_ENTRY(NvmeSQueue) entry;
+    /* Mapped memory location where the tail pointer is stored by the guest
+     * without triggering MMIO exits. */
+    uint64_t    db_addr;
+    /* virtio-like eventidx pointer, guest updates to the tail pointer that
+     * do not go over this value will not result in MMIO writes (but will
+     * still write the tail pointer to the "db_addr" location above). */
+    uint64_t    eventidx_addr;
 } NvmeSQueue;
 
 typedef struct NvmeCQueue {
@@ -673,6 +682,13 @@ typedef struct NvmeCQueue {
     QEMUTimer   *timer;
     QTAILQ_HEAD(sq_list, NvmeSQueue) sq_list;
     QTAILQ_HEAD(cq_req_list, NvmeRequest) req_list;
+    /* Mapped memory location where the head pointer is stored by the guest
+     * without triggering MMIO exits. */
+    uint64_t    db_addr;
+    /* virtio-like eventidx pointer, guest updates to the head pointer that
+     * do not go over this value will not result in MMIO writes (but will
+     * still write the head pointer to the "db_addr" location above). */
+    uint64_t    eventidx_addr;
 } NvmeCQueue;
 
 typedef struct NvmeNamespace {
@@ -699,6 +715,8 @@ typedef struct NvmeCtrl {
     uint32_t    num_queues;
     uint32_t    max_q_ents;
     uint64_t    ns_size;
+    uint16_t    vid;
+    uint16_t    did;
 
     char            *serial;
     NvmeNamespace   *namespaces;
-- 
1.9.1

WARNING: multiple messages have this Message-ID (diff)
From: Ming Lin <mlin@kernel.org>
To: linux-nvme@lists.infradead.org, qemu-devel@nongnu.org
Cc: fes@google.com, keith.busch@intel.com, tytso@mit.edu,
	nab@linux-iscsi.org, virtualization@lists.linux-foundation.org,
	axboe@fb.com, digitaleric@google.com,
	Rob Nelson <rlnelson@google.com>, Christoph Hellwig <hch@lst.de>,
	Mihai Rusu <dizzy@google.com>
Subject: [Qemu-devel] [PATCH -qemu] nvme: support Google vendor extension
Date: Tue, 17 Nov 2015 21:47:04 -0800	[thread overview]
Message-ID: <1447825624-17011-3-git-send-email-mlin@kernel.org> (raw)
In-Reply-To: <1447825624-17011-1-git-send-email-mlin@kernel.org>

From: Mihai Rusu <dizzy@google.com>

This implements the device side for an NVMe vendor extension that
reduces the number of MMIO writes, which can result in a very large
performance benefit in virtualized environments.

See the following link for a description of the mechanism and the
kernel NVMe driver changes to support this vendor extension:
http://lists.infradead.org/pipermail/linux-nvme/2014-July/001076.html

On my workstation (3.2 GHz Xeon E5-1650), running QEMU:
$ bin/opt/native/x86_64-softmmu/qemu-system-x86_64 \
    -enable-kvm -m 2048 -smp 4 \
    -drive if=virtio,file=debian.raw,cache=none \
    -drive file=nvme.raw,if=none,id=nvme-dev \
    -device nvme,drive=nvme-dev,serial=nvme-serial

Using "fio":
vm # fio -time_based --name=benchmark --ioengine=libaio --iodepth=32 \
    --numjobs=1 --runtime=30 --blocksize=4k --filename=/dev/nvme0n1 \
    --nrfiles=1 --invalidate=1 --verify=0 --direct=1 --rw=randread

I get about 20k IOPS with the original code and about 85k IOPS with
the vendor extension changes applied (and running a 3.14-based guest
kernel that supports the vendor extension).

Signed-off-by: Mihai Rusu <dizzy@google.com>
[fixed for merging into a different tree; added VID/DID params]
Signed-off-by: Keith Busch <keith.busch@intel.com>
[mlin: port for upstream]
Signed-off-by: Ming Lin <mlin@kernel.org>
---
 hw/block/nvme.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 hw/block/nvme.h | 18 +++++++++++
 2 files changed, 106 insertions(+), 4 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 169e4fa..3e1c38d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -20,6 +20,7 @@
  *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>
  */
 
+#include <exec/memory.h>
 #include <hw/block/block.h>
 #include <hw/hw.h>
 #include <hw/pci/msix.h>
@@ -158,6 +159,14 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
     return NVME_SUCCESS;
 }
 
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+    if (cq->db_addr) {
+        pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr,
+                     &cq->head, sizeof(cq->head));
+    }
+}
+
 static void nvme_post_cqes(void *opaque)
 {
     NvmeCQueue *cq = opaque;
@@ -168,6 +177,8 @@ static void nvme_post_cqes(void *opaque)
         NvmeSQueue *sq;
         hwaddr addr;
 
+        nvme_update_cq_head(cq);
+
         if (nvme_cq_full(cq)) {
             break;
         }
@@ -350,6 +361,8 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
         QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
     }
     sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+    sq->db_addr = 0;
+    sq->eventidx_addr = 0;
 
     assert(n->cq[cqid]);
     cq = n->cq[cqid];
@@ -430,6 +443,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
     cq->head = cq->tail = 0;
     QTAILQ_INIT(&cq->req_list);
     QTAILQ_INIT(&cq->sq_list);
+    cq->db_addr = 0;
+    cq->eventidx_addr = 0;
     msix_vector_use(&n->parent_obj, cq->vector);
     n->cq[cqid] = cq;
     cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -528,6 +543,40 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
+{
+    uint64_t db_addr = le64_to_cpu(cmd->prp1);
+    uint64_t eventidx_addr = le64_to_cpu(cmd->prp2);
+    int i;
+
+    /* Addresses should not be NULL and should be page aligned. */
+    if (db_addr == 0 || db_addr & (n->page_size - 1) ||
+        eventidx_addr == 0 || eventidx_addr & (n->page_size - 1)) {
+        return NVME_INVALID_MEMORY_ADDRESS | NVME_DNR;
+    }
+
+    /* This assumes all I/O queues are created before this command is handled.
+     * We skip the admin queues. */
+    for (i = 1; i < n->num_queues; i++) {
+        NvmeSQueue *sq = n->sq[i];
+        NvmeCQueue *cq = n->cq[i];
+
+        if (sq != NULL) {
+            /* Submission queue tail pointer location, 2 * QID * stride. */
+            sq->db_addr = db_addr + 2 * i * 4;
+            sq->eventidx_addr = eventidx_addr + 2 * i * 4;
+        }
+
+        if (cq != NULL) {
+            /* Completion queue head pointer location, (2 * QID + 1) * stride.
+             */
+            cq->db_addr = db_addr + (2 * i + 1) * 4;
+            cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
+        }
+    }
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
     switch (cmd->opcode) {
@@ -545,11 +594,29 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         return nvme_set_feature(n, cmd, req);
     case NVME_ADM_CMD_GET_FEATURES:
         return nvme_get_feature(n, cmd, req);
+    case NVME_ADM_CMD_SET_DB_MEMORY:
+        return nvme_set_db_memory(n, cmd);
     default:
         return NVME_INVALID_OPCODE | NVME_DNR;
     }
 }
 
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+    if (sq->eventidx_addr) {
+        pci_dma_write(&sq->ctrl->parent_obj, sq->eventidx_addr,
+                      &sq->tail, sizeof(sq->tail));
+    }
+}
+
+static void nvme_update_sq_tail(NvmeSQueue *sq)
+{
+    if (sq->db_addr) {
+        pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr,
+                     &sq->tail, sizeof(sq->tail));
+    }
+}
+
 static void nvme_process_sq(void *opaque)
 {
     NvmeSQueue *sq = opaque;
@@ -561,6 +628,8 @@ static void nvme_process_sq(void *opaque)
     NvmeCmd cmd;
     NvmeRequest *req;
 
+    nvme_update_sq_tail(sq);
+
     while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
         addr = sq->dma_addr + sq->head * n->sqe_size;
         pci_dma_read(&n->parent_obj, addr, (void *)&cmd, sizeof(cmd));
@@ -578,6 +647,9 @@ static void nvme_process_sq(void *opaque)
             req->status = status;
             nvme_enqueue_req_completion(cq, req);
         }
+
+        nvme_update_sq_eventidx(sq);
+        nvme_update_sq_tail(sq);
     }
 }
 
@@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
         }
 
         start_sqs = nvme_cq_full(cq) ? 1 : 0;
-        cq->head = new_head;
+        /* When the mapped pointer memory area is setup, we don't rely on
+         * the MMIO written values to update the head pointer. */
+        if (!cq->db_addr) {
+            cq->head = new_head;
+        }
         if (start_sqs) {
             NvmeSQueue *sq;
             QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
@@ -752,7 +828,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
             return;
         }
 
-        sq->tail = new_tail;
+        /* When the mapped pointer memory area is setup, we don't rely on
+         * the MMIO written values to update the tail pointer. */
+        if (!sq->db_addr) {
+            sq->tail = new_tail;
+        }
         timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
     }
 }
@@ -805,6 +885,8 @@ static int nvme_init(PCIDevice *pci_dev)
     pci_conf = pci_dev->config;
     pci_conf[PCI_INTERRUPT_PIN] = 1;
     pci_config_set_prog_interface(pci_dev->config, 0x2);
+    pci_config_set_vendor_id(pci_dev->config, n->vid);
+    pci_config_set_device_id(pci_dev->config, n->did);
     pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
     pcie_endpoint_cap_init(&n->parent_obj, 0x80);
 
@@ -885,9 +967,13 @@ static void nvme_exit(PCIDevice *pci_dev)
     msix_uninit_exclusive_bar(pci_dev);
 }
 
+#define PCI_VENDOR_ID_GOOGLE 0x1AE0
+
 static Property nvme_props[] = {
     DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
     DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
+    DEFINE_PROP_UINT16("vid", NvmeCtrl, vid, PCI_VENDOR_ID_GOOGLE),
+    DEFINE_PROP_UINT16("did", NvmeCtrl, did, 0x5845),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -905,8 +991,6 @@ static void nvme_class_init(ObjectClass *oc, void *data)
     pc->exit = nvme_exit;
     pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
     pc->vendor_id = PCI_VENDOR_ID_INTEL;
-    pc->device_id = 0x5845;
-    pc->revision = 1;
     pc->is_express = 1;
 
     set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index bf3a3cc..82aeab4 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -170,6 +170,7 @@ enum NvmeAdminCommands {
     NVME_ADM_CMD_FORMAT_NVM     = 0x80,
     NVME_ADM_CMD_SECURITY_SEND  = 0x81,
     NVME_ADM_CMD_SECURITY_RECV  = 0x82,
+    NVME_ADM_CMD_SET_DB_MEMORY  = 0xC0,  /* Vendor specific. */
 };
 
 enum NvmeIoCommands {
@@ -381,6 +382,7 @@ enum NvmeStatusCodes {
     NVME_CONFLICTING_ATTRS      = 0x0180,
     NVME_INVALID_PROT_INFO      = 0x0181,
     NVME_WRITE_TO_RO            = 0x0182,
+    NVME_INVALID_MEMORY_ADDRESS = 0x01C0,  /* Vendor extension. */
     NVME_WRITE_FAULT            = 0x0280,
     NVME_UNRECOVERED_READ       = 0x0281,
     NVME_E2E_GUARD_ERROR        = 0x0282,
@@ -658,6 +660,13 @@ typedef struct NvmeSQueue {
     QTAILQ_HEAD(sq_req_list, NvmeRequest) req_list;
     QTAILQ_HEAD(out_req_list, NvmeRequest) out_req_list;
     QTAILQ_ENTRY(NvmeSQueue) entry;
+    /* Mapped memory location where the tail pointer is stored by the guest
+     * without triggering MMIO exits. */
+    uint64_t    db_addr;
+    /* virtio-like eventidx pointer, guest updates to the tail pointer that
+     * do not go over this value will not result in MMIO writes (but will
+     * still write the tail pointer to the "db_addr" location above). */
+    uint64_t    eventidx_addr;
 } NvmeSQueue;
 
 typedef struct NvmeCQueue {
@@ -673,6 +682,13 @@ typedef struct NvmeCQueue {
     QEMUTimer   *timer;
     QTAILQ_HEAD(sq_list, NvmeSQueue) sq_list;
     QTAILQ_HEAD(cq_req_list, NvmeRequest) req_list;
+    /* Mapped memory location where the head pointer is stored by the guest
+     * without triggering MMIO exits. */
+    uint64_t    db_addr;
+    /* virtio-like eventidx pointer, guest updates to the head pointer that
+     * do not go over this value will not result in MMIO writes (but will
+     * still write the head pointer to the "db_addr" location above). */
+    uint64_t    eventidx_addr;
 } NvmeCQueue;
 
 typedef struct NvmeNamespace {
@@ -699,6 +715,8 @@ typedef struct NvmeCtrl {
     uint32_t    num_queues;
     uint32_t    max_q_ents;
     uint64_t    ns_size;
+    uint16_t    vid;
+    uint16_t    did;
 
     char            *serial;
     NvmeNamespace   *namespaces;
-- 
1.9.1

  parent reply	other threads:[~2015-11-18  5:47 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-11-18  5:47 [RFC PATCH 0/2] Google extension to improve qemu-nvme performance Ming Lin
2015-11-18  5:47 ` [Qemu-devel] " Ming Lin
2015-11-18  5:47 ` Ming Lin
2015-11-18  5:47 ` [PATCH -kernel] nvme: improve performance for virtual NVMe devices Ming Lin
2015-11-18  5:47   ` [Qemu-devel] " Ming Lin
2015-11-18  5:47   ` Ming Lin
2015-11-18  5:47 ` Ming Lin [this message]
2015-11-18  5:47   ` [Qemu-devel] [PATCH -qemu] nvme: support Google vendor extension Ming Lin
2015-11-18  5:47   ` Ming Lin
2015-11-19 10:37   ` Paolo Bonzini
2015-11-19 10:37     ` [Qemu-devel] " Paolo Bonzini
2015-11-19 10:37     ` Paolo Bonzini
2015-11-20  8:11     ` Ming Lin
2015-11-20  8:11       ` [Qemu-devel] " Ming Lin
2015-11-20  8:11       ` Ming Lin
2015-11-20  8:58       ` Paolo Bonzini
2015-11-20  8:58         ` [Qemu-devel] " Paolo Bonzini
2015-11-20  8:58         ` Paolo Bonzini
2015-11-20 23:05         ` Ming Lin
2015-11-20 23:05           ` [Qemu-devel] " Ming Lin
2015-11-20 23:05           ` Ming Lin
2015-11-21 12:56           ` Paolo Bonzini
2015-11-21 12:56             ` [Qemu-devel] " Paolo Bonzini
2015-11-21 12:56             ` Paolo Bonzini
2015-11-22  7:45             ` Ming Lin
2015-11-22  7:45               ` [Qemu-devel] " Ming Lin
2015-11-22  7:45               ` Ming Lin
2015-11-24  6:29               ` Ming Lin
2015-11-24  6:29                 ` [Qemu-devel] " Ming Lin
2015-11-24  6:29                 ` Ming Lin
2015-11-24 11:01                 ` Paolo Bonzini
2015-11-24 11:01                   ` [Qemu-devel] " Paolo Bonzini
2015-11-24 11:01                   ` Paolo Bonzini

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1447825624-17011-3-git-send-email-mlin@kernel.org \
    --to=mlin@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.