* [Qemu-devel] [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-18 5:47 ` Ming Lin
0 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-18 5:47 UTC (permalink / raw)
To: linux-nvme, qemu-devel
Cc: fes, keith.busch, tytso, nab, virtualization, axboe, digitaleric,
Rob Nelson, Christoph Hellwig, Mihai Rusu
From: Mihai Rusu <dizzy@google.com>
This implements the device side for an NVMe vendor extension that
reduces the number of MMIO writes which can result in a very large
performance benefit in virtualized environments.
See the following link for a description of the mechanism and the
kernel NVMe driver changes to support this vendor extension:
http://lists.infradead.org/pipermail/linux-nvme/2014-July/001076.html
On my workstation (3.2Ghz Xeon E5-1650), running QEMU:
$ bin/opt/native/x86_64-softmmu/qemu-system-x86_64 \
-enable-kvm -m 2048 -smp 4 \
-drive if=virtio,file=debian.raw,cache=none \
-drive file=nvme.raw,if=none,id=nvme-dev \
-device nvme,drive=nvme-dev,serial=nvme-serial
Using "fio":
vm # fio -time_based --name=benchmark --ioengine=libaio --iodepth=32 \
--numjobs=1 --runtime=30 --blocksize=4k --filename=/dev/nvme0n1 \
--nrfiles=1 --invalidate=1 --verify=0 --direct=1 --rw=randread
I get about 20k IOPs with the original code and about 85k IOPs with
the vendor extension changes applied (and running a vendor extension
supporting 3.14 based guest kernel).
Signed-off-by: Mihai Rusu <dizzy@google.com>
[fixed for a merging into different tree; added VID/DID params]
Signed-off-by: Keith Busch <keith.busch@intel.com>
[mlin: port for upstream]
Signed-off-by: Ming Lin <mlin@kernel.org>
---
hw/block/nvme.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
hw/block/nvme.h | 18 +++++++++++
2 files changed, 106 insertions(+), 4 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 169e4fa..3e1c38d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -20,6 +20,7 @@
* -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>
*/
+#include <exec/memory.h>
#include <hw/block/block.h>
#include <hw/hw.h>
#include <hw/pci/msix.h>
@@ -158,6 +159,14 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
return NVME_SUCCESS;
}
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+ if (cq->db_addr) {
+ pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr,
+ &cq->head, sizeof(cq->head));
+ }
+}
+
static void nvme_post_cqes(void *opaque)
{
NvmeCQueue *cq = opaque;
@@ -168,6 +177,8 @@ static void nvme_post_cqes(void *opaque)
NvmeSQueue *sq;
hwaddr addr;
+ nvme_update_cq_head(cq);
+
if (nvme_cq_full(cq)) {
break;
}
@@ -350,6 +361,8 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
}
sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+ sq->db_addr = 0;
+ sq->eventidx_addr = 0;
assert(n->cq[cqid]);
cq = n->cq[cqid];
@@ -430,6 +443,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
cq->head = cq->tail = 0;
QTAILQ_INIT(&cq->req_list);
QTAILQ_INIT(&cq->sq_list);
+ cq->db_addr = 0;
+ cq->eventidx_addr = 0;
msix_vector_use(&n->parent_obj, cq->vector);
n->cq[cqid] = cq;
cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -528,6 +543,40 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
return NVME_SUCCESS;
}
+static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
+{
+ uint64_t db_addr = le64_to_cpu(cmd->prp1);
+ uint64_t eventidx_addr = le64_to_cpu(cmd->prp2);
+ int i;
+
+ /* Addresses should not be NULL and should be page aligned. */
+ if (db_addr == 0 || db_addr & (n->page_size - 1) ||
+ eventidx_addr == 0 || eventidx_addr & (n->page_size - 1)) {
+ return NVME_INVALID_MEMORY_ADDRESS | NVME_DNR;
+ }
+
+ /* This assumes all I/O queues are created before this command is handled.
+ * We skip the admin queues. */
+ for (i = 1; i < n->num_queues; i++) {
+ NvmeSQueue *sq = n->sq[i];
+ NvmeCQueue *cq = n->cq[i];
+
+ if (sq != NULL) {
+ /* Submission queue tail pointer location, 2 * QID * stride. */
+ sq->db_addr = db_addr + 2 * i * 4;
+ sq->eventidx_addr = eventidx_addr + 2 * i * 4;
+ }
+
+ if (cq != NULL) {
+ /* Completion queue head pointer location, (2 * QID + 1) * stride.
+ */
+ cq->db_addr = db_addr + (2 * i + 1) * 4;
+ cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
+ }
+ }
+ return NVME_SUCCESS;
+}
+
static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
switch (cmd->opcode) {
@@ -545,11 +594,29 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
return nvme_set_feature(n, cmd, req);
case NVME_ADM_CMD_GET_FEATURES:
return nvme_get_feature(n, cmd, req);
+ case NVME_ADM_CMD_SET_DB_MEMORY:
+ return nvme_set_db_memory(n, cmd);
default:
return NVME_INVALID_OPCODE | NVME_DNR;
}
}
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+ if (sq->eventidx_addr) {
+ pci_dma_write(&sq->ctrl->parent_obj, sq->eventidx_addr,
+ &sq->tail, sizeof(sq->tail));
+ }
+}
+
+static void nvme_update_sq_tail(NvmeSQueue *sq)
+{
+ if (sq->db_addr) {
+ pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr,
+ &sq->tail, sizeof(sq->tail));
+ }
+}
+
static void nvme_process_sq(void *opaque)
{
NvmeSQueue *sq = opaque;
@@ -561,6 +628,8 @@ static void nvme_process_sq(void *opaque)
NvmeCmd cmd;
NvmeRequest *req;
+ nvme_update_sq_tail(sq);
+
while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
addr = sq->dma_addr + sq->head * n->sqe_size;
pci_dma_read(&n->parent_obj, addr, (void *)&cmd, sizeof(cmd));
@@ -578,6 +647,9 @@ static void nvme_process_sq(void *opaque)
req->status = status;
nvme_enqueue_req_completion(cq, req);
}
+
+ nvme_update_sq_eventidx(sq);
+ nvme_update_sq_tail(sq);
}
}
@@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
}
start_sqs = nvme_cq_full(cq) ? 1 : 0;
- cq->head = new_head;
+ /* When the mapped pointer memory area is setup, we don't rely on
+ * the MMIO written values to update the head pointer. */
+ if (!cq->db_addr) {
+ cq->head = new_head;
+ }
if (start_sqs) {
NvmeSQueue *sq;
QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
@@ -752,7 +828,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
return;
}
- sq->tail = new_tail;
+ /* When the mapped pointer memory area is setup, we don't rely on
+ * the MMIO written values to update the tail pointer. */
+ if (!sq->db_addr) {
+ sq->tail = new_tail;
+ }
timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
}
@@ -805,6 +885,8 @@ static int nvme_init(PCIDevice *pci_dev)
pci_conf = pci_dev->config;
pci_conf[PCI_INTERRUPT_PIN] = 1;
pci_config_set_prog_interface(pci_dev->config, 0x2);
+ pci_config_set_vendor_id(pci_dev->config, n->vid);
+ pci_config_set_device_id(pci_dev->config, n->did);
pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
pcie_endpoint_cap_init(&n->parent_obj, 0x80);
@@ -885,9 +967,13 @@ static void nvme_exit(PCIDevice *pci_dev)
msix_uninit_exclusive_bar(pci_dev);
}
+#define PCI_VENDOR_ID_GOOGLE 0x1AE0
+
static Property nvme_props[] = {
DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
+ DEFINE_PROP_UINT16("vid", NvmeCtrl, vid, PCI_VENDOR_ID_GOOGLE),
+ DEFINE_PROP_UINT16("did", NvmeCtrl, did, 0x5845),
DEFINE_PROP_END_OF_LIST(),
};
@@ -905,8 +991,6 @@ static void nvme_class_init(ObjectClass *oc, void *data)
pc->exit = nvme_exit;
pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
pc->vendor_id = PCI_VENDOR_ID_INTEL;
- pc->device_id = 0x5845;
- pc->revision = 1;
pc->is_express = 1;
set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index bf3a3cc..82aeab4 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -170,6 +170,7 @@ enum NvmeAdminCommands {
NVME_ADM_CMD_FORMAT_NVM = 0x80,
NVME_ADM_CMD_SECURITY_SEND = 0x81,
NVME_ADM_CMD_SECURITY_RECV = 0x82,
+ NVME_ADM_CMD_SET_DB_MEMORY = 0xC0, /* Vendor specific. */
};
enum NvmeIoCommands {
@@ -381,6 +382,7 @@ enum NvmeStatusCodes {
NVME_CONFLICTING_ATTRS = 0x0180,
NVME_INVALID_PROT_INFO = 0x0181,
NVME_WRITE_TO_RO = 0x0182,
+ NVME_INVALID_MEMORY_ADDRESS = 0x01C0, /* Vendor extension. */
NVME_WRITE_FAULT = 0x0280,
NVME_UNRECOVERED_READ = 0x0281,
NVME_E2E_GUARD_ERROR = 0x0282,
@@ -658,6 +660,13 @@ typedef struct NvmeSQueue {
QTAILQ_HEAD(sq_req_list, NvmeRequest) req_list;
QTAILQ_HEAD(out_req_list, NvmeRequest) out_req_list;
QTAILQ_ENTRY(NvmeSQueue) entry;
+ /* Mapped memory location where the tail pointer is stored by the guest
+ * without triggering MMIO exits. */
+ uint64_t db_addr;
+ /* virtio-like eventidx pointer, guest updates to the tail pointer that
+ * do not go over this value will not result in MMIO writes (but will
+ * still write the tail pointer to the "db_addr" location above). */
+ uint64_t eventidx_addr;
} NvmeSQueue;
typedef struct NvmeCQueue {
@@ -673,6 +682,13 @@ typedef struct NvmeCQueue {
QEMUTimer *timer;
QTAILQ_HEAD(sq_list, NvmeSQueue) sq_list;
QTAILQ_HEAD(cq_req_list, NvmeRequest) req_list;
+ /* Mapped memory location where the head pointer is stored by the guest
+ * without triggering MMIO exits. */
+ uint64_t db_addr;
+ /* virtio-like eventidx pointer, guest updates to the head pointer that
+ * do not go over this value will not result in MMIO writes (but will
+ * still write the head pointer to the "db_addr" location above). */
+ uint64_t eventidx_addr;
} NvmeCQueue;
typedef struct NvmeNamespace {
@@ -699,6 +715,8 @@ typedef struct NvmeCtrl {
uint32_t num_queues;
uint32_t max_q_ents;
uint64_t ns_size;
+ uint16_t vid;
+ uint16_t did;
char *serial;
NvmeNamespace *namespaces;
--
1.9.1
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-18 5:47 ` Ming Lin
0 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-18 5:47 UTC (permalink / raw)
To: linux-nvme, qemu-devel
Cc: fes, keith.busch, tytso, virtualization, axboe, Rob Nelson,
Christoph Hellwig, Mihai Rusu
From: Mihai Rusu <dizzy@google.com>
This implements the device side for an NVMe vendor extension that
reduces the number of MMIO writes which can result in a very large
performance benefit in virtualized environments.
See the following link for a description of the mechanism and the
kernel NVMe driver changes to support this vendor extension:
http://lists.infradead.org/pipermail/linux-nvme/2014-July/001076.html
On my workstation (3.2Ghz Xeon E5-1650), running QEMU:
$ bin/opt/native/x86_64-softmmu/qemu-system-x86_64 \
-enable-kvm -m 2048 -smp 4 \
-drive if=virtio,file=debian.raw,cache=none \
-drive file=nvme.raw,if=none,id=nvme-dev \
-device nvme,drive=nvme-dev,serial=nvme-serial
Using "fio":
vm # fio -time_based --name=benchmark --ioengine=libaio --iodepth=32 \
--numjobs=1 --runtime=30 --blocksize=4k --filename=/dev/nvme0n1 \
--nrfiles=1 --invalidate=1 --verify=0 --direct=1 --rw=randread
I get about 20k IOPs with the original code and about 85k IOPs with
the vendor extension changes applied (and running a vendor extension
supporting 3.14 based guest kernel).
Signed-off-by: Mihai Rusu <dizzy@google.com>
[fixed for a merging into different tree; added VID/DID params]
Signed-off-by: Keith Busch <keith.busch@intel.com>
[mlin: port for upstream]
Signed-off-by: Ming Lin <mlin@kernel.org>
---
hw/block/nvme.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
hw/block/nvme.h | 18 +++++++++++
2 files changed, 106 insertions(+), 4 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 169e4fa..3e1c38d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -20,6 +20,7 @@
* -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>
*/
+#include <exec/memory.h>
#include <hw/block/block.h>
#include <hw/hw.h>
#include <hw/pci/msix.h>
@@ -158,6 +159,14 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
return NVME_SUCCESS;
}
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+ if (cq->db_addr) {
+ pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr,
+ &cq->head, sizeof(cq->head));
+ }
+}
+
static void nvme_post_cqes(void *opaque)
{
NvmeCQueue *cq = opaque;
@@ -168,6 +177,8 @@ static void nvme_post_cqes(void *opaque)
NvmeSQueue *sq;
hwaddr addr;
+ nvme_update_cq_head(cq);
+
if (nvme_cq_full(cq)) {
break;
}
@@ -350,6 +361,8 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
}
sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+ sq->db_addr = 0;
+ sq->eventidx_addr = 0;
assert(n->cq[cqid]);
cq = n->cq[cqid];
@@ -430,6 +443,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
cq->head = cq->tail = 0;
QTAILQ_INIT(&cq->req_list);
QTAILQ_INIT(&cq->sq_list);
+ cq->db_addr = 0;
+ cq->eventidx_addr = 0;
msix_vector_use(&n->parent_obj, cq->vector);
n->cq[cqid] = cq;
cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -528,6 +543,40 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
return NVME_SUCCESS;
}
+static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
+{
+ uint64_t db_addr = le64_to_cpu(cmd->prp1);
+ uint64_t eventidx_addr = le64_to_cpu(cmd->prp2);
+ int i;
+
+ /* Addresses should not be NULL and should be page aligned. */
+ if (db_addr == 0 || db_addr & (n->page_size - 1) ||
+ eventidx_addr == 0 || eventidx_addr & (n->page_size - 1)) {
+ return NVME_INVALID_MEMORY_ADDRESS | NVME_DNR;
+ }
+
+ /* This assumes all I/O queues are created before this command is handled.
+ * We skip the admin queues. */
+ for (i = 1; i < n->num_queues; i++) {
+ NvmeSQueue *sq = n->sq[i];
+ NvmeCQueue *cq = n->cq[i];
+
+ if (sq != NULL) {
+ /* Submission queue tail pointer location, 2 * QID * stride. */
+ sq->db_addr = db_addr + 2 * i * 4;
+ sq->eventidx_addr = eventidx_addr + 2 * i * 4;
+ }
+
+ if (cq != NULL) {
+ /* Completion queue head pointer location, (2 * QID + 1) * stride.
+ */
+ cq->db_addr = db_addr + (2 * i + 1) * 4;
+ cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
+ }
+ }
+ return NVME_SUCCESS;
+}
+
static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
switch (cmd->opcode) {
@@ -545,11 +594,29 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
return nvme_set_feature(n, cmd, req);
case NVME_ADM_CMD_GET_FEATURES:
return nvme_get_feature(n, cmd, req);
+ case NVME_ADM_CMD_SET_DB_MEMORY:
+ return nvme_set_db_memory(n, cmd);
default:
return NVME_INVALID_OPCODE | NVME_DNR;
}
}
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+ if (sq->eventidx_addr) {
+ pci_dma_write(&sq->ctrl->parent_obj, sq->eventidx_addr,
+ &sq->tail, sizeof(sq->tail));
+ }
+}
+
+static void nvme_update_sq_tail(NvmeSQueue *sq)
+{
+ if (sq->db_addr) {
+ pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr,
+ &sq->tail, sizeof(sq->tail));
+ }
+}
+
static void nvme_process_sq(void *opaque)
{
NvmeSQueue *sq = opaque;
@@ -561,6 +628,8 @@ static void nvme_process_sq(void *opaque)
NvmeCmd cmd;
NvmeRequest *req;
+ nvme_update_sq_tail(sq);
+
while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
addr = sq->dma_addr + sq->head * n->sqe_size;
pci_dma_read(&n->parent_obj, addr, (void *)&cmd, sizeof(cmd));
@@ -578,6 +647,9 @@ static void nvme_process_sq(void *opaque)
req->status = status;
nvme_enqueue_req_completion(cq, req);
}
+
+ nvme_update_sq_eventidx(sq);
+ nvme_update_sq_tail(sq);
}
}
@@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
}
start_sqs = nvme_cq_full(cq) ? 1 : 0;
- cq->head = new_head;
+ /* When the mapped pointer memory area is setup, we don't rely on
+ * the MMIO written values to update the head pointer. */
+ if (!cq->db_addr) {
+ cq->head = new_head;
+ }
if (start_sqs) {
NvmeSQueue *sq;
QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
@@ -752,7 +828,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
return;
}
- sq->tail = new_tail;
+ /* When the mapped pointer memory area is setup, we don't rely on
+ * the MMIO written values to update the tail pointer. */
+ if (!sq->db_addr) {
+ sq->tail = new_tail;
+ }
timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
}
@@ -805,6 +885,8 @@ static int nvme_init(PCIDevice *pci_dev)
pci_conf = pci_dev->config;
pci_conf[PCI_INTERRUPT_PIN] = 1;
pci_config_set_prog_interface(pci_dev->config, 0x2);
+ pci_config_set_vendor_id(pci_dev->config, n->vid);
+ pci_config_set_device_id(pci_dev->config, n->did);
pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
pcie_endpoint_cap_init(&n->parent_obj, 0x80);
@@ -885,9 +967,13 @@ static void nvme_exit(PCIDevice *pci_dev)
msix_uninit_exclusive_bar(pci_dev);
}
+#define PCI_VENDOR_ID_GOOGLE 0x1AE0
+
static Property nvme_props[] = {
DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
+ DEFINE_PROP_UINT16("vid", NvmeCtrl, vid, PCI_VENDOR_ID_GOOGLE),
+ DEFINE_PROP_UINT16("did", NvmeCtrl, did, 0x5845),
DEFINE_PROP_END_OF_LIST(),
};
@@ -905,8 +991,6 @@ static void nvme_class_init(ObjectClass *oc, void *data)
pc->exit = nvme_exit;
pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
pc->vendor_id = PCI_VENDOR_ID_INTEL;
- pc->device_id = 0x5845;
- pc->revision = 1;
pc->is_express = 1;
set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index bf3a3cc..82aeab4 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -170,6 +170,7 @@ enum NvmeAdminCommands {
NVME_ADM_CMD_FORMAT_NVM = 0x80,
NVME_ADM_CMD_SECURITY_SEND = 0x81,
NVME_ADM_CMD_SECURITY_RECV = 0x82,
+ NVME_ADM_CMD_SET_DB_MEMORY = 0xC0, /* Vendor specific. */
};
enum NvmeIoCommands {
@@ -381,6 +382,7 @@ enum NvmeStatusCodes {
NVME_CONFLICTING_ATTRS = 0x0180,
NVME_INVALID_PROT_INFO = 0x0181,
NVME_WRITE_TO_RO = 0x0182,
+ NVME_INVALID_MEMORY_ADDRESS = 0x01C0, /* Vendor extension. */
NVME_WRITE_FAULT = 0x0280,
NVME_UNRECOVERED_READ = 0x0281,
NVME_E2E_GUARD_ERROR = 0x0282,
@@ -658,6 +660,13 @@ typedef struct NvmeSQueue {
QTAILQ_HEAD(sq_req_list, NvmeRequest) req_list;
QTAILQ_HEAD(out_req_list, NvmeRequest) out_req_list;
QTAILQ_ENTRY(NvmeSQueue) entry;
+ /* Mapped memory location where the tail pointer is stored by the guest
+ * without triggering MMIO exits. */
+ uint64_t db_addr;
+ /* virtio-like eventidx pointer, guest updates to the tail pointer that
+ * do not go over this value will not result in MMIO writes (but will
+ * still write the tail pointer to the "db_addr" location above). */
+ uint64_t eventidx_addr;
} NvmeSQueue;
typedef struct NvmeCQueue {
@@ -673,6 +682,13 @@ typedef struct NvmeCQueue {
QEMUTimer *timer;
QTAILQ_HEAD(sq_list, NvmeSQueue) sq_list;
QTAILQ_HEAD(cq_req_list, NvmeRequest) req_list;
+ /* Mapped memory location where the head pointer is stored by the guest
+ * without triggering MMIO exits. */
+ uint64_t db_addr;
+ /* virtio-like eventidx pointer, guest updates to the head pointer that
+ * do not go over this value will not result in MMIO writes (but will
+ * still write the head pointer to the "db_addr" location above). */
+ uint64_t eventidx_addr;
} NvmeCQueue;
typedef struct NvmeNamespace {
@@ -699,6 +715,8 @@ typedef struct NvmeCtrl {
uint32_t num_queues;
uint32_t max_q_ents;
uint64_t ns_size;
+ uint16_t vid;
+ uint16_t did;
char *serial;
NvmeNamespace *namespaces;
--
1.9.1
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH -qemu] nvme: support Google vendor extension
2015-11-18 5:47 ` Ming Lin
(?)
@ 2015-11-19 10:37 ` Paolo Bonzini
-1 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-19 10:37 UTC (permalink / raw)
On 18/11/2015 06:47, Ming Lin wrote:
> @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> }
>
> start_sqs = nvme_cq_full(cq) ? 1 : 0;
> - cq->head = new_head;
> + /* When the mapped pointer memory area is setup, we don't rely on
> + * the MMIO written values to update the head pointer. */
> + if (!cq->db_addr) {
> + cq->head = new_head;
> + }
You are still checking
if (new_head >= cq->size) {
return;
}
above. I think this is incorrect when the extension is present, and
furthermore it's the only case where val is being used.
If you're not using val, you could use ioeventfd for the MMIO. An
ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
quick and dirty measurements from kvm-unit-tests's vmexit.flat
benchmark, on two very different machines:
Haswell-EP Ivy Bridge i7
MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
You would need to allocate two eventfds for each qid, one for the sq and
one for the cq. Also, processing the queues is now bounced to the QEMU
iothread, so you can probably get rid of sq->timer and cq->timer.
Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [Qemu-devel] [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-19 10:37 ` Paolo Bonzini
0 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-19 10:37 UTC (permalink / raw)
To: Ming Lin, linux-nvme, qemu-devel
Cc: fes, axboe, Rob Nelson, virtualization, keith.busch, tytso,
Christoph Hellwig, Mihai Rusu
On 18/11/2015 06:47, Ming Lin wrote:
> @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> }
>
> start_sqs = nvme_cq_full(cq) ? 1 : 0;
> - cq->head = new_head;
> + /* When the mapped pointer memory area is setup, we don't rely on
> + * the MMIO written values to update the head pointer. */
> + if (!cq->db_addr) {
> + cq->head = new_head;
> + }
You are still checking
if (new_head >= cq->size) {
return;
}
above. I think this is incorrect when the extension is present, and
furthermore it's the only case where val is being used.
If you're not using val, you could use ioeventfd for the MMIO. An
ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
quick and dirty measurements from kvm-unit-tests's vmexit.flat
benchmark, on two very different machines:
Haswell-EP Ivy Bridge i7
MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
You would need to allocate two eventfds for each qid, one for the sq and
one for the cq. Also, processing the queues is now bounced to the QEMU
iothread, so you can probably get rid of sq->timer and cq->timer.
Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-19 10:37 ` Paolo Bonzini
0 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-19 10:37 UTC (permalink / raw)
To: Ming Lin, linux-nvme, qemu-devel
Cc: fes, axboe, Rob Nelson, virtualization, keith.busch, tytso,
Christoph Hellwig, Mihai Rusu
On 18/11/2015 06:47, Ming Lin wrote:
> @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> }
>
> start_sqs = nvme_cq_full(cq) ? 1 : 0;
> - cq->head = new_head;
> + /* When the mapped pointer memory area is setup, we don't rely on
> + * the MMIO written values to update the head pointer. */
> + if (!cq->db_addr) {
> + cq->head = new_head;
> + }
You are still checking
if (new_head >= cq->size) {
return;
}
above. I think this is incorrect when the extension is present, and
furthermore it's the only case where val is being used.
If you're not using val, you could use ioeventfd for the MMIO. An
ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
quick and dirty measurements from kvm-unit-tests's vmexit.flat
benchmark, on two very different machines:
Haswell-EP Ivy Bridge i7
MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
You would need to allocate two eventfds for each qid, one for the sq and
one for the cq. Also, processing the queues is now bounced to the QEMU
iothread, so you can probably get rid of sq->timer and cq->timer.
Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread* [PATCH -qemu] nvme: support Google vendor extension
2015-11-19 10:37 ` Paolo Bonzini
(?)
@ 2015-11-20 8:11 ` Ming Lin
-1 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-20 8:11 UTC (permalink / raw)
On Thu, 2015-11-19@11:37 +0100, Paolo Bonzini wrote:
>
> On 18/11/2015 06:47, Ming Lin wrote:
> > @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> > }
> >
> > start_sqs = nvme_cq_full(cq) ? 1 : 0;
> > - cq->head = new_head;
> > + /* When the mapped pointer memory area is setup, we don't rely on
> > + * the MMIO written values to update the head pointer. */
> > + if (!cq->db_addr) {
> > + cq->head = new_head;
> > + }
>
> You are still checking
>
> if (new_head >= cq->size) {
> return;
> }
>
> above. I think this is incorrect when the extension is present, and
> furthermore it's the only case where val is being used.
>
> If you're not using val, you could use ioeventfd for the MMIO. An
> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
> quick and dirty measurements from kvm-unit-tests's vmexit.flat
> benchmark, on two very different machines:
>
> Haswell-EP Ivy Bridge i7
> MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
> I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
>
> You would need to allocate two eventfds for each qid, one for the sq and
> one for the cq. Also, processing the queues is now bounced to the QEMU
> iothread, so you can probably get rid of sq->timer and cq->timer.
Here is a quick try.
Too late now, I'll test it morning.
Do you see obvious problem?
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 3e1c38d..d28690d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -543,6 +543,44 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
return NVME_SUCCESS;
}
+static void nvme_cq_notifier(EventNotifier *e)
+{
+ NvmeCQueue *cq =
+ container_of(e, NvmeCQueue, notifier);
+
+ nvme_post_cqes(cq);
+}
+
+static void nvme_init_cq_eventfd(NvmeCQueue *cq)
+{
+ NvmeCtrl *n = cq->ctrl;
+ uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
+
+ event_notifier_init(&cq->notifier, 0);
+ event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+ memory_region_add_eventfd(&n->iomem,
+ 0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);
+}
+
+static void nvme_sq_notifier(EventNotifier *e)
+{
+ NvmeSQueue *sq =
+ container_of(e, NvmeSQueue, notifier);
+
+ nvme_process_sq(sq);
+}
+
+static void nvme_init_sq_eventfd(NvmeSQueue *sq)
+{
+ NvmeCtrl *n = sq->ctrl;
+ uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
+
+ event_notifier_init(&sq->notifier, 0);
+ event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+ memory_region_add_eventfd(&n->iomem,
+ 0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);
+}
+
static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
{
uint64_t db_addr = le64_to_cpu(cmd->prp1);
@@ -565,6 +603,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
/* Submission queue tail pointer location, 2 * QID * stride. */
sq->db_addr = db_addr + 2 * i * 4;
sq->eventidx_addr = eventidx_addr + 2 * i * 4;
+ nvme_init_sq_eventfd(sq);
}
if (cq != NULL) {
@@ -572,6 +611,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
*/
cq->db_addr = db_addr + (2 * i + 1) * 4;
cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
+ nvme_init_cq_eventfd(cq);
}
}
return NVME_SUCCESS;
@@ -793,7 +833,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
}
cq = n->cq[qid];
- if (new_head >= cq->size) {
+ if (!cq->db_addr && new_head >= cq->size) {
return;
}
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 82aeab4..608f202 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -667,6 +667,7 @@ typedef struct NvmeSQueue {
* do not go over this value will not result in MMIO writes (but will
* still write the tail pointer to the "db_addr" location above). */
uint64_t eventidx_addr;
+ EventNotifier notifier;
} NvmeSQueue;
typedef struct NvmeCQueue {
@@ -689,6 +690,7 @@ typedef struct NvmeCQueue {
* do not go over this value will not result in MMIO writes (but will
* still write the head pointer to the "db_addr" location above). */
uint64_t eventidx_addr;
+ EventNotifier notifier;
} NvmeCQueue;
typedef struct NvmeNamespace {
>
> Paolo
^ permalink raw reply related [flat|nested] 33+ messages in thread* Re: [Qemu-devel] [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-20 8:11 ` Ming Lin
0 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-20 8:11 UTC (permalink / raw)
To: Paolo Bonzini
Cc: fes, axboe, tytso, qemu-devel, linux-nvme, virtualization,
keith.busch, Rob Nelson, Christoph Hellwig, Mihai Rusu
On Thu, 2015-11-19 at 11:37 +0100, Paolo Bonzini wrote:
>
> On 18/11/2015 06:47, Ming Lin wrote:
> > @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> > }
> >
> > start_sqs = nvme_cq_full(cq) ? 1 : 0;
> > - cq->head = new_head;
> > + /* When the mapped pointer memory area is setup, we don't rely on
> > + * the MMIO written values to update the head pointer. */
> > + if (!cq->db_addr) {
> > + cq->head = new_head;
> > + }
>
> You are still checking
>
> if (new_head >= cq->size) {
> return;
> }
>
> above. I think this is incorrect when the extension is present, and
> furthermore it's the only case where val is being used.
>
> If you're not using val, you could use ioeventfd for the MMIO. An
> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
> quick and dirty measurements from kvm-unit-tests's vmexit.flat
> benchmark, on two very different machines:
>
> Haswell-EP Ivy Bridge i7
> MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
> I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
>
> You would need to allocate two eventfds for each qid, one for the sq and
> one for the cq. Also, processing the queues is now bounced to the QEMU
> iothread, so you can probably get rid of sq->timer and cq->timer.
Here is a quick try.
Too late now, I'll test it morning.
Do you see obvious problem?
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 3e1c38d..d28690d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -543,6 +543,44 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
return NVME_SUCCESS;
}
+static void nvme_cq_notifier(EventNotifier *e)
+{
+ NvmeCQueue *cq =
+ container_of(e, NvmeCQueue, notifier);
+
+ nvme_post_cqes(cq);
+}
+
+static void nvme_init_cq_eventfd(NvmeCQueue *cq)
+{
+ NvmeCtrl *n = cq->ctrl;
+ uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
+
+ event_notifier_init(&cq->notifier, 0);
+ event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+ memory_region_add_eventfd(&n->iomem,
+ 0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);
+}
+
+static void nvme_sq_notifier(EventNotifier *e)
+{
+ NvmeSQueue *sq =
+ container_of(e, NvmeSQueue, notifier);
+
+ nvme_process_sq(sq);
+}
+
+static void nvme_init_sq_eventfd(NvmeSQueue *sq)
+{
+ NvmeCtrl *n = sq->ctrl;
+ uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
+
+ event_notifier_init(&sq->notifier, 0);
+ event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+ memory_region_add_eventfd(&n->iomem,
+ 0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);
+}
+
static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
{
uint64_t db_addr = le64_to_cpu(cmd->prp1);
@@ -565,6 +603,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
/* Submission queue tail pointer location, 2 * QID * stride. */
sq->db_addr = db_addr + 2 * i * 4;
sq->eventidx_addr = eventidx_addr + 2 * i * 4;
+ nvme_init_sq_eventfd(sq);
}
if (cq != NULL) {
@@ -572,6 +611,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
*/
cq->db_addr = db_addr + (2 * i + 1) * 4;
cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
+ nvme_init_cq_eventfd(cq);
}
}
return NVME_SUCCESS;
@@ -793,7 +833,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
}
cq = n->cq[qid];
- if (new_head >= cq->size) {
+ if (!cq->db_addr && new_head >= cq->size) {
return;
}
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 82aeab4..608f202 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -667,6 +667,7 @@ typedef struct NvmeSQueue {
* do not go over this value will not result in MMIO writes (but will
* still write the tail pointer to the "db_addr" location above). */
uint64_t eventidx_addr;
+ EventNotifier notifier;
} NvmeSQueue;
typedef struct NvmeCQueue {
@@ -689,6 +690,7 @@ typedef struct NvmeCQueue {
* do not go over this value will not result in MMIO writes (but will
* still write the head pointer to the "db_addr" location above). */
uint64_t eventidx_addr;
+ EventNotifier notifier;
} NvmeCQueue;
typedef struct NvmeNamespace {
>
> Paolo
^ permalink raw reply related [flat|nested] 33+ messages in thread* Re: [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-20 8:11 ` Ming Lin
0 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-20 8:11 UTC (permalink / raw)
To: Paolo Bonzini
Cc: fes, axboe, tytso, qemu-devel, linux-nvme, virtualization,
keith.busch, Rob Nelson, Christoph Hellwig, Mihai Rusu
On Thu, 2015-11-19 at 11:37 +0100, Paolo Bonzini wrote:
>
> On 18/11/2015 06:47, Ming Lin wrote:
> > @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> > }
> >
> > start_sqs = nvme_cq_full(cq) ? 1 : 0;
> > - cq->head = new_head;
> > + /* When the mapped pointer memory area is setup, we don't rely on
> > + * the MMIO written values to update the head pointer. */
> > + if (!cq->db_addr) {
> > + cq->head = new_head;
> > + }
>
> You are still checking
>
> if (new_head >= cq->size) {
> return;
> }
>
> above. I think this is incorrect when the extension is present, and
> furthermore it's the only case where val is being used.
>
> If you're not using val, you could use ioeventfd for the MMIO. An
> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
> quick and dirty measurements from kvm-unit-tests's vmexit.flat
> benchmark, on two very different machines:
>
> Haswell-EP Ivy Bridge i7
> MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
> I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
>
> You would need to allocate two eventfds for each qid, one for the sq and
> one for the cq. Also, processing the queues is now bounced to the QEMU
> iothread, so you can probably get rid of sq->timer and cq->timer.
Here is a quick try.
Too late now, I'll test it morning.
Do you see obvious problem?
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 3e1c38d..d28690d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -543,6 +543,44 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
return NVME_SUCCESS;
}
+static void nvme_cq_notifier(EventNotifier *e)
+{
+ NvmeCQueue *cq =
+ container_of(e, NvmeCQueue, notifier);
+
+ nvme_post_cqes(cq);
+}
+
+static void nvme_init_cq_eventfd(NvmeCQueue *cq)
+{
+ NvmeCtrl *n = cq->ctrl;
+ uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
+
+ event_notifier_init(&cq->notifier, 0);
+ event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+ memory_region_add_eventfd(&n->iomem,
+ 0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);
+}
+
+static void nvme_sq_notifier(EventNotifier *e)
+{
+ NvmeSQueue *sq =
+ container_of(e, NvmeSQueue, notifier);
+
+ nvme_process_sq(sq);
+}
+
+static void nvme_init_sq_eventfd(NvmeSQueue *sq)
+{
+ NvmeCtrl *n = sq->ctrl;
+ uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
+
+ event_notifier_init(&sq->notifier, 0);
+ event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+ memory_region_add_eventfd(&n->iomem,
+ 0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);
+}
+
static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
{
uint64_t db_addr = le64_to_cpu(cmd->prp1);
@@ -565,6 +603,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
/* Submission queue tail pointer location, 2 * QID * stride. */
sq->db_addr = db_addr + 2 * i * 4;
sq->eventidx_addr = eventidx_addr + 2 * i * 4;
+ nvme_init_sq_eventfd(sq);
}
if (cq != NULL) {
@@ -572,6 +611,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
*/
cq->db_addr = db_addr + (2 * i + 1) * 4;
cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
+ nvme_init_cq_eventfd(cq);
}
}
return NVME_SUCCESS;
@@ -793,7 +833,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
}
cq = n->cq[qid];
- if (new_head >= cq->size) {
+ if (!cq->db_addr && new_head >= cq->size) {
return;
}
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 82aeab4..608f202 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -667,6 +667,7 @@ typedef struct NvmeSQueue {
* do not go over this value will not result in MMIO writes (but will
* still write the tail pointer to the "db_addr" location above). */
uint64_t eventidx_addr;
+ EventNotifier notifier;
} NvmeSQueue;
typedef struct NvmeCQueue {
@@ -689,6 +690,7 @@ typedef struct NvmeCQueue {
* do not go over this value will not result in MMIO writes (but will
* still write the head pointer to the "db_addr" location above). */
uint64_t eventidx_addr;
+ EventNotifier notifier;
} NvmeCQueue;
typedef struct NvmeNamespace {
>
> Paolo
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH -qemu] nvme: support Google vendor extension
2015-11-20 8:11 ` Ming Lin
(?)
@ 2015-11-20 8:58 ` Paolo Bonzini
-1 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-20 8:58 UTC (permalink / raw)
On 20/11/2015 09:11, Ming Lin wrote:
> On Thu, 2015-11-19@11:37 +0100, Paolo Bonzini wrote:
>>
>> On 18/11/2015 06:47, Ming Lin wrote:
>>> @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>>> }
>>>
>>> start_sqs = nvme_cq_full(cq) ? 1 : 0;
>>> - cq->head = new_head;
>>> + /* When the mapped pointer memory area is setup, we don't rely on
>>> + * the MMIO written values to update the head pointer. */
>>> + if (!cq->db_addr) {
>>> + cq->head = new_head;
>>> + }
>>
>> You are still checking
>>
>> if (new_head >= cq->size) {
>> return;
>> }
>>
>> above. I think this is incorrect when the extension is present, and
>> furthermore it's the only case where val is being used.
>>
>> If you're not using val, you could use ioeventfd for the MMIO. An
>> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
>> quick and dirty measurements from kvm-unit-tests's vmexit.flat
>> benchmark, on two very different machines:
>>
>> Haswell-EP Ivy Bridge i7
>> MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
>> I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
>>
>> You would need to allocate two eventfds for each qid, one for the sq and
>> one for the cq. Also, processing the queues is now bounced to the QEMU
>> iothread, so you can probably get rid of sq->timer and cq->timer.
>
> Here is a quick try.
> Too late now, I'll test it morning.
>
> Do you see obvious problem?
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 3e1c38d..d28690d 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -543,6 +543,44 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> return NVME_SUCCESS;
> }
>
> +static void nvme_cq_notifier(EventNotifier *e)
> +{
> + NvmeCQueue *cq =
> + container_of(e, NvmeCQueue, notifier);
> +
> + nvme_post_cqes(cq);
> +}
> +
> +static void nvme_init_cq_eventfd(NvmeCQueue *cq)
> +{
> + NvmeCtrl *n = cq->ctrl;
> + uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
> +
> + event_notifier_init(&cq->notifier, 0);
> + event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> + memory_region_add_eventfd(&n->iomem,
> + 0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);
should be 0x1000 + offset, 4, false, 0, &cq->notifier
> +}
> +
> +static void nvme_sq_notifier(EventNotifier *e)
> +{
> + NvmeSQueue *sq =
> + container_of(e, NvmeSQueue, notifier);
> +
> + nvme_process_sq(sq);
> +}
> +
> +static void nvme_init_sq_eventfd(NvmeSQueue *sq)
> +{
> + NvmeCtrl *n = sq->ctrl;
> + uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
> +
> + event_notifier_init(&sq->notifier, 0);
> + event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> + memory_region_add_eventfd(&n->iomem,
> + 0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);
likewise should be 0x1000 + offset, 4, false, 0, &sq->notifier
Otherwise looks good!
Paolo
> +}
> +
> static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
> {
> uint64_t db_addr = le64_to_cpu(cmd->prp1);
> @@ -565,6 +603,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
> /* Submission queue tail pointer location, 2 * QID * stride. */
> sq->db_addr = db_addr + 2 * i * 4;
> sq->eventidx_addr = eventidx_addr + 2 * i * 4;
> + nvme_init_sq_eventfd(sq);
> }
>
> if (cq != NULL) {
> @@ -572,6 +611,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
> */
> cq->db_addr = db_addr + (2 * i + 1) * 4;
> cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
> + nvme_init_cq_eventfd(cq);
> }
> }
> return NVME_SUCCESS;
> @@ -793,7 +833,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> }
>
> cq = n->cq[qid];
> - if (new_head >= cq->size) {
> + if (!cq->db_addr && new_head >= cq->size) {
> return;
> }
>
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 82aeab4..608f202 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -667,6 +667,7 @@ typedef struct NvmeSQueue {
> * do not go over this value will not result in MMIO writes (but will
> * still write the tail pointer to the "db_addr" location above). */
> uint64_t eventidx_addr;
> + EventNotifier notifier;
> } NvmeSQueue;
>
> typedef struct NvmeCQueue {
> @@ -689,6 +690,7 @@ typedef struct NvmeCQueue {
> * do not go over this value will not result in MMIO writes (but will
> * still write the head pointer to the "db_addr" location above). */
> uint64_t eventidx_addr;
> + EventNotifier notifier;
> } NvmeCQueue;
>
> typedef struct NvmeNamespace {
>
>>
>> Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [Qemu-devel] [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-20 8:58 ` Paolo Bonzini
0 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-20 8:58 UTC (permalink / raw)
To: Ming Lin
Cc: fes, keith.busch, tytso, qemu-devel, linux-nvme, virtualization,
axboe, Rob Nelson, Christoph Hellwig, Mihai Rusu
On 20/11/2015 09:11, Ming Lin wrote:
> On Thu, 2015-11-19 at 11:37 +0100, Paolo Bonzini wrote:
>>
>> On 18/11/2015 06:47, Ming Lin wrote:
>>> @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>>> }
>>>
>>> start_sqs = nvme_cq_full(cq) ? 1 : 0;
>>> - cq->head = new_head;
>>> + /* When the mapped pointer memory area is setup, we don't rely on
>>> + * the MMIO written values to update the head pointer. */
>>> + if (!cq->db_addr) {
>>> + cq->head = new_head;
>>> + }
>>
>> You are still checking
>>
>> if (new_head >= cq->size) {
>> return;
>> }
>>
>> above. I think this is incorrect when the extension is present, and
>> furthermore it's the only case where val is being used.
>>
>> If you're not using val, you could use ioeventfd for the MMIO. An
>> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
>> quick and dirty measurements from kvm-unit-tests's vmexit.flat
>> benchmark, on two very different machines:
>>
>> Haswell-EP Ivy Bridge i7
>> MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
>> I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
>>
>> You would need to allocate two eventfds for each qid, one for the sq and
>> one for the cq. Also, processing the queues is now bounced to the QEMU
>> iothread, so you can probably get rid of sq->timer and cq->timer.
>
> Here is a quick try.
> Too late now, I'll test it morning.
>
> Do you see obvious problem?
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 3e1c38d..d28690d 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -543,6 +543,44 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> return NVME_SUCCESS;
> }
>
> +static void nvme_cq_notifier(EventNotifier *e)
> +{
> + NvmeCQueue *cq =
> + container_of(e, NvmeCQueue, notifier);
> +
> + nvme_post_cqes(cq);
> +}
> +
> +static void nvme_init_cq_eventfd(NvmeCQueue *cq)
> +{
> + NvmeCtrl *n = cq->ctrl;
> + uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
> +
> + event_notifier_init(&cq->notifier, 0);
> + event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> + memory_region_add_eventfd(&n->iomem,
> + 0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);
should be 0x1000 + offset, 4, false, 0, &cq->notifier
> +}
> +
> +static void nvme_sq_notifier(EventNotifier *e)
> +{
> + NvmeSQueue *sq =
> + container_of(e, NvmeSQueue, notifier);
> +
> + nvme_process_sq(sq);
> +}
> +
> +static void nvme_init_sq_eventfd(NvmeSQueue *sq)
> +{
> + NvmeCtrl *n = sq->ctrl;
> + uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
> +
> + event_notifier_init(&sq->notifier, 0);
> + event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> + memory_region_add_eventfd(&n->iomem,
> + 0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);
likewise should be 0x1000 + offset, 4, false, 0, &sq->notifier
Otherwise looks good!
Paolo
> +}
> +
> static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
> {
> uint64_t db_addr = le64_to_cpu(cmd->prp1);
> @@ -565,6 +603,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
> /* Submission queue tail pointer location, 2 * QID * stride. */
> sq->db_addr = db_addr + 2 * i * 4;
> sq->eventidx_addr = eventidx_addr + 2 * i * 4;
> + nvme_init_sq_eventfd(sq);
> }
>
> if (cq != NULL) {
> @@ -572,6 +611,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
> */
> cq->db_addr = db_addr + (2 * i + 1) * 4;
> cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
> + nvme_init_cq_eventfd(cq);
> }
> }
> return NVME_SUCCESS;
> @@ -793,7 +833,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> }
>
> cq = n->cq[qid];
> - if (new_head >= cq->size) {
> + if (!cq->db_addr && new_head >= cq->size) {
> return;
> }
>
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 82aeab4..608f202 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -667,6 +667,7 @@ typedef struct NvmeSQueue {
> * do not go over this value will not result in MMIO writes (but will
> * still write the tail pointer to the "db_addr" location above). */
> uint64_t eventidx_addr;
> + EventNotifier notifier;
> } NvmeSQueue;
>
> typedef struct NvmeCQueue {
> @@ -689,6 +690,7 @@ typedef struct NvmeCQueue {
> * do not go over this value will not result in MMIO writes (but will
> * still write the head pointer to the "db_addr" location above). */
> uint64_t eventidx_addr;
> + EventNotifier notifier;
> } NvmeCQueue;
>
> typedef struct NvmeNamespace {
>
>>
>> Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-20 8:58 ` Paolo Bonzini
0 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-20 8:58 UTC (permalink / raw)
To: Ming Lin
Cc: fes, keith.busch, tytso, qemu-devel, linux-nvme, virtualization,
axboe, Rob Nelson, Christoph Hellwig, Mihai Rusu
On 20/11/2015 09:11, Ming Lin wrote:
> On Thu, 2015-11-19 at 11:37 +0100, Paolo Bonzini wrote:
>>
>> On 18/11/2015 06:47, Ming Lin wrote:
>>> @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>>> }
>>>
>>> start_sqs = nvme_cq_full(cq) ? 1 : 0;
>>> - cq->head = new_head;
>>> + /* When the mapped pointer memory area is setup, we don't rely on
>>> + * the MMIO written values to update the head pointer. */
>>> + if (!cq->db_addr) {
>>> + cq->head = new_head;
>>> + }
>>
>> You are still checking
>>
>> if (new_head >= cq->size) {
>> return;
>> }
>>
>> above. I think this is incorrect when the extension is present, and
>> furthermore it's the only case where val is being used.
>>
>> If you're not using val, you could use ioeventfd for the MMIO. An
>> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
>> quick and dirty measurements from kvm-unit-tests's vmexit.flat
>> benchmark, on two very different machines:
>>
>> Haswell-EP Ivy Bridge i7
>> MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
>> I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
>>
>> You would need to allocate two eventfds for each qid, one for the sq and
>> one for the cq. Also, processing the queues is now bounced to the QEMU
>> iothread, so you can probably get rid of sq->timer and cq->timer.
>
> Here is a quick try.
> Too late now, I'll test it morning.
>
> Do you see obvious problem?
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 3e1c38d..d28690d 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -543,6 +543,44 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> return NVME_SUCCESS;
> }
>
> +static void nvme_cq_notifier(EventNotifier *e)
> +{
> + NvmeCQueue *cq =
> + container_of(e, NvmeCQueue, notifier);
> +
> + nvme_post_cqes(cq);
> +}
> +
> +static void nvme_init_cq_eventfd(NvmeCQueue *cq)
> +{
> + NvmeCtrl *n = cq->ctrl;
> + uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
> +
> + event_notifier_init(&cq->notifier, 0);
> + event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> + memory_region_add_eventfd(&n->iomem,
> + 0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);
should be 0x1000 + offset, 4, false, 0, &cq->notifier
> +}
> +
> +static void nvme_sq_notifier(EventNotifier *e)
> +{
> + NvmeSQueue *sq =
> + container_of(e, NvmeSQueue, notifier);
> +
> + nvme_process_sq(sq);
> +}
> +
> +static void nvme_init_sq_eventfd(NvmeSQueue *sq)
> +{
> + NvmeCtrl *n = sq->ctrl;
> + uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
> +
> + event_notifier_init(&sq->notifier, 0);
> + event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> + memory_region_add_eventfd(&n->iomem,
> + 0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);
likewise should be 0x1000 + offset, 4, false, 0, &sq->notifier
Otherwise looks good!
Paolo
> +}
> +
> static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
> {
> uint64_t db_addr = le64_to_cpu(cmd->prp1);
> @@ -565,6 +603,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
> /* Submission queue tail pointer location, 2 * QID * stride. */
> sq->db_addr = db_addr + 2 * i * 4;
> sq->eventidx_addr = eventidx_addr + 2 * i * 4;
> + nvme_init_sq_eventfd(sq);
> }
>
> if (cq != NULL) {
> @@ -572,6 +611,7 @@ static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
> */
> cq->db_addr = db_addr + (2 * i + 1) * 4;
> cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
> + nvme_init_cq_eventfd(cq);
> }
> }
> return NVME_SUCCESS;
> @@ -793,7 +833,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> }
>
> cq = n->cq[qid];
> - if (new_head >= cq->size) {
> + if (!cq->db_addr && new_head >= cq->size) {
> return;
> }
>
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 82aeab4..608f202 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -667,6 +667,7 @@ typedef struct NvmeSQueue {
> * do not go over this value will not result in MMIO writes (but will
> * still write the tail pointer to the "db_addr" location above). */
> uint64_t eventidx_addr;
> + EventNotifier notifier;
> } NvmeSQueue;
>
> typedef struct NvmeCQueue {
> @@ -689,6 +690,7 @@ typedef struct NvmeCQueue {
> * do not go over this value will not result in MMIO writes (but will
> * still write the head pointer to the "db_addr" location above). */
> uint64_t eventidx_addr;
> + EventNotifier notifier;
> } NvmeCQueue;
>
> typedef struct NvmeNamespace {
>
>>
>> Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread* [PATCH -qemu] nvme: support Google vendor extension
2015-11-20 8:58 ` Paolo Bonzini
(?)
@ 2015-11-20 23:05 ` Ming Lin
-1 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-20 23:05 UTC (permalink / raw)
On Fri, 2015-11-20@09:58 +0100, Paolo Bonzini wrote:
>
> On 20/11/2015 09:11, Ming Lin wrote:
> > On Thu, 2015-11-19@11:37 +0100, Paolo Bonzini wrote:
> >>
> >> On 18/11/2015 06:47, Ming Lin wrote:
> >>> @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> >>> }
> >>>
> >>> start_sqs = nvme_cq_full(cq) ? 1 : 0;
> >>> - cq->head = new_head;
> >>> + /* When the mapped pointer memory area is setup, we don't rely on
> >>> + * the MMIO written values to update the head pointer. */
> >>> + if (!cq->db_addr) {
> >>> + cq->head = new_head;
> >>> + }
> >>
> >> You are still checking
> >>
> >> if (new_head >= cq->size) {
> >> return;
> >> }
> >>
> >> above. I think this is incorrect when the extension is present, and
> >> furthermore it's the only case where val is being used.
> >>
> >> If you're not using val, you could use ioeventfd for the MMIO. An
> >> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
> >> quick and dirty measurements from kvm-unit-tests's vmexit.flat
> >> benchmark, on two very different machines:
> >>
> >> Haswell-EP Ivy Bridge i7
> >> MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
> >> I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
> >>
> >> You would need to allocate two eventfds for each qid, one for the sq and
> >> one for the cq. Also, processing the queues is now bounced to the QEMU
> >> iothread, so you can probably get rid of sq->timer and cq->timer.
> >
> > Here is a quick try.
> > Too late now, I'll test it morning.
> >
> > Do you see obvious problem?
> >
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 3e1c38d..d28690d 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -543,6 +543,44 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > return NVME_SUCCESS;
> > }
> >
> > +static void nvme_cq_notifier(EventNotifier *e)
> > +{
> > + NvmeCQueue *cq =
> > + container_of(e, NvmeCQueue, notifier);
> > +
> > + nvme_post_cqes(cq);
> > +}
> > +
> > +static void nvme_init_cq_eventfd(NvmeCQueue *cq)
> > +{
> > + NvmeCtrl *n = cq->ctrl;
> > + uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
> > +
> > + event_notifier_init(&cq->notifier, 0);
> > + event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> > + memory_region_add_eventfd(&n->iomem,
> > + 0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);
>
> should be 0x1000 + offset, 4, false, 0, &cq->notifier
>
> > +}
> > +
> > +static void nvme_sq_notifier(EventNotifier *e)
> > +{
> > + NvmeSQueue *sq =
> > + container_of(e, NvmeSQueue, notifier);
> > +
> > + nvme_process_sq(sq);
> > +}
> > +
> > +static void nvme_init_sq_eventfd(NvmeSQueue *sq)
> > +{
> > + NvmeCtrl *n = sq->ctrl;
> > + uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
> > +
> > + event_notifier_init(&sq->notifier, 0);
> > + event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> > + memory_region_add_eventfd(&n->iomem,
> > + 0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);
>
> likewise should be 0x1000 + offset, 4, false, 0, &sq->notifier
It works. But for some unknown reason, when boots guest kernel, it stuck
for almost 1 minute.
[ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
[ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
[ 1.988187] clocksource: Switched to clocksource tsc
[ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
[ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
[ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
[ 3.450026] clocksource: Switched to clocksource refined-jiffies
[ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
[ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
[ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
Then it doesn't response input for almost 1 minute.
Without this patch, kernel loads quickly.
[ 1.351095] Freeing unused kernel memory: 2968K (ffffffff81d49000 - ffffffff8202f000)
[ 1.352039] Write protecting the kernel read-only data: 12288k
[ 1.353340] Freeing unused kernel memory: 216K (ffff8800017ca000 - ffff880001800000)
[ 1.354670] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
[ 1.796272] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
[ 1.809579] EXT4-fs (vda1): re-mounted. Opts: (null)
[ 1.864834] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
[ 1.964181] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e58d9c595, max_idle_ns: 440795220169 ns
[ 1.965610] clocksource: Switched to clocksource tsc
[ 2.377965] random: dd urandom read with 19 bits of entropy available
void memory_region_add_eventfd(MemoryRegion *mr,
hwaddr addr,
unsigned size,
bool match_data,
uint64_t data,
EventNotifier *e)
Could you help to explain what "match_data" and "data" mean?
I use a real NVMe device as backend.
-drive file=/dev/nvme0n1,format=raw,if=none,id=D22 \
-device nvme,drive=D22,serial=1234
Here is the test results:
local NVMe: 860MB/s
qemu-nvme: 108MB/s
qemu-nvme+google-ext: 140MB/s
qemu-nvme-google-ext+eventfd: 190MB/s
root at wheezy:~# cat test.job
[global]
bs=4k
ioengine=libaio
iodepth=64
direct=1
runtime=60
time_based
norandommap
group_reporting
gtod_reduce=1
numjobs=8
[job1]
filename=/dev/nvme0n1
rw=read
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [Qemu-devel] [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-20 23:05 ` Ming Lin
0 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-20 23:05 UTC (permalink / raw)
To: Paolo Bonzini
Cc: fes, keith.busch, tytso, qemu-devel, linux-nvme, virtualization,
axboe, Rob Nelson, Christoph Hellwig, Mihai Rusu
On Fri, 2015-11-20 at 09:58 +0100, Paolo Bonzini wrote:
>
> On 20/11/2015 09:11, Ming Lin wrote:
> > On Thu, 2015-11-19 at 11:37 +0100, Paolo Bonzini wrote:
> >>
> >> On 18/11/2015 06:47, Ming Lin wrote:
> >>> @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> >>> }
> >>>
> >>> start_sqs = nvme_cq_full(cq) ? 1 : 0;
> >>> - cq->head = new_head;
> >>> + /* When the mapped pointer memory area is setup, we don't rely on
> >>> + * the MMIO written values to update the head pointer. */
> >>> + if (!cq->db_addr) {
> >>> + cq->head = new_head;
> >>> + }
> >>
> >> You are still checking
> >>
> >> if (new_head >= cq->size) {
> >> return;
> >> }
> >>
> >> above. I think this is incorrect when the extension is present, and
> >> furthermore it's the only case where val is being used.
> >>
> >> If you're not using val, you could use ioeventfd for the MMIO. An
> >> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
> >> quick and dirty measurements from kvm-unit-tests's vmexit.flat
> >> benchmark, on two very different machines:
> >>
> >> Haswell-EP Ivy Bridge i7
> >> MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
> >> I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
> >>
> >> You would need to allocate two eventfds for each qid, one for the sq and
> >> one for the cq. Also, processing the queues is now bounced to the QEMU
> >> iothread, so you can probably get rid of sq->timer and cq->timer.
> >
> > Here is a quick try.
> > Too late now, I'll test it morning.
> >
> > Do you see obvious problem?
> >
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 3e1c38d..d28690d 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -543,6 +543,44 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > return NVME_SUCCESS;
> > }
> >
> > +static void nvme_cq_notifier(EventNotifier *e)
> > +{
> > + NvmeCQueue *cq =
> > + container_of(e, NvmeCQueue, notifier);
> > +
> > + nvme_post_cqes(cq);
> > +}
> > +
> > +static void nvme_init_cq_eventfd(NvmeCQueue *cq)
> > +{
> > + NvmeCtrl *n = cq->ctrl;
> > + uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
> > +
> > + event_notifier_init(&cq->notifier, 0);
> > + event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> > + memory_region_add_eventfd(&n->iomem,
> > + 0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);
>
> should be 0x1000 + offset, 4, false, 0, &cq->notifier
>
> > +}
> > +
> > +static void nvme_sq_notifier(EventNotifier *e)
> > +{
> > + NvmeSQueue *sq =
> > + container_of(e, NvmeSQueue, notifier);
> > +
> > + nvme_process_sq(sq);
> > +}
> > +
> > +static void nvme_init_sq_eventfd(NvmeSQueue *sq)
> > +{
> > + NvmeCtrl *n = sq->ctrl;
> > + uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
> > +
> > + event_notifier_init(&sq->notifier, 0);
> > + event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> > + memory_region_add_eventfd(&n->iomem,
> > + 0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);
>
> likewise should be 0x1000 + offset, 4, false, 0, &sq->notifier
It works. But for some unknown reason, when boots guest kernel, it stuck
for almost 1 minute.
[ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
[ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
[ 1.988187] clocksource: Switched to clocksource tsc
[ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
[ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
[ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
[ 3.450026] clocksource: Switched to clocksource refined-jiffies
[ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
[ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
[ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
Then it doesn't response input for almost 1 minute.
Without this patch, kernel loads quickly.
[ 1.351095] Freeing unused kernel memory: 2968K (ffffffff81d49000 - ffffffff8202f000)
[ 1.352039] Write protecting the kernel read-only data: 12288k
[ 1.353340] Freeing unused kernel memory: 216K (ffff8800017ca000 - ffff880001800000)
[ 1.354670] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
[ 1.796272] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
[ 1.809579] EXT4-fs (vda1): re-mounted. Opts: (null)
[ 1.864834] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
[ 1.964181] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e58d9c595, max_idle_ns: 440795220169 ns
[ 1.965610] clocksource: Switched to clocksource tsc
[ 2.377965] random: dd urandom read with 19 bits of entropy available
void memory_region_add_eventfd(MemoryRegion *mr,
hwaddr addr,
unsigned size,
bool match_data,
uint64_t data,
EventNotifier *e)
Could you help to explain what "match_data" and "data" mean?
I use a real NVMe device as backend.
-drive file=/dev/nvme0n1,format=raw,if=none,id=D22 \
-device nvme,drive=D22,serial=1234
Here is the test results:
local NVMe: 860MB/s
qemu-nvme: 108MB/s
qemu-nvme+google-ext: 140MB/s
qemu-nvme-google-ext+eventfd: 190MB/s
root@wheezy:~# cat test.job
[global]
bs=4k
ioengine=libaio
iodepth=64
direct=1
runtime=60
time_based
norandommap
group_reporting
gtod_reduce=1
numjobs=8
[job1]
filename=/dev/nvme0n1
rw=read
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-20 23:05 ` Ming Lin
0 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-20 23:05 UTC (permalink / raw)
To: Paolo Bonzini
Cc: fes, keith.busch, tytso, qemu-devel, linux-nvme, virtualization,
axboe, Rob Nelson, Christoph Hellwig, Mihai Rusu
On Fri, 2015-11-20 at 09:58 +0100, Paolo Bonzini wrote:
>
> On 20/11/2015 09:11, Ming Lin wrote:
> > On Thu, 2015-11-19 at 11:37 +0100, Paolo Bonzini wrote:
> >>
> >> On 18/11/2015 06:47, Ming Lin wrote:
> >>> @@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> >>> }
> >>>
> >>> start_sqs = nvme_cq_full(cq) ? 1 : 0;
> >>> - cq->head = new_head;
> >>> + /* When the mapped pointer memory area is setup, we don't rely on
> >>> + * the MMIO written values to update the head pointer. */
> >>> + if (!cq->db_addr) {
> >>> + cq->head = new_head;
> >>> + }
> >>
> >> You are still checking
> >>
> >> if (new_head >= cq->size) {
> >> return;
> >> }
> >>
> >> above. I think this is incorrect when the extension is present, and
> >> furthermore it's the only case where val is being used.
> >>
> >> If you're not using val, you could use ioeventfd for the MMIO. An
> >> ioeventfd cuts the MMIO cost by at least 55% and up to 70%. Here are
> >> quick and dirty measurements from kvm-unit-tests's vmexit.flat
> >> benchmark, on two very different machines:
> >>
> >> Haswell-EP Ivy Bridge i7
> >> MMIO memory write 5100 -> 2250 (55%) 7000 -> 3000 (58%)
> >> I/O port write 3800 -> 1150 (70%) 4100 -> 1800 (57%)
> >>
> >> You would need to allocate two eventfds for each qid, one for the sq and
> >> one for the cq. Also, processing the queues is now bounced to the QEMU
> >> iothread, so you can probably get rid of sq->timer and cq->timer.
> >
> > Here is a quick try.
> > Too late now, I'll test it morning.
> >
> > Do you see obvious problem?
> >
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 3e1c38d..d28690d 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -543,6 +543,44 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > return NVME_SUCCESS;
> > }
> >
> > +static void nvme_cq_notifier(EventNotifier *e)
> > +{
> > + NvmeCQueue *cq =
> > + container_of(e, NvmeCQueue, notifier);
> > +
> > + nvme_post_cqes(cq);
> > +}
> > +
> > +static void nvme_init_cq_eventfd(NvmeCQueue *cq)
> > +{
> > + NvmeCtrl *n = cq->ctrl;
> > + uint16_t offset = (cq->cqid*2+1) * NVME_CAP_DSTRD(n->bar.cap);
> > +
> > + event_notifier_init(&cq->notifier, 0);
> > + event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
> > + memory_region_add_eventfd(&n->iomem,
> > + 0x1000 + offset, 4, true, cq->cqid*2+1, &cq->notifier);
>
> should be 0x1000 + offset, 4, false, 0, &cq->notifier
>
> > +}
> > +
> > +static void nvme_sq_notifier(EventNotifier *e)
> > +{
> > + NvmeSQueue *sq =
> > + container_of(e, NvmeSQueue, notifier);
> > +
> > + nvme_process_sq(sq);
> > +}
> > +
> > +static void nvme_init_sq_eventfd(NvmeSQueue *sq)
> > +{
> > + NvmeCtrl *n = sq->ctrl;
> > + uint16_t offset = sq->sqid * 2 * NVME_CAP_DSTRD(n->bar.cap);
> > +
> > + event_notifier_init(&sq->notifier, 0);
> > + event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
> > + memory_region_add_eventfd(&n->iomem,
> > + 0x1000 + offset, 4, true, sq->sqid * 2, &sq->notifier);
>
> likewise should be 0x1000 + offset, 4, false, 0, &sq->notifier
It works. But for some unknown reason, when boots guest kernel, it stuck
for almost 1 minute.
[ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
[ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
[ 1.988187] clocksource: Switched to clocksource tsc
[ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
[ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
[ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
[ 3.450026] clocksource: Switched to clocksource refined-jiffies
[ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
[ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
[ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
Then it doesn't response input for almost 1 minute.
Without this patch, kernel loads quickly.
[ 1.351095] Freeing unused kernel memory: 2968K (ffffffff81d49000 - ffffffff8202f000)
[ 1.352039] Write protecting the kernel read-only data: 12288k
[ 1.353340] Freeing unused kernel memory: 216K (ffff8800017ca000 - ffff880001800000)
[ 1.354670] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
[ 1.796272] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
[ 1.809579] EXT4-fs (vda1): re-mounted. Opts: (null)
[ 1.864834] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
[ 1.964181] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e58d9c595, max_idle_ns: 440795220169 ns
[ 1.965610] clocksource: Switched to clocksource tsc
[ 2.377965] random: dd urandom read with 19 bits of entropy available
void memory_region_add_eventfd(MemoryRegion *mr,
hwaddr addr,
unsigned size,
bool match_data,
uint64_t data,
EventNotifier *e)
Could you help to explain what "match_data" and "data" mean?
I use a real NVMe device as backend.
-drive file=/dev/nvme0n1,format=raw,if=none,id=D22 \
-device nvme,drive=D22,serial=1234
Here is the test results:
local NVMe: 860MB/s
qemu-nvme: 108MB/s
qemu-nvme+google-ext: 140MB/s
qemu-nvme-google-ext+eventfd: 190MB/s
root@wheezy:~# cat test.job
[global]
bs=4k
ioengine=libaio
iodepth=64
direct=1
runtime=60
time_based
norandommap
group_reporting
gtod_reduce=1
numjobs=8
[job1]
filename=/dev/nvme0n1
rw=read
^ permalink raw reply [flat|nested] 33+ messages in thread* [PATCH -qemu] nvme: support Google vendor extension
2015-11-20 23:05 ` Ming Lin
(?)
@ 2015-11-21 12:56 ` Paolo Bonzini
-1 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-21 12:56 UTC (permalink / raw)
On 21/11/2015 00:05, Ming Lin wrote:
> [ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
> [ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
> [ 1.988187] clocksource: Switched to clocksource tsc
> [ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> [ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
> [ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
> [ 3.450026] clocksource: Switched to clocksource refined-jiffies
> [ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
> [ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
> [ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
>
> Then it doesn't response input for almost 1 minute.
> Without this patch, kernel loads quickly.
Interesting. I guess there's time to debug it, since QEMU 2.6 is still
a few months away. In the meanwhile we can apply your patch as is,
apart from disabling the "if (new_head >= cq->size)" and the similar
one for "if (new_ tail >= sq->size".
But, I have a possible culprit. In your nvme_cq_notifier you are not doing the
equivalent of:
start_sqs = nvme_cq_full(cq) ? 1 : 0;
cq->head = new_head;
if (start_sqs) {
NvmeSQueue *sq;
QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
Instead, you are just calling nvme_post_cqes, which is the equivalent of
timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
Adding a loop to nvme_cq_notifier, and having it call nvme_process_sq, might
fix the weird 1-minute delay.
Paolo
> void memory_region_add_eventfd(MemoryRegion *mr,
> hwaddr addr,
> unsigned size,
> bool match_data,
> uint64_t data,
> EventNotifier *e)
>
> Could you help to explain what "match_data" and "data" mean?
If match_data is true, the eventfd is only signalled if "data" is being written to memory.
Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [Qemu-devel] [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-21 12:56 ` Paolo Bonzini
0 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-21 12:56 UTC (permalink / raw)
To: Ming Lin
Cc: fes, axboe, tytso, qemu-devel, linux-nvme, virtualization,
keith.busch, Rob Nelson, Christoph Hellwig, Mihai Rusu
On 21/11/2015 00:05, Ming Lin wrote:
> [ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
> [ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
> [ 1.988187] clocksource: Switched to clocksource tsc
> [ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> [ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
> [ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
> [ 3.450026] clocksource: Switched to clocksource refined-jiffies
> [ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
> [ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
> [ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
>
> Then it doesn't response input for almost 1 minute.
> Without this patch, kernel loads quickly.
Interesting. I guess there's time to debug it, since QEMU 2.6 is still
a few months away. In the meanwhile we can apply your patch as is,
apart from disabling the "if (new_head >= cq->size)" and the similar
one for "if (new_ tail >= sq->size".
But, I have a possible culprit. In your nvme_cq_notifier you are not doing the
equivalent of:
start_sqs = nvme_cq_full(cq) ? 1 : 0;
cq->head = new_head;
if (start_sqs) {
NvmeSQueue *sq;
QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
Instead, you are just calling nvme_post_cqes, which is the equivalent of
timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
Adding a loop to nvme_cq_notifier, and having it call nvme_process_sq, might
fix the weird 1-minute delay.
Paolo
> void memory_region_add_eventfd(MemoryRegion *mr,
> hwaddr addr,
> unsigned size,
> bool match_data,
> uint64_t data,
> EventNotifier *e)
>
> Could you help to explain what "match_data" and "data" mean?
If match_data is true, the eventfd is only signalled if "data" is being written to memory.
Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-21 12:56 ` Paolo Bonzini
0 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-21 12:56 UTC (permalink / raw)
To: Ming Lin
Cc: fes, axboe, tytso, qemu-devel, linux-nvme, virtualization,
keith.busch, Rob Nelson, Christoph Hellwig, Mihai Rusu
On 21/11/2015 00:05, Ming Lin wrote:
> [ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
> [ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
> [ 1.988187] clocksource: Switched to clocksource tsc
> [ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> [ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
> [ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
> [ 3.450026] clocksource: Switched to clocksource refined-jiffies
> [ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
> [ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
> [ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
>
> Then it doesn't response input for almost 1 minute.
> Without this patch, kernel loads quickly.
Interesting. I guess there's time to debug it, since QEMU 2.6 is still
a few months away. In the meanwhile we can apply your patch as is,
apart from disabling the "if (new_head >= cq->size)" and the similar
one for "if (new_ tail >= sq->size".
But, I have a possible culprit. In your nvme_cq_notifier you are not doing the
equivalent of:
start_sqs = nvme_cq_full(cq) ? 1 : 0;
cq->head = new_head;
if (start_sqs) {
NvmeSQueue *sq;
QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
Instead, you are just calling nvme_post_cqes, which is the equivalent of
timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
Adding a loop to nvme_cq_notifier, and having it call nvme_process_sq, might
fix the weird 1-minute delay.
Paolo
> void memory_region_add_eventfd(MemoryRegion *mr,
> hwaddr addr,
> unsigned size,
> bool match_data,
> uint64_t data,
> EventNotifier *e)
>
> Could you help to explain what "match_data" and "data" mean?
If match_data is true, the eventfd is only signalled if "data" is being written to memory.
Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread* [PATCH -qemu] nvme: support Google vendor extension
2015-11-21 12:56 ` Paolo Bonzini
(?)
@ 2015-11-22 7:45 ` Ming Lin
-1 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-22 7:45 UTC (permalink / raw)
On Sat, 2015-11-21@13:56 +0100, Paolo Bonzini wrote:
>
> On 21/11/2015 00:05, Ming Lin wrote:
> > [ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
> > [ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
> > [ 1.988187] clocksource: Switched to clocksource tsc
> > [ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> > [ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
> > [ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
> > [ 3.450026] clocksource: Switched to clocksource refined-jiffies
> > [ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
> > [ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
> > [ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
> >
> > Then it doesn't response input for almost 1 minute.
> > Without this patch, kernel loads quickly.
>
> Interesting. I guess there's time to debug it, since QEMU 2.6 is still
> a few months away. In the meanwhile we can apply your patch as is,
> apart from disabling the "if (new_head >= cq->size)" and the similar
> one for "if (new_ tail >= sq->size".
>
> But, I have a possible culprit. In your nvme_cq_notifier you are not doing the
> equivalent of:
>
> start_sqs = nvme_cq_full(cq) ? 1 : 0;
> cq->head = new_head;
> if (start_sqs) {
> NvmeSQueue *sq;
> QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
> timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> }
> timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> }
>
> Instead, you are just calling nvme_post_cqes, which is the equivalent of
>
> timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>
> Adding a loop to nvme_cq_notifier, and having it call nvme_process_sq, might
> fix the weird 1-minute delay.
I found it.
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 31572f2..f27fd35 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -548,6 +548,7 @@ static void nvme_cq_notifier(EventNotifier *e)
NvmeCQueue *cq =
container_of(e, NvmeCQueue, notifier);
+ event_notifier_test_and_clear(&cq->notifier);
nvme_post_cqes(cq);
}
@@ -567,6 +568,7 @@ static void nvme_sq_notifier(EventNotifier *e)
NvmeSQueue *sq =
container_of(e, NvmeSQueue, notifier);
+ event_notifier_test_and_clear(&sq->notifier);
nvme_process_sq(sq);
}
Here is new performance number:
qemu-nvme + google-ext + eventfd: 294MB/s
virtio-blk: 344MB/s
virtio-scsi: 296MB/s
It's almost same as virtio-scsi. Nice.
^ permalink raw reply related [flat|nested] 33+ messages in thread* Re: [Qemu-devel] [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-22 7:45 ` Ming Lin
0 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-22 7:45 UTC (permalink / raw)
To: Paolo Bonzini
Cc: fes, axboe, tytso, qemu-devel, linux-nvme, virtualization,
keith.busch, Rob Nelson, Christoph Hellwig, Mihai Rusu
On Sat, 2015-11-21 at 13:56 +0100, Paolo Bonzini wrote:
>
> On 21/11/2015 00:05, Ming Lin wrote:
> > [ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
> > [ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
> > [ 1.988187] clocksource: Switched to clocksource tsc
> > [ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> > [ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
> > [ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
> > [ 3.450026] clocksource: Switched to clocksource refined-jiffies
> > [ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
> > [ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
> > [ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
> >
> > Then it doesn't response input for almost 1 minute.
> > Without this patch, kernel loads quickly.
>
> Interesting. I guess there's time to debug it, since QEMU 2.6 is still
> a few months away. In the meanwhile we can apply your patch as is,
> apart from disabling the "if (new_head >= cq->size)" and the similar
> one for "if (new_ tail >= sq->size".
>
> But, I have a possible culprit. In your nvme_cq_notifier you are not doing the
> equivalent of:
>
> start_sqs = nvme_cq_full(cq) ? 1 : 0;
> cq->head = new_head;
> if (start_sqs) {
> NvmeSQueue *sq;
> QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
> timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> }
> timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> }
>
> Instead, you are just calling nvme_post_cqes, which is the equivalent of
>
> timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>
> Adding a loop to nvme_cq_notifier, and having it call nvme_process_sq, might
> fix the weird 1-minute delay.
I found it.
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 31572f2..f27fd35 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -548,6 +548,7 @@ static void nvme_cq_notifier(EventNotifier *e)
NvmeCQueue *cq =
container_of(e, NvmeCQueue, notifier);
+ event_notifier_test_and_clear(&cq->notifier);
nvme_post_cqes(cq);
}
@@ -567,6 +568,7 @@ static void nvme_sq_notifier(EventNotifier *e)
NvmeSQueue *sq =
container_of(e, NvmeSQueue, notifier);
+ event_notifier_test_and_clear(&sq->notifier);
nvme_process_sq(sq);
}
Here is new performance number:
qemu-nvme + google-ext + eventfd: 294MB/s
virtio-blk: 344MB/s
virtio-scsi: 296MB/s
It's almost same as virtio-scsi. Nice.
^ permalink raw reply related [flat|nested] 33+ messages in thread* Re: [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-22 7:45 ` Ming Lin
0 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-22 7:45 UTC (permalink / raw)
To: Paolo Bonzini
Cc: fes, axboe, tytso, qemu-devel, linux-nvme, virtualization,
keith.busch, Rob Nelson, Christoph Hellwig, Mihai Rusu
On Sat, 2015-11-21 at 13:56 +0100, Paolo Bonzini wrote:
>
> On 21/11/2015 00:05, Ming Lin wrote:
> > [ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
> > [ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
> > [ 1.988187] clocksource: Switched to clocksource tsc
> > [ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> > [ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
> > [ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
> > [ 3.450026] clocksource: Switched to clocksource refined-jiffies
> > [ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
> > [ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
> > [ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
> >
> > Then it doesn't response input for almost 1 minute.
> > Without this patch, kernel loads quickly.
>
> Interesting. I guess there's time to debug it, since QEMU 2.6 is still
> a few months away. In the meanwhile we can apply your patch as is,
> apart from disabling the "if (new_head >= cq->size)" and the similar
> one for "if (new_ tail >= sq->size".
>
> But, I have a possible culprit. In your nvme_cq_notifier you are not doing the
> equivalent of:
>
> start_sqs = nvme_cq_full(cq) ? 1 : 0;
> cq->head = new_head;
> if (start_sqs) {
> NvmeSQueue *sq;
> QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
> timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> }
> timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> }
>
> Instead, you are just calling nvme_post_cqes, which is the equivalent of
>
> timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>
> Adding a loop to nvme_cq_notifier, and having it call nvme_process_sq, might
> fix the weird 1-minute delay.
I found it.
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 31572f2..f27fd35 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -548,6 +548,7 @@ static void nvme_cq_notifier(EventNotifier *e)
NvmeCQueue *cq =
container_of(e, NvmeCQueue, notifier);
+ event_notifier_test_and_clear(&cq->notifier);
nvme_post_cqes(cq);
}
@@ -567,6 +568,7 @@ static void nvme_sq_notifier(EventNotifier *e)
NvmeSQueue *sq =
container_of(e, NvmeSQueue, notifier);
+ event_notifier_test_and_clear(&sq->notifier);
nvme_process_sq(sq);
}
Here is new performance number:
qemu-nvme + google-ext + eventfd: 294MB/s
virtio-blk: 344MB/s
virtio-scsi: 296MB/s
It's almost same as virtio-scsi. Nice.
^ permalink raw reply related [flat|nested] 33+ messages in thread* [PATCH -qemu] nvme: support Google vendor extension
2015-11-22 7:45 ` Ming Lin
(?)
@ 2015-11-24 6:29 ` Ming Lin
-1 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-24 6:29 UTC (permalink / raw)
On Sat, 2015-11-21@23:45 -0800, Ming Lin wrote:
> On Sat, 2015-11-21@13:56 +0100, Paolo Bonzini wrote:
> >
> > On 21/11/2015 00:05, Ming Lin wrote:
> > > [ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
> > > [ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
> > > [ 1.988187] clocksource: Switched to clocksource tsc
> > > [ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> > > [ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
> > > [ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
> > > [ 3.450026] clocksource: Switched to clocksource refined-jiffies
> > > [ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
> > > [ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
> > > [ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
> > >
> > > Then it doesn't response input for almost 1 minute.
> > > Without this patch, kernel loads quickly.
> >
> > Interesting. I guess there's time to debug it, since QEMU 2.6 is still
> > a few months away. In the meanwhile we can apply your patch as is,
> > apart from disabling the "if (new_head >= cq->size)" and the similar
> > one for "if (new_ tail >= sq->size".
> >
> > But, I have a possible culprit. In your nvme_cq_notifier you are not doing the
> > equivalent of:
> >
> > start_sqs = nvme_cq_full(cq) ? 1 : 0;
> > cq->head = new_head;
> > if (start_sqs) {
> > NvmeSQueue *sq;
> > QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
> > timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> > }
> > timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> > }
> >
> > Instead, you are just calling nvme_post_cqes, which is the equivalent of
> >
> > timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> >
> > Adding a loop to nvme_cq_notifier, and having it call nvme_process_sq, might
> > fix the weird 1-minute delay.
>
> I found it.
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 31572f2..f27fd35 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -548,6 +548,7 @@ static void nvme_cq_notifier(EventNotifier *e)
> NvmeCQueue *cq =
> container_of(e, NvmeCQueue, notifier);
>
> + event_notifier_test_and_clear(&cq->notifier);
> nvme_post_cqes(cq);
> }
>
> @@ -567,6 +568,7 @@ static void nvme_sq_notifier(EventNotifier *e)
> NvmeSQueue *sq =
> container_of(e, NvmeSQueue, notifier);
>
> + event_notifier_test_and_clear(&sq->notifier);
> nvme_process_sq(sq);
> }
>
> Here is new performance number:
>
> qemu-nvme + google-ext + eventfd: 294MB/s
> virtio-blk: 344MB/s
> virtio-scsi: 296MB/s
>
> It's almost same as virtio-scsi. Nice.
(strip CC)
Looks like "regular MMIO" runs in vcpu thread, while "eventfd MMIO" runs
in the main loop thread.
Could you help to explain why eventfd MMIO gets better performance?
call stack: regular MMIO
========================
nvme_mmio_write (qemu/hw/block/nvme.c:921)
memory_region_write_accessor (qemu/memory.c:451)
access_with_adjusted_size (qemu/memory.c:506)
memory_region_dispatch_write (qemu/memory.c:1158)
address_space_rw (qemu/exec.c:2547)
kvm_cpu_exec (qemu/kvm-all.c:1849)
qemu_kvm_cpu_thread_fn (qemu/cpus.c:1050)
start_thread (pthread_create.c:312)
clone
call stack: eventfd MMIO
=========================
nvme_sq_notifier (qemu/hw/block/nvme.c:598)
aio_dispatch (qemu/aio-posix.c:329)
aio_ctx_dispatch (qemu/async.c:232)
g_main_context_dispatch
glib_pollfds_poll (qemu/main-loop.c:213)
os_host_main_loop_wait (qemu/main-loop.c:257)
main_loop_wait (qemu/main-loop.c:504)
main_loop (qemu/vl.c:1920)
main (qemu/vl.c:4682)
__libc_start_main
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [Qemu-devel] [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-24 6:29 ` Ming Lin
0 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-24 6:29 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel, linux-nvme, virtualization
On Sat, 2015-11-21 at 23:45 -0800, Ming Lin wrote:
> On Sat, 2015-11-21 at 13:56 +0100, Paolo Bonzini wrote:
> >
> > On 21/11/2015 00:05, Ming Lin wrote:
> > > [ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
> > > [ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
> > > [ 1.988187] clocksource: Switched to clocksource tsc
> > > [ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> > > [ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
> > > [ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
> > > [ 3.450026] clocksource: Switched to clocksource refined-jiffies
> > > [ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
> > > [ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
> > > [ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
> > >
> > > Then it doesn't response input for almost 1 minute.
> > > Without this patch, kernel loads quickly.
> >
> > Interesting. I guess there's time to debug it, since QEMU 2.6 is still
> > a few months away. In the meanwhile we can apply your patch as is,
> > apart from disabling the "if (new_head >= cq->size)" and the similar
> > one for "if (new_ tail >= sq->size".
> >
> > But, I have a possible culprit. In your nvme_cq_notifier you are not doing the
> > equivalent of:
> >
> > start_sqs = nvme_cq_full(cq) ? 1 : 0;
> > cq->head = new_head;
> > if (start_sqs) {
> > NvmeSQueue *sq;
> > QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
> > timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> > }
> > timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> > }
> >
> > Instead, you are just calling nvme_post_cqes, which is the equivalent of
> >
> > timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> >
> > Adding a loop to nvme_cq_notifier, and having it call nvme_process_sq, might
> > fix the weird 1-minute delay.
>
> I found it.
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 31572f2..f27fd35 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -548,6 +548,7 @@ static void nvme_cq_notifier(EventNotifier *e)
> NvmeCQueue *cq =
> container_of(e, NvmeCQueue, notifier);
>
> + event_notifier_test_and_clear(&cq->notifier);
> nvme_post_cqes(cq);
> }
>
> @@ -567,6 +568,7 @@ static void nvme_sq_notifier(EventNotifier *e)
> NvmeSQueue *sq =
> container_of(e, NvmeSQueue, notifier);
>
> + event_notifier_test_and_clear(&sq->notifier);
> nvme_process_sq(sq);
> }
>
> Here is new performance number:
>
> qemu-nvme + google-ext + eventfd: 294MB/s
> virtio-blk: 344MB/s
> virtio-scsi: 296MB/s
>
> It's almost same as virtio-scsi. Nice.
(strip CC)
Looks like "regular MMIO" runs in vcpu thread, while "eventfd MMIO" runs
in the main loop thread.
Could you help to explain why eventfd MMIO gets better performance?
call stack: regular MMIO
========================
nvme_mmio_write (qemu/hw/block/nvme.c:921)
memory_region_write_accessor (qemu/memory.c:451)
access_with_adjusted_size (qemu/memory.c:506)
memory_region_dispatch_write (qemu/memory.c:1158)
address_space_rw (qemu/exec.c:2547)
kvm_cpu_exec (qemu/kvm-all.c:1849)
qemu_kvm_cpu_thread_fn (qemu/cpus.c:1050)
start_thread (pthread_create.c:312)
clone
call stack: eventfd MMIO
=========================
nvme_sq_notifier (qemu/hw/block/nvme.c:598)
aio_dispatch (qemu/aio-posix.c:329)
aio_ctx_dispatch (qemu/async.c:232)
g_main_context_dispatch
glib_pollfds_poll (qemu/main-loop.c:213)
os_host_main_loop_wait (qemu/main-loop.c:257)
main_loop_wait (qemu/main-loop.c:504)
main_loop (qemu/vl.c:1920)
main (qemu/vl.c:4682)
__libc_start_main
^ permalink raw reply [flat|nested] 33+ messages in thread* Re: [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-24 6:29 ` Ming Lin
0 siblings, 0 replies; 33+ messages in thread
From: Ming Lin @ 2015-11-24 6:29 UTC (permalink / raw)
To: Paolo Bonzini; +Cc: qemu-devel, linux-nvme, virtualization
On Sat, 2015-11-21 at 23:45 -0800, Ming Lin wrote:
> On Sat, 2015-11-21 at 13:56 +0100, Paolo Bonzini wrote:
> >
> > On 21/11/2015 00:05, Ming Lin wrote:
> > > [ 1.752129] Freeing unused kernel memory: 420K (ffff880001b97000 - ffff880001c00000)
> > > [ 1.986573] clocksource: tsc: mask: 0xffffffffffffffff max_cycles: 0x30e5c9bbf83, max_idle_ns: 440795378954 ns
> > > [ 1.988187] clocksource: Switched to clocksource tsc
> > > [ 3.235423] clocksource: timekeeping watchdog: Marking clocksource 'tsc' as unstable because the skew is too large:
> > > [ 3.358713] clocksource: 'refined-jiffies' wd_now: fffeddf3 wd_last: fffedd76 mask: ffffffff
> > > [ 3.410013] clocksource: 'tsc' cs_now: 3c121d4ec cs_last: 340888eb7 mask: ffffffffffffffff
> > > [ 3.450026] clocksource: Switched to clocksource refined-jiffies
> > > [ 7.696769] Adding 392188k swap on /dev/vda5. Priority:-1 extents:1 across:392188k
> > > [ 7.902174] EXT4-fs (vda1): re-mounted. Opts: (null)
> > > [ 8.734178] EXT4-fs (vda1): re-mounted. Opts: errors=remount-ro
> > >
> > > Then it doesn't response input for almost 1 minute.
> > > Without this patch, kernel loads quickly.
> >
> > Interesting. I guess there's time to debug it, since QEMU 2.6 is still
> > a few months away. In the meanwhile we can apply your patch as is,
> > apart from disabling the "if (new_head >= cq->size)" and the similar
> > one for "if (new_ tail >= sq->size".
> >
> > But, I have a possible culprit. In your nvme_cq_notifier you are not doing the
> > equivalent of:
> >
> > start_sqs = nvme_cq_full(cq) ? 1 : 0;
> > cq->head = new_head;
> > if (start_sqs) {
> > NvmeSQueue *sq;
> > QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
> > timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> > }
> > timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> > }
> >
> > Instead, you are just calling nvme_post_cqes, which is the equivalent of
> >
> > timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> >
> > Adding a loop to nvme_cq_notifier, and having it call nvme_process_sq, might
> > fix the weird 1-minute delay.
>
> I found it.
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 31572f2..f27fd35 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -548,6 +548,7 @@ static void nvme_cq_notifier(EventNotifier *e)
> NvmeCQueue *cq =
> container_of(e, NvmeCQueue, notifier);
>
> + event_notifier_test_and_clear(&cq->notifier);
> nvme_post_cqes(cq);
> }
>
> @@ -567,6 +568,7 @@ static void nvme_sq_notifier(EventNotifier *e)
> NvmeSQueue *sq =
> container_of(e, NvmeSQueue, notifier);
>
> + event_notifier_test_and_clear(&sq->notifier);
> nvme_process_sq(sq);
> }
>
> Here is new performance number:
>
> qemu-nvme + google-ext + eventfd: 294MB/s
> virtio-blk: 344MB/s
> virtio-scsi: 296MB/s
>
> It's almost same as virtio-scsi. Nice.
(strip CC)
Looks like "regular MMIO" runs in vcpu thread, while "eventfd MMIO" runs
in the main loop thread.
Could you help to explain why eventfd MMIO gets better performance?
call stack: regular MMIO
========================
nvme_mmio_write (qemu/hw/block/nvme.c:921)
memory_region_write_accessor (qemu/memory.c:451)
access_with_adjusted_size (qemu/memory.c:506)
memory_region_dispatch_write (qemu/memory.c:1158)
address_space_rw (qemu/exec.c:2547)
kvm_cpu_exec (qemu/kvm-all.c:1849)
qemu_kvm_cpu_thread_fn (qemu/cpus.c:1050)
start_thread (pthread_create.c:312)
clone
call stack: eventfd MMIO
=========================
nvme_sq_notifier (qemu/hw/block/nvme.c:598)
aio_dispatch (qemu/aio-posix.c:329)
aio_ctx_dispatch (qemu/async.c:232)
g_main_context_dispatch
glib_pollfds_poll (qemu/main-loop.c:213)
os_host_main_loop_wait (qemu/main-loop.c:257)
main_loop_wait (qemu/main-loop.c:504)
main_loop (qemu/vl.c:1920)
main (qemu/vl.c:4682)
__libc_start_main
^ permalink raw reply [flat|nested] 33+ messages in thread* [PATCH -qemu] nvme: support Google vendor extension
2015-11-24 6:29 ` Ming Lin
(?)
@ 2015-11-24 11:01 ` Paolo Bonzini
-1 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-24 11:01 UTC (permalink / raw)
On 24/11/2015 07:29, Ming Lin wrote:
>> Here is new performance number:
>>
>> qemu-nvme + google-ext + eventfd: 294MB/s
>> virtio-blk: 344MB/s
>> virtio-scsi: 296MB/s
>>
>> It's almost same as virtio-scsi. Nice.
Pretty good indeed.
> Looks like "regular MMIO" runs in vcpu thread, while "eventfd MMIO" runs
> in the main loop thread.
>
> Could you help to explain why eventfd MMIO gets better performance?
Because VCPU latency is really everything if the I/O is very fast _or_
if the queue depth is high; signaling an eventfd is cheap enough to give
a noticeable boost in VCPU latency. Waking up a sleeping process is a
bit expensive, but if you manage to keep the iothread close to 100% CPU,
the main loop thread's poll() is usually quite cheap too.
> call stack: regular MMIO
> ========================
> nvme_mmio_write (qemu/hw/block/nvme.c:921)
> memory_region_write_accessor (qemu/memory.c:451)
> access_with_adjusted_size (qemu/memory.c:506)
> memory_region_dispatch_write (qemu/memory.c:1158)
> address_space_rw (qemu/exec.c:2547)
> kvm_cpu_exec (qemu/kvm-all.c:1849)
> qemu_kvm_cpu_thread_fn (qemu/cpus.c:1050)
> start_thread (pthread_create.c:312)
> clone
>
> call stack: eventfd MMIO
> =========================
> nvme_sq_notifier (qemu/hw/block/nvme.c:598)
> aio_dispatch (qemu/aio-posix.c:329)
> aio_ctx_dispatch (qemu/async.c:232)
> g_main_context_dispatch
> glib_pollfds_poll (qemu/main-loop.c:213)
> os_host_main_loop_wait (qemu/main-loop.c:257)
> main_loop_wait (qemu/main-loop.c:504)
> main_loop (qemu/vl.c:1920)
> main (qemu/vl.c:4682)
> __libc_start_main
For comparison, here is the "iothread+eventfd MMIO" stack
nvme_sq_notifier (qemu/hw/block/nvme.c:598)
aio_dispatch (qemu/aio-posix.c:329)
aio_poll (qemu/aio-posix.c:474)
iothread_run (qemu/iothread.c:170)
__libc_start_main
aio_poll is much more specialized than the main thread (which uses glib
and thus wraps aio_poll with a GSource adapter), and can be faster too.
(That said, things are still a bit in flux here. 2.6 will have pretty
heavy changes in this area, but the API will be the same).
Even more performance can be squeezed by adding a little bit of busy
waiting to aio_poll() before going to the blocking poll(). This avoids
very short idling and can improve things even more.
BTW, you may want to Cc qemu-block at nongnu.org in addition to
qemu-devel at nongnu.org. Most people are on both lists, but some notice
things faster if you write to the lower-traffic qemu-block mailing list.
Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [Qemu-devel] [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-24 11:01 ` Paolo Bonzini
0 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-24 11:01 UTC (permalink / raw)
To: Ming Lin; +Cc: qemu block, qemu-devel, linux-nvme, virtualization
On 24/11/2015 07:29, Ming Lin wrote:
>> Here is new performance number:
>>
>> qemu-nvme + google-ext + eventfd: 294MB/s
>> virtio-blk: 344MB/s
>> virtio-scsi: 296MB/s
>>
>> It's almost same as virtio-scsi. Nice.
Pretty good indeed.
> Looks like "regular MMIO" runs in vcpu thread, while "eventfd MMIO" runs
> in the main loop thread.
>
> Could you help to explain why eventfd MMIO gets better performance?
Because VCPU latency is really everything if the I/O is very fast _or_
if the queue depth is high; signaling an eventfd is cheap enough to give
a noticeable boost in VCPU latency. Waking up a sleeping process is a
bit expensive, but if you manage to keep the iothread close to 100% CPU,
the main loop thread's poll() is usually quite cheap too.
> call stack: regular MMIO
> ========================
> nvme_mmio_write (qemu/hw/block/nvme.c:921)
> memory_region_write_accessor (qemu/memory.c:451)
> access_with_adjusted_size (qemu/memory.c:506)
> memory_region_dispatch_write (qemu/memory.c:1158)
> address_space_rw (qemu/exec.c:2547)
> kvm_cpu_exec (qemu/kvm-all.c:1849)
> qemu_kvm_cpu_thread_fn (qemu/cpus.c:1050)
> start_thread (pthread_create.c:312)
> clone
>
> call stack: eventfd MMIO
> =========================
> nvme_sq_notifier (qemu/hw/block/nvme.c:598)
> aio_dispatch (qemu/aio-posix.c:329)
> aio_ctx_dispatch (qemu/async.c:232)
> g_main_context_dispatch
> glib_pollfds_poll (qemu/main-loop.c:213)
> os_host_main_loop_wait (qemu/main-loop.c:257)
> main_loop_wait (qemu/main-loop.c:504)
> main_loop (qemu/vl.c:1920)
> main (qemu/vl.c:4682)
> __libc_start_main
For comparison, here is the "iothread+eventfd MMIO" stack
nvme_sq_notifier (qemu/hw/block/nvme.c:598)
aio_dispatch (qemu/aio-posix.c:329)
aio_poll (qemu/aio-posix.c:474)
iothread_run (qemu/iothread.c:170)
__libc_start_main
aio_poll is much more specialized than the main thread (which uses glib
and thus wraps aio_poll with a GSource adapter), and can be faster too.
(That said, things are still a bit in flux here. 2.6 will have pretty
heavy changes in this area, but the API will be the same).
Even more performance can be squeezed by adding a little bit of busy
waiting to aio_poll() before going to the blocking poll(). This avoids
very short idling and can improve things even more.
BTW, you may want to Cc qemu-block@nongnu.org in addition to
qemu-devel@nongnu.org. Most people are on both lists, but some notice
things faster if you write to the lower-traffic qemu-block mailing list.
Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread
* Re: [PATCH -qemu] nvme: support Google vendor extension
@ 2015-11-24 11:01 ` Paolo Bonzini
0 siblings, 0 replies; 33+ messages in thread
From: Paolo Bonzini @ 2015-11-24 11:01 UTC (permalink / raw)
To: Ming Lin; +Cc: qemu block, qemu-devel, linux-nvme, virtualization
On 24/11/2015 07:29, Ming Lin wrote:
>> Here is new performance number:
>>
>> qemu-nvme + google-ext + eventfd: 294MB/s
>> virtio-blk: 344MB/s
>> virtio-scsi: 296MB/s
>>
>> It's almost same as virtio-scsi. Nice.
Pretty good indeed.
> Looks like "regular MMIO" runs in vcpu thread, while "eventfd MMIO" runs
> in the main loop thread.
>
> Could you help to explain why eventfd MMIO gets better performance?
Because VCPU latency is really everything if the I/O is very fast _or_
if the queue depth is high; signaling an eventfd is cheap enough to give
a noticeable boost in VCPU latency. Waking up a sleeping process is a
bit expensive, but if you manage to keep the iothread close to 100% CPU,
the main loop thread's poll() is usually quite cheap too.
> call stack: regular MMIO
> ========================
> nvme_mmio_write (qemu/hw/block/nvme.c:921)
> memory_region_write_accessor (qemu/memory.c:451)
> access_with_adjusted_size (qemu/memory.c:506)
> memory_region_dispatch_write (qemu/memory.c:1158)
> address_space_rw (qemu/exec.c:2547)
> kvm_cpu_exec (qemu/kvm-all.c:1849)
> qemu_kvm_cpu_thread_fn (qemu/cpus.c:1050)
> start_thread (pthread_create.c:312)
> clone
>
> call stack: eventfd MMIO
> =========================
> nvme_sq_notifier (qemu/hw/block/nvme.c:598)
> aio_dispatch (qemu/aio-posix.c:329)
> aio_ctx_dispatch (qemu/async.c:232)
> g_main_context_dispatch
> glib_pollfds_poll (qemu/main-loop.c:213)
> os_host_main_loop_wait (qemu/main-loop.c:257)
> main_loop_wait (qemu/main-loop.c:504)
> main_loop (qemu/vl.c:1920)
> main (qemu/vl.c:4682)
> __libc_start_main
For comparison, here is the "iothread+eventfd MMIO" stack
nvme_sq_notifier (qemu/hw/block/nvme.c:598)
aio_dispatch (qemu/aio-posix.c:329)
aio_poll (qemu/aio-posix.c:474)
iothread_run (qemu/iothread.c:170)
__libc_start_main
aio_poll is much more specialized than the main thread (which uses glib
and thus wraps aio_poll with a GSource adapter), and can be faster too.
(That said, things are still a bit in flux here. 2.6 will have pretty
heavy changes in this area, but the API will be the same).
Even more performance can be squeezed by adding a little bit of busy
waiting to aio_poll() before going to the blocking poll(). This avoids
very short idling and can improve things even more.
BTW, you may want to Cc qemu-block@nongnu.org in addition to
qemu-devel@nongnu.org. Most people are on both lists, but some notice
things faster if you write to the lower-traffic qemu-block mailing list.
Paolo
^ permalink raw reply [flat|nested] 33+ messages in thread