* [PULL 1/6] hw/nvme: Implement shadow doorbell buffer support
2022-07-15 8:43 [PULL 0/6] hw/nvme updates Klaus Jensen
@ 2022-07-15 8:43 ` Klaus Jensen
2022-07-15 8:43 ` [PULL 2/6] hw/nvme: Add trace events for shadow doorbell buffer Klaus Jensen
` (5 subsequent siblings)
6 siblings, 0 replies; 17+ messages in thread
From: Klaus Jensen @ 2022-07-15 8:43 UTC (permalink / raw)
To: Peter Maydell, qemu-devel
Cc: Hanna Reitz, Fam Zheng, Klaus Jensen, Kevin Wolf, Keith Busch,
Philippe Mathieu-Daudé, qemu-block, Stefan Hajnoczi,
Jinhao Fan, Klaus Jensen
From: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Implement the Doorbell Buffer Config command (Section 5.7 in NVMe Spec 1.3)
and Shadow Doorbell buffer & EventIdx buffer handling logic (Section 7.13
in NVMe Spec 1.3). For queues created before the Doorbell Buffer Config
command, the nvme_dbbuf_config function tries to associate each existing
SQ and CQ with its Shadow Doorbell buffer and EventIdx buffer address.
Queues created after the Doorbell Buffer Config command will have the
doorbell buffers associated with them when they are initialized.
In nvme_process_sq and nvme_post_cqe, proactively check for Shadow
Doorbell buffer changes instead of waiting for doorbell register changes.
This reduces the number of MMIOs.
In nvme_process_db(), update the shadow doorbell buffer value with
the doorbell register value if it is the admin queue. This is a hack
since hosts like Linux NVMe driver and SPDK do not use shadow
doorbell buffer for the admin queue. Copying the doorbell register
value to the shadow doorbell buffer allows us to support these hosts
as well as spec-compliant hosts that use shadow doorbell buffer for
the admin queue.
Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
[k.jensen: rebased]
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
hw/nvme/ctrl.c | 115 ++++++++++++++++++++++++++++++++++++++++++-
hw/nvme/nvme.h | 8 +++
include/block/nvme.h | 2 +
3 files changed, 124 insertions(+), 1 deletion(-)
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index ca335dd7da6d..46e8d54ef07a 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -264,6 +264,7 @@ static const uint32_t nvme_cse_acs[256] = {
[NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
[NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
[NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP,
+ [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP,
[NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};
@@ -1330,6 +1331,12 @@ static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
}
}
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+ pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
+ sizeof(cq->head));
+}
+
static void nvme_post_cqes(void *opaque)
{
NvmeCQueue *cq = opaque;
@@ -1342,6 +1349,10 @@ static void nvme_post_cqes(void *opaque)
NvmeSQueue *sq;
hwaddr addr;
+ if (n->dbbuf_enabled) {
+ nvme_update_cq_head(cq);
+ }
+
if (nvme_cq_full(cq)) {
break;
}
@@ -4287,6 +4298,11 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
}
sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+ if (n->dbbuf_enabled) {
+ sq->db_addr = n->dbbuf_dbs + (sqid << 3);
+ sq->ei_addr = n->dbbuf_eis + (sqid << 3);
+ }
+
assert(n->cq[cqid]);
cq = n->cq[cqid];
QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
@@ -4645,6 +4661,10 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
cq->head = cq->tail = 0;
QTAILQ_INIT(&cq->req_list);
QTAILQ_INIT(&cq->sq_list);
+ if (n->dbbuf_enabled) {
+ cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
+ cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
+ }
n->cq[cqid] = cq;
cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
}
@@ -5988,6 +6008,50 @@ static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
}
}
+static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
+{
+ uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
+ uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
+ int i;
+
+ /* Address should be page aligned */
+ if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ /* Save shadow buffer base addr for use during queue creation */
+ n->dbbuf_dbs = dbs_addr;
+ n->dbbuf_eis = eis_addr;
+ n->dbbuf_enabled = true;
+
+ for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
+ NvmeSQueue *sq = n->sq[i];
+ NvmeCQueue *cq = n->cq[i];
+
+ if (sq) {
+ /*
+ * CAP.DSTRD is 0, so offset of ith sq db_addr is (i<<3)
+ * nvme_process_db() uses this hard-coded way to calculate
+ * doorbell offsets. Be consistent with that here.
+ */
+ sq->db_addr = dbs_addr + (i << 3);
+ sq->ei_addr = eis_addr + (i << 3);
+ pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
+ sizeof(sq->tail));
+ }
+
+ if (cq) {
+ /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
+ cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
+ cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
+ pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
+ sizeof(cq->head));
+ }
+ }
+
+ return NVME_SUCCESS;
+}
+
static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
{
trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
@@ -6032,6 +6096,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
return nvme_ns_attachment(n, req);
case NVME_ADM_CMD_VIRT_MNGMT:
return nvme_virt_mngmt(n, req);
+ case NVME_ADM_CMD_DBBUF_CONFIG:
+ return nvme_dbbuf_config(n, req);
case NVME_ADM_CMD_FORMAT_NVM:
return nvme_format(n, req);
default:
@@ -6041,6 +6107,18 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
return NVME_INVALID_OPCODE | NVME_DNR;
}
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+ pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
+ sizeof(sq->tail));
+}
+
+static void nvme_update_sq_tail(NvmeSQueue *sq)
+{
+ pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
+ sizeof(sq->tail));
+}
+
static void nvme_process_sq(void *opaque)
{
NvmeSQueue *sq = opaque;
@@ -6052,6 +6130,10 @@ static void nvme_process_sq(void *opaque)
NvmeCmd cmd;
NvmeRequest *req;
+ if (n->dbbuf_enabled) {
+ nvme_update_sq_tail(sq);
+ }
+
while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
addr = sq->dma_addr + sq->head * n->sqe_size;
if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
@@ -6075,6 +6157,11 @@ static void nvme_process_sq(void *opaque)
req->status = status;
nvme_enqueue_req_completion(cq, req);
}
+
+ if (n->dbbuf_enabled) {
+ nvme_update_sq_eventidx(sq);
+ nvme_update_sq_tail(sq);
+ }
}
}
@@ -6184,6 +6271,10 @@ static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
stl_le_p(&n->bar.intms, 0);
stl_le_p(&n->bar.intmc, 0);
stl_le_p(&n->bar.cc, 0);
+
+ n->dbbuf_dbs = 0;
+ n->dbbuf_eis = 0;
+ n->dbbuf_enabled = false;
}
static void nvme_ctrl_shutdown(NvmeCtrl *n)
@@ -6694,6 +6785,10 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
start_sqs = nvme_cq_full(cq) ? 1 : 0;
cq->head = new_head;
+ if (!qid && n->dbbuf_enabled) {
+ pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
+ sizeof(cq->head));
+ }
if (start_sqs) {
NvmeSQueue *sq;
QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
@@ -6751,6 +6846,23 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
sq->tail = new_tail;
+ if (!qid && n->dbbuf_enabled) {
+ /*
+ * The spec states "the host shall also update the controller's
+ * corresponding doorbell property to match the value of that entry
+ * in the Shadow Doorbell buffer."
+ *
+ * Since this context is currently a VM trap, we can safely enforce
+ * the requirement from the device side in case the host is
+ * misbehaving.
+ *
+ * Note, we shouldn't have to do this, but various drivers
+ * including ones that run on Linux, are not updating Admin Queues,
+ * so we can't trust reading it for an appropriate sq tail.
+ */
+ pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
+ sizeof(sq->tail));
+ }
timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
}
@@ -7231,7 +7343,8 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
id->mdts = n->params.mdts;
id->ver = cpu_to_le32(NVME_SPEC_VER);
- id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
+ id->oacs =
+ cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF);
id->cntrltype = 0x1;
/*
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 99437d39bb51..0711b9748c28 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -341,6 +341,7 @@ static inline const char *nvme_adm_opc_str(uint8_t opc)
case NVME_ADM_CMD_ASYNC_EV_REQ: return "NVME_ADM_CMD_ASYNC_EV_REQ";
case NVME_ADM_CMD_NS_ATTACHMENT: return "NVME_ADM_CMD_NS_ATTACHMENT";
case NVME_ADM_CMD_VIRT_MNGMT: return "NVME_ADM_CMD_VIRT_MNGMT";
+ case NVME_ADM_CMD_DBBUF_CONFIG: return "NVME_ADM_CMD_DBBUF_CONFIG";
case NVME_ADM_CMD_FORMAT_NVM: return "NVME_ADM_CMD_FORMAT_NVM";
default: return "NVME_ADM_CMD_UNKNOWN";
}
@@ -372,6 +373,8 @@ typedef struct NvmeSQueue {
uint32_t tail;
uint32_t size;
uint64_t dma_addr;
+ uint64_t db_addr;
+ uint64_t ei_addr;
QEMUTimer *timer;
NvmeRequest *io_req;
QTAILQ_HEAD(, NvmeRequest) req_list;
@@ -389,6 +392,8 @@ typedef struct NvmeCQueue {
uint32_t vector;
uint32_t size;
uint64_t dma_addr;
+ uint64_t db_addr;
+ uint64_t ei_addr;
QEMUTimer *timer;
QTAILQ_HEAD(, NvmeSQueue) sq_list;
QTAILQ_HEAD(, NvmeRequest) req_list;
@@ -445,6 +450,9 @@ typedef struct NvmeCtrl {
uint8_t smart_critical_warning;
uint32_t conf_msix_qsize;
uint32_t conf_ioqpairs;
+ uint64_t dbbuf_dbs;
+ uint64_t dbbuf_eis;
+ bool dbbuf_enabled;
struct {
MemoryRegion mem;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 373c70b5ca7f..351fd44ca8ca 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -596,6 +596,7 @@ enum NvmeAdminCommands {
NVME_ADM_CMD_DOWNLOAD_FW = 0x11,
NVME_ADM_CMD_NS_ATTACHMENT = 0x15,
NVME_ADM_CMD_VIRT_MNGMT = 0x1c,
+ NVME_ADM_CMD_DBBUF_CONFIG = 0x7c,
NVME_ADM_CMD_FORMAT_NVM = 0x80,
NVME_ADM_CMD_SECURITY_SEND = 0x81,
NVME_ADM_CMD_SECURITY_RECV = 0x82,
@@ -1141,6 +1142,7 @@ enum NvmeIdCtrlOacs {
NVME_OACS_FORMAT = 1 << 1,
NVME_OACS_FW = 1 << 2,
NVME_OACS_NS_MGMT = 1 << 3,
+ NVME_OACS_DBBUF = 1 << 8,
};
enum NvmeIdCtrlOncs {
--
2.36.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PULL 2/6] hw/nvme: Add trace events for shadow doorbell buffer
2022-07-15 8:43 [PULL 0/6] hw/nvme updates Klaus Jensen
2022-07-15 8:43 ` [PULL 1/6] hw/nvme: Implement shadow doorbell buffer support Klaus Jensen
@ 2022-07-15 8:43 ` Klaus Jensen
2022-07-15 8:43 ` [PULL 3/6] hw/nvme: fix example serial in documentation Klaus Jensen
` (4 subsequent siblings)
6 siblings, 0 replies; 17+ messages in thread
From: Klaus Jensen @ 2022-07-15 8:43 UTC (permalink / raw)
To: Peter Maydell, qemu-devel
Cc: Hanna Reitz, Fam Zheng, Klaus Jensen, Kevin Wolf, Keith Busch,
Philippe Mathieu-Daudé, qemu-block, Stefan Hajnoczi,
Jinhao Fan, Klaus Jensen
From: Jinhao Fan <fanjinhao21s@ict.ac.cn>
When shadow doorbell buffer is enabled, doorbell registers are lazily
updated. The actual queue head and tail pointers are stored in Shadow
Doorbell buffers.
Add trace events for updates on the Shadow Doorbell buffers and EventIdx
buffers. Also add trace event for the Doorbell Buffer Config command.
Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
[k.jensen: rebased]
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
hw/nvme/ctrl.c | 5 +++++
hw/nvme/trace-events | 5 +++++
2 files changed, 10 insertions(+)
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 46e8d54ef07a..55cb0ba1d591 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1335,6 +1335,7 @@ static void nvme_update_cq_head(NvmeCQueue *cq)
{
pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
sizeof(cq->head));
+ trace_pci_nvme_shadow_doorbell_cq(cq->cqid, cq->head);
}
static void nvme_post_cqes(void *opaque)
@@ -6049,6 +6050,8 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
}
}
+ trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
+
return NVME_SUCCESS;
}
@@ -6111,12 +6114,14 @@ static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
{
pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
sizeof(sq->tail));
+ trace_pci_nvme_eventidx_sq(sq->sqid, sq->tail);
}
static void nvme_update_sq_tail(NvmeSQueue *sq)
{
pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
sizeof(sq->tail));
+ trace_pci_nvme_shadow_doorbell_sq(sq->sqid, sq->tail);
}
static void nvme_process_sq(void *opaque)
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index 065e1c891df4..fccb79f48973 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -3,6 +3,7 @@ pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
pci_nvme_irq_pin(void) "pulsing IRQ pin"
pci_nvme_irq_masked(void) "IRQ is masked"
pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
+pci_nvme_dbbuf_config(uint64_t dbs_addr, uint64_t eis_addr) "dbs_addr=0x%"PRIx64" eis_addr=0x%"PRIx64""
pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
pci_nvme_map_addr_cmb(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
pci_nvme_map_prp(uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
@@ -83,6 +84,8 @@ pci_nvme_enqueue_event_noqueue(int queued) "queued %d"
pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint32_t dw0, uint32_t dw1, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" dw0 0x%"PRIx32" dw1 0x%"PRIx32" status 0x%"PRIx16""
+pci_nvme_eventidx_cq(uint16_t cqid, uint16_t new_eventidx) "cqid %"PRIu16" new_eventidx %"PRIu16""
+pci_nvme_eventidx_sq(uint16_t sqid, uint16_t new_eventidx) "sqid %"PRIu16" new_eventidx %"PRIu16""
pci_nvme_mmio_read(uint64_t addr, unsigned size) "addr 0x%"PRIx64" size %d"
pci_nvme_mmio_write(uint64_t addr, uint64_t data, unsigned size) "addr 0x%"PRIx64" data 0x%"PRIx64" size %d"
pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16""
@@ -99,6 +102,8 @@ pci_nvme_mmio_start_success(void) "setting controller enable bit succeeded"
pci_nvme_mmio_stopped(void) "cleared controller enable bit"
pci_nvme_mmio_shutdown_set(void) "shutdown bit set"
pci_nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
+pci_nvme_shadow_doorbell_cq(uint16_t cqid, uint16_t new_shadow_doorbell) "cqid %"PRIu16" new_shadow_doorbell %"PRIu16""
+pci_nvme_shadow_doorbell_sq(uint16_t sqid, uint16_t new_shadow_doorbell) "sqid %"PRIu16" new_shadow_doorbell %"PRIu16""
pci_nvme_open_zone(uint64_t slba, uint32_t zone_idx, int all) "open zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
pci_nvme_close_zone(uint64_t slba, uint32_t zone_idx, int all) "close zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
pci_nvme_finish_zone(uint64_t slba, uint32_t zone_idx, int all) "finish zone, slba=%"PRIu64", idx=%"PRIu32", all=%"PRIi32""
--
2.36.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PULL 3/6] hw/nvme: fix example serial in documentation
2022-07-15 8:43 [PULL 0/6] hw/nvme updates Klaus Jensen
2022-07-15 8:43 ` [PULL 1/6] hw/nvme: Implement shadow doorbell buffer support Klaus Jensen
2022-07-15 8:43 ` [PULL 2/6] hw/nvme: Add trace events for shadow doorbell buffer Klaus Jensen
@ 2022-07-15 8:43 ` Klaus Jensen
2022-07-15 8:43 ` [PULL 4/6] hw/nvme: force nvme-ns param 'shared' to false if no nvme-subsys node Klaus Jensen
` (3 subsequent siblings)
6 siblings, 0 replies; 17+ messages in thread
From: Klaus Jensen @ 2022-07-15 8:43 UTC (permalink / raw)
To: Peter Maydell, qemu-devel
Cc: Hanna Reitz, Fam Zheng, Klaus Jensen, Kevin Wolf, Keith Busch,
Philippe Mathieu-Daudé, qemu-block, Stefan Hajnoczi,
Niklas Cassel, Klaus Jensen
From: Niklas Cassel <niklas.cassel@wdc.com>
The serial prop on the controller is actually describing the nvme
subsystem serial, which has to be identical for all controllers within
the same nvme subsystem.
This is enforced since commit a859eb9f8f64 ("hw/nvme: enforce common
serial per subsystem").
Fix the documentation, so that people copying the qemu command line
example won't get an error on qemu start.
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
docs/system/devices/nvme.rst | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docs/system/devices/nvme.rst b/docs/system/devices/nvme.rst
index aba253304e46..30f841ef6222 100644
--- a/docs/system/devices/nvme.rst
+++ b/docs/system/devices/nvme.rst
@@ -104,8 +104,8 @@ multipath I/O.
.. code-block:: console
-device nvme-subsys,id=nvme-subsys-0,nqn=subsys0
- -device nvme,serial=a,subsys=nvme-subsys-0
- -device nvme,serial=b,subsys=nvme-subsys-0
+ -device nvme,serial=deadbeef,subsys=nvme-subsys-0
+ -device nvme,serial=deadbeef,subsys=nvme-subsys-0
This will create an NVM subsystem with two controllers. Having controllers
linked to an ``nvme-subsys`` device allows additional ``nvme-ns`` parameters:
--
2.36.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PULL 4/6] hw/nvme: force nvme-ns param 'shared' to false if no nvme-subsys node
2022-07-15 8:43 [PULL 0/6] hw/nvme updates Klaus Jensen
` (2 preceding siblings ...)
2022-07-15 8:43 ` [PULL 3/6] hw/nvme: fix example serial in documentation Klaus Jensen
@ 2022-07-15 8:43 ` Klaus Jensen
2022-07-15 8:43 ` [PULL 5/6] nvme: Fix misleading macro when mixed with ternary operator Klaus Jensen
` (2 subsequent siblings)
6 siblings, 0 replies; 17+ messages in thread
From: Klaus Jensen @ 2022-07-15 8:43 UTC (permalink / raw)
To: Peter Maydell, qemu-devel
Cc: Hanna Reitz, Fam Zheng, Klaus Jensen, Kevin Wolf, Keith Busch,
Philippe Mathieu-Daudé, qemu-block, Stefan Hajnoczi,
Niklas Cassel, Klaus Jensen
From: Niklas Cassel <niklas.cassel@wdc.com>
Since commit 916b0f0b5264 ("hw/nvme: change nvme-ns 'shared' default")
the default value of nvme-ns param 'shared' is set to true, regardless
if there is a nvme-subsys node or not.
On a system without a nvme-subsys node, a namespace will never be able
to be attached to more than one controller, so for this configuration,
it is counterintuitive for this parameter to be set by default.
Force the nvme-ns param 'shared' to false for configurations where
there is no nvme-subsys node, as the namespace will never be able to
attach to more than one controller anyway.
Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
hw/nvme/ns.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 870c3ca1a2f0..62a1f97be010 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -546,6 +546,8 @@ static void nvme_ns_realize(DeviceState *dev, Error **errp)
int i;
if (!n->subsys) {
+ /* If no subsys, the ns cannot be attached to more than one ctrl. */
+ ns->params.shared = false;
if (ns->params.detached) {
error_setg(errp, "detached requires that the nvme device is "
"linked to an nvme-subsys device");
--
2.36.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PULL 5/6] nvme: Fix misleading macro when mixed with ternary operator
2022-07-15 8:43 [PULL 0/6] hw/nvme updates Klaus Jensen
` (3 preceding siblings ...)
2022-07-15 8:43 ` [PULL 4/6] hw/nvme: force nvme-ns param 'shared' to false if no nvme-subsys node Klaus Jensen
@ 2022-07-15 8:43 ` Klaus Jensen
2022-07-15 8:43 ` [PULL 6/6] hw/nvme: Use ioeventfd to handle doorbell updates Klaus Jensen
2022-07-15 19:09 ` [PULL 0/6] hw/nvme updates Peter Maydell
6 siblings, 0 replies; 17+ messages in thread
From: Klaus Jensen @ 2022-07-15 8:43 UTC (permalink / raw)
To: Peter Maydell, qemu-devel
Cc: Hanna Reitz, Fam Zheng, Klaus Jensen, Kevin Wolf, Keith Busch,
Philippe Mathieu-Daudé, qemu-block, Stefan Hajnoczi,
Darren Kenny, Klaus Jensen
From: Darren Kenny <darren.kenny@oracle.com>
Using the Parfait source code analyser, an issue was found in
hw/nvme/ctrl.c where the macros NVME_CAP_SET_CMBS and NVME_CAP_SET_PMRS
are called with a ternary operator in the second parameter, resulting
in a potentially unexpected expansion of the form:
x ? a: b & FLAG_TEST
which will result in a different result to:
(x ? a: b) & FLAG_TEST.
The macros should wrap each of the parameters in brackets to ensure the
correct result on expansion.
Signed-off-by: Darren Kenny <darren.kenny@oracle.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
include/block/nvme.h | 44 ++++++++++++++++++++++----------------------
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 351fd44ca8ca..8027b7126bda 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -98,28 +98,28 @@ enum NvmeCapMask {
#define NVME_CAP_PMRS(cap) (((cap) >> CAP_PMRS_SHIFT) & CAP_PMRS_MASK)
#define NVME_CAP_CMBS(cap) (((cap) >> CAP_CMBS_SHIFT) & CAP_CMBS_MASK)
-#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \
- << CAP_MQES_SHIFT)
-#define NVME_CAP_SET_CQR(cap, val) (cap |= (uint64_t)(val & CAP_CQR_MASK) \
- << CAP_CQR_SHIFT)
-#define NVME_CAP_SET_AMS(cap, val) (cap |= (uint64_t)(val & CAP_AMS_MASK) \
- << CAP_AMS_SHIFT)
-#define NVME_CAP_SET_TO(cap, val) (cap |= (uint64_t)(val & CAP_TO_MASK) \
- << CAP_TO_SHIFT)
-#define NVME_CAP_SET_DSTRD(cap, val) (cap |= (uint64_t)(val & CAP_DSTRD_MASK) \
- << CAP_DSTRD_SHIFT)
-#define NVME_CAP_SET_NSSRS(cap, val) (cap |= (uint64_t)(val & CAP_NSSRS_MASK) \
- << CAP_NSSRS_SHIFT)
-#define NVME_CAP_SET_CSS(cap, val) (cap |= (uint64_t)(val & CAP_CSS_MASK) \
- << CAP_CSS_SHIFT)
-#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\
- << CAP_MPSMIN_SHIFT)
-#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
- << CAP_MPSMAX_SHIFT)
-#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMRS_MASK) \
- << CAP_PMRS_SHIFT)
-#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMBS_MASK) \
- << CAP_CMBS_SHIFT)
+#define NVME_CAP_SET_MQES(cap, val) \
+ ((cap) |= (uint64_t)((val) & CAP_MQES_MASK) << CAP_MQES_SHIFT)
+#define NVME_CAP_SET_CQR(cap, val) \
+ ((cap) |= (uint64_t)((val) & CAP_CQR_MASK) << CAP_CQR_SHIFT)
+#define NVME_CAP_SET_AMS(cap, val) \
+ ((cap) |= (uint64_t)((val) & CAP_AMS_MASK) << CAP_AMS_SHIFT)
+#define NVME_CAP_SET_TO(cap, val) \
+ ((cap) |= (uint64_t)((val) & CAP_TO_MASK) << CAP_TO_SHIFT)
+#define NVME_CAP_SET_DSTRD(cap, val) \
+ ((cap) |= (uint64_t)((val) & CAP_DSTRD_MASK) << CAP_DSTRD_SHIFT)
+#define NVME_CAP_SET_NSSRS(cap, val) \
+ ((cap) |= (uint64_t)((val) & CAP_NSSRS_MASK) << CAP_NSSRS_SHIFT)
+#define NVME_CAP_SET_CSS(cap, val) \
+ ((cap) |= (uint64_t)((val) & CAP_CSS_MASK) << CAP_CSS_SHIFT)
+#define NVME_CAP_SET_MPSMIN(cap, val) \
+ ((cap) |= (uint64_t)((val) & CAP_MPSMIN_MASK) << CAP_MPSMIN_SHIFT)
+#define NVME_CAP_SET_MPSMAX(cap, val) \
+ ((cap) |= (uint64_t)((val) & CAP_MPSMAX_MASK) << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val) \
+ ((cap) |= (uint64_t)((val) & CAP_PMRS_MASK) << CAP_PMRS_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val) \
+ ((cap) |= (uint64_t)((val) & CAP_CMBS_MASK) << CAP_CMBS_SHIFT)
enum NvmeCapCss {
NVME_CAP_CSS_NVM = 1 << 0,
--
2.36.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* [PULL 6/6] hw/nvme: Use ioeventfd to handle doorbell updates
2022-07-15 8:43 [PULL 0/6] hw/nvme updates Klaus Jensen
` (4 preceding siblings ...)
2022-07-15 8:43 ` [PULL 5/6] nvme: Fix misleading macro when mixed with ternary operator Klaus Jensen
@ 2022-07-15 8:43 ` Klaus Jensen
2022-07-15 19:09 ` [PULL 0/6] hw/nvme updates Peter Maydell
6 siblings, 0 replies; 17+ messages in thread
From: Klaus Jensen @ 2022-07-15 8:43 UTC (permalink / raw)
To: Peter Maydell, qemu-devel
Cc: Hanna Reitz, Fam Zheng, Klaus Jensen, Kevin Wolf, Keith Busch,
Philippe Mathieu-Daudé, qemu-block, Stefan Hajnoczi,
Jinhao Fan, Klaus Jensen
From: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Add property "ioeventfd" which is enabled by default. When this is
enabled, updates on the doorbell registers will cause KVM to signal
an event to the QEMU main loop to handle the doorbell updates.
Therefore, instead of letting the vcpu thread run both guest VM and
IO emulation, we now use the main loop thread to do IO emulation and
thus the vcpu thread has more cycles for the guest VM.
Since ioeventfd does not tell us the exact value that is written, it is
only useful when shadow doorbell buffer is enabled, where we check
for the value in the shadow doorbell buffer when we get the doorbell
update event.
IOPS comparison on Linux 5.19-rc2: (Unit: KIOPS)
qd 1 4 16 64
qemu 35 121 176 153
ioeventfd 41 133 258 313
Changes since v3:
- Do not deregister ioeventfd when it was not enabled on a SQ/CQ
Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
hw/nvme/ctrl.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++-
hw/nvme/nvme.h | 5 +++
2 files changed, 117 insertions(+), 1 deletion(-)
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 55cb0ba1d591..533ad14e7a61 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1400,7 +1400,14 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
- timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+
+ if (req->sq->ioeventfd_enabled) {
+ /* Post CQE directly since we are in main loop thread */
+ nvme_post_cqes(cq);
+ } else {
+ /* Schedule the timer to post CQE later since we are in vcpu thread */
+ timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
+ }
}
static void nvme_process_aers(void *opaque)
@@ -4226,10 +4233,82 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
return NVME_INVALID_OPCODE | NVME_DNR;
}
+static void nvme_cq_notifier(EventNotifier *e)
+{
+ NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
+ NvmeCtrl *n = cq->ctrl;
+
+ event_notifier_test_and_clear(&cq->notifier);
+
+ nvme_update_cq_head(cq);
+
+ if (cq->tail == cq->head) {
+ if (cq->irq_enabled) {
+ n->cq_pending--;
+ }
+
+ nvme_irq_deassert(n, cq);
+ }
+
+ nvme_post_cqes(cq);
+}
+
+static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
+{
+ NvmeCtrl *n = cq->ctrl;
+ uint16_t offset = (cq->cqid << 3) + (1 << 2);
+ int ret;
+
+ ret = event_notifier_init(&cq->notifier, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
+ memory_region_add_eventfd(&n->iomem,
+ 0x1000 + offset, 4, false, 0, &cq->notifier);
+
+ return 0;
+}
+
+static void nvme_sq_notifier(EventNotifier *e)
+{
+ NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
+
+ event_notifier_test_and_clear(&sq->notifier);
+
+ nvme_process_sq(sq);
+}
+
+static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
+{
+ NvmeCtrl *n = sq->ctrl;
+ uint16_t offset = sq->sqid << 3;
+ int ret;
+
+ ret = event_notifier_init(&sq->notifier, 0);
+ if (ret < 0) {
+ return ret;
+ }
+
+ event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
+ memory_region_add_eventfd(&n->iomem,
+ 0x1000 + offset, 4, false, 0, &sq->notifier);
+
+ return 0;
+}
+
static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
{
+ uint16_t offset = sq->sqid << 3;
+
n->sq[sq->sqid] = NULL;
timer_free(sq->timer);
+ if (sq->ioeventfd_enabled) {
+ memory_region_del_eventfd(&n->iomem,
+ 0x1000 + offset, 4, false, 0, &sq->notifier);
+ event_notifier_cleanup(&sq->notifier);
+ }
g_free(sq->io_req);
if (sq->sqid) {
g_free(sq);
@@ -4302,6 +4381,12 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
if (n->dbbuf_enabled) {
sq->db_addr = n->dbbuf_dbs + (sqid << 3);
sq->ei_addr = n->dbbuf_eis + (sqid << 3);
+
+ if (n->params.ioeventfd && sq->sqid != 0) {
+ if (!nvme_init_sq_ioeventfd(sq)) {
+ sq->ioeventfd_enabled = true;
+ }
+ }
}
assert(n->cq[cqid]);
@@ -4605,8 +4690,15 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
+ uint16_t offset = (cq->cqid << 3) + (1 << 2);
+
n->cq[cq->cqid] = NULL;
timer_free(cq->timer);
+ if (cq->ioeventfd_enabled) {
+ memory_region_del_eventfd(&n->iomem,
+ 0x1000 + offset, 4, false, 0, &cq->notifier);
+ event_notifier_cleanup(&cq->notifier);
+ }
if (msix_enabled(&n->parent_obj)) {
msix_vector_unuse(&n->parent_obj, cq->vector);
}
@@ -4665,6 +4757,12 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
if (n->dbbuf_enabled) {
cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
+
+ if (n->params.ioeventfd && cqid != 0) {
+ if (!nvme_init_cq_ioeventfd(cq)) {
+ cq->ioeventfd_enabled = true;
+ }
+ }
}
n->cq[cqid] = cq;
cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -6039,6 +6137,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
sq->ei_addr = eis_addr + (i << 3);
pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
sizeof(sq->tail));
+
+ if (n->params.ioeventfd && sq->sqid != 0) {
+ if (!nvme_init_sq_ioeventfd(sq)) {
+ sq->ioeventfd_enabled = true;
+ }
+ }
}
if (cq) {
@@ -6047,6 +6151,12 @@ static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
sizeof(cq->head));
+
+ if (n->params.ioeventfd && cq->cqid != 0) {
+ if (!nvme_init_cq_ioeventfd(cq)) {
+ cq->ioeventfd_enabled = true;
+ }
+ }
}
}
@@ -7554,6 +7664,7 @@ static Property nvme_props[] = {
DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
+ DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, true),
DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
params.auto_transition_zones, true),
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 0711b9748c28..79f5c281c223 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -376,6 +376,8 @@ typedef struct NvmeSQueue {
uint64_t db_addr;
uint64_t ei_addr;
QEMUTimer *timer;
+ EventNotifier notifier;
+ bool ioeventfd_enabled;
NvmeRequest *io_req;
QTAILQ_HEAD(, NvmeRequest) req_list;
QTAILQ_HEAD(, NvmeRequest) out_req_list;
@@ -395,6 +397,8 @@ typedef struct NvmeCQueue {
uint64_t db_addr;
uint64_t ei_addr;
QEMUTimer *timer;
+ EventNotifier notifier;
+ bool ioeventfd_enabled;
QTAILQ_HEAD(, NvmeSQueue) sq_list;
QTAILQ_HEAD(, NvmeRequest) req_list;
} NvmeCQueue;
@@ -417,6 +421,7 @@ typedef struct NvmeParams {
uint8_t zasl;
bool auto_transition_zones;
bool legacy_cmb;
+ bool ioeventfd;
uint8_t sriov_max_vfs;
uint16_t sriov_vq_flexible;
uint16_t sriov_vi_flexible;
--
2.36.1
^ permalink raw reply related [flat|nested] 17+ messages in thread
* Re: [PULL 0/6] hw/nvme updates
2022-07-15 8:43 [PULL 0/6] hw/nvme updates Klaus Jensen
` (5 preceding siblings ...)
2022-07-15 8:43 ` [PULL 6/6] hw/nvme: Use ioeventfd to handle doorbell updates Klaus Jensen
@ 2022-07-15 19:09 ` Peter Maydell
6 siblings, 0 replies; 17+ messages in thread
From: Peter Maydell @ 2022-07-15 19:09 UTC (permalink / raw)
To: Klaus Jensen
Cc: qemu-devel, Hanna Reitz, Fam Zheng, Kevin Wolf, Keith Busch,
Philippe Mathieu-Daudé, qemu-block, Stefan Hajnoczi,
Klaus Jensen
On Fri, 15 Jul 2022 at 09:43, Klaus Jensen <its@irrelevant.dk> wrote:
>
> From: Klaus Jensen <k.jensen@samsung.com>
>
> Hi,
>
> The following changes since commit 8482ab545e52f50facacfe1118b22b97462724ab:
>
> Merge tag 'qga-win32-pull-2022-07-13' of github.com:kostyanf14/qemu into staging (2022-07-14 14:52:16 +0100)
>
> are available in the Git repository at:
>
> git://git.infradead.org/qemu-nvme.git tags/nvme-next-pull-request
>
> for you to fetch changes up to 2e53b0b450246044efd27418c5d05ad6919deb87:
>
> hw/nvme: Use ioeventfd to handle doorbell updates (2022-07-15 10:40:33 +0200)
>
> ----------------------------------------------------------------
> hw/nvme updates
>
> performance improvements by Jinhao
> ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> * shadow doorbells
> * ioeventfd
>
> plus some misc fixes (Darren, Niklas).
>
Applied, thanks.
Please update the changelog at https://wiki.qemu.org/ChangeLog/7.1
for any user-visible changes.
-- PMM
^ permalink raw reply [flat|nested] 17+ messages in thread