* [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification
@ 2018-06-22 11:22 Shimi Gersner
2018-06-22 11:22 ` [Qemu-devel] [PATCH 2/5] nvme: CQ/SQ proper validation & status code Shimi Gersner
` (5 more replies)
0 siblings, 6 replies; 12+ messages in thread
From: Shimi Gersner @ 2018-06-22 11:22 UTC (permalink / raw)
To: qemu-block, qemu-devel
Cc: Keith Busch, Kevin Wolf, Max Reitz, David Sariel, Shimi Gersner
PCI/e configuration currently does not meet the specification.
The patch includes various configuration changes to support the specification:
- BAR2 to return zero when read and CMD.IOSE is not set.
- Expose NVME configuration through IO space (Optional).
- PCI Power Management v1.2.
- PCIe Function Level Reset.
- Disable QEMUs default use of PCIe Link Status (DLLLA).
- PCIe missing AOC compliance flag.
- Mask PCIe End-to-End TLP as RO (Unspecified by specification).
Change-Id: I9057f56266db16b013fa194730c998d4779cede3
Signed-off-by: Shimi Gersner <gersner@gmail.com>
---
hw/block/nvme.c | 59 +++++++++++++++++++++++++++++++++++++--
include/hw/pci/pci_regs.h | 1 +
2 files changed, 58 insertions(+), 2 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d5bf95b79b..9d5414c80f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -25,6 +25,7 @@
*/
#include "qemu/osdep.h"
+#include "qemu/range.h"
#include "hw/block/block.h"
#include "hw/hw.h"
#include "hw/pci/msix.h"
@@ -39,6 +40,9 @@
#include "trace.h"
#include "nvme.h"
+#define PCI_EXP_LNKCAP_AOC 0x00400000 /* ASPM Optionality Compliance (AOC) */
+#define PCI_EXP_DEVCAP2_CTDS 0x10 /* Completion Timeout Disable Supported (CTDS) */
+
#define NVME_GUEST_ERR(trace, fmt, ...) \
do { \
(trace_##trace)(__VA_ARGS__); \
@@ -1195,14 +1199,31 @@ static const MemoryRegionOps nvme_cmb_ops = {
},
};
+static uint32_t nvme_pci_read_config(PCIDevice *pci_dev, uint32_t addr, int len)
+{
+ uint32_t val; /* Value to be returned */
+
+ val = pci_default_read_config(pci_dev, addr, len);
+ if (ranges_overlap(addr, len, PCI_BASE_ADDRESS_2, 4) && (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_IO))) {
+ /* When CMD.IOSE is not set */
+ val = 0 ;
+ }
+
+ return val;
+}
+
static void nvme_realize(PCIDevice *pci_dev, Error **errp)
{
+ static const uint16_t nvme_pm_offset = 0x80;
+ static const uint16_t nvme_pcie_offset = nvme_pm_offset + PCI_PM_SIZEOF;
+
NvmeCtrl *n = NVME(pci_dev);
NvmeIdCtrl *id = &n->id_ctrl;
int i;
int64_t bs_size;
uint8_t *pci_conf;
+ uint8_t *pci_wmask;
if (!n->conf.blk) {
error_setg(errp, "drive property not set");
@@ -1226,14 +1247,43 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
}
pci_conf = pci_dev->config;
+ pci_wmask = pci_dev->wmask;
pci_conf[PCI_INTERRUPT_PIN] = 1;
pci_config_set_prog_interface(pci_dev->config, 0x2);
pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
- pcie_endpoint_cap_init(&n->parent_obj, 0x80);
+
+ // Configure the PMC capability
+ (void)pci_add_capability(pci_dev, PCI_CAP_ID_PM, nvme_pm_offset, PCI_PM_SIZEOF, errp);
+ if (NULL != *errp) {
+ return;
+ }
+
+ // - PCI Power Management v1.2, No PME support, No Soft Reset, Make state writeable
+ pci_set_word(pci_conf + nvme_pm_offset + PCI_PM_PMC, PCI_PM_CAP_VER_1_2);
+ pci_set_word(pci_conf + nvme_pm_offset + PCI_PM_CTRL, PCI_PM_CTRL_NO_SOFT_RESET);
+ pci_set_word(pci_wmask + nvme_pm_offset + PCI_PM_CTRL, PCI_PM_CTRL_STATE_MASK);
+
+ // Disable QEMU default QEMU_PCIE_LNKSTA_DLLLA to disabled active flag in the Link Status Register of PCIE
+ pci_dev->cap_present &= ~(QEMU_PCIE_LNKSTA_DLLLA);
+
+ // PCIE Capability
+ pcie_endpoint_cap_init(&n->parent_obj, nvme_pcie_offset);
+
+ // PCIE Function Level Reset (FLRC) as required by 1.2 spec
+ pcie_cap_flr_init(&n->parent_obj);
+
+ // PCIE Configured with L0s by default by QEMU, configure missing AOC flag required by compliance
+ pci_long_test_and_set_mask(pci_conf + pci_dev->exp.exp_cap + PCI_EXP_LNKCAP, PCI_EXP_LNKCAP_AOC);
+
+ // Compliance requires Completion Timeout Disable Supported (CTDS).
+ pci_long_test_and_set_mask(pci_conf + pci_dev->exp.exp_cap + PCI_EXP_DEVCAP2, PCI_EXP_DEVCAP2_CTDS);
+
+ // Make the End-End TLP Prefix readonly as NVME spec doesnt acknowledge this field
+ pci_word_test_and_clear_mask(pci_wmask + pci_dev->exp.exp_cap + PCI_EXP_DEVCTL2, PCI_EXP_DEVCTL2_EETLPPB);
n->num_namespaces = 1;
n->num_queues = 64;
- n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
+ n->reg_size = pow2ceil(0x2000 + 2 * (n->num_queues + 1) * 4);
n->ns_size = bs_size / (uint64_t)n->num_namespaces;
n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
@@ -1245,6 +1295,10 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
pci_register_bar(&n->parent_obj, 0,
PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
&n->iomem);
+
+ // Expose the NVME memory through Address Space IO (Optional by spec)
+ pci_register_bar(&n->parent_obj, 2, PCI_BASE_ADDRESS_SPACE_IO, &n->iomem);
+
msix_init_exclusive_bar(&n->parent_obj, n->num_queues, 4, NULL);
id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
@@ -1355,6 +1409,7 @@ static void nvme_class_init(ObjectClass *oc, void *data)
PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
pc->realize = nvme_realize;
+ pc->config_read = nvme_pci_read_config;
pc->exit = nvme_exit;
pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
pc->vendor_id = PCI_VENDOR_ID_INTEL;
diff --git a/include/hw/pci/pci_regs.h b/include/hw/pci/pci_regs.h
index 7a83142578..54dc918f54 100644
--- a/include/hw/pci/pci_regs.h
+++ b/include/hw/pci/pci_regs.h
@@ -1,3 +1,4 @@
#include "standard-headers/linux/pci_regs.h"
#define PCI_PM_CAP_VER_1_1 0x0002 /* PCI PM spec ver. 1.1 */
+#define PCI_PM_CAP_VER_1_2 0x0003 /* PCI PM spec ver. 1.2 */
--
2.17.1
* [Qemu-devel] [PATCH 2/5] nvme: CQ/SQ proper validation & status code
2018-06-22 11:22 [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification Shimi Gersner
@ 2018-06-22 11:22 ` Shimi Gersner
2018-06-22 11:22 ` [Qemu-devel] [PATCH 3/5] nvme: Proper state handling on enable/disable Shimi Gersner
` (4 subsequent siblings)
5 siblings, 0 replies; 12+ messages in thread
From: Shimi Gersner @ 2018-06-22 11:22 UTC (permalink / raw)
To: qemu-block, qemu-devel
Cc: Keith Busch, Kevin Wolf, Max Reitz, David Sariel, Shimi Gersner
The device fails to properly validate CQ/SQ ids.
nvme_check_[cs]id was used both to validate the id and to check
whether the id is in use. The function was split into two separate
functions, which are now used appropriately on CQ/SQ creation/deletion.
When the id check fails, a proper error is returned as defined
by the specification.
Additionally, CQ creation failed to properly check the irq vector number.
Change-Id: I3b6d8179ce567be4cd064c0be0ed69a740708096
Signed-off-by: Shimi Gersner <gersner@gmail.com>
---
hw/block/nvme.c | 40 +++++++++++++++++++++++++---------------
1 file changed, 25 insertions(+), 15 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 9d5414c80f..24a51d33ea 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -62,14 +62,24 @@ static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
}
}
-static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
+static int nvme_valid_sqid(NvmeCtrl *n, uint16_t sqid)
{
- return sqid < n->num_queues && n->sq[sqid] != NULL ? 0 : -1;
+ return sqid < n->num_queues;
}
-static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
+static int nvme_used_sqid(NvmeCtrl *n, uint16_t sqid)
{
- return cqid < n->num_queues && n->cq[cqid] != NULL ? 0 : -1;
+ return sqid < n->num_queues && n->sq[sqid] != NULL ? 1 : 0;
+}
+
+static int nvme_valid_cqid(NvmeCtrl *n, uint16_t cqid)
+{
+ return cqid < n->num_queues;
+}
+
+static int nvme_used_cqid(NvmeCtrl *n, uint16_t cqid)
+{
+ return cqid < n->num_queues && n->cq[cqid] != NULL ? 1 : 0;
}
static void nvme_inc_cq_tail(NvmeCQueue *cq)
@@ -433,7 +443,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
NvmeCQueue *cq;
uint16_t qid = le16_to_cpu(c->qid);
- if (unlikely(!qid || nvme_check_sqid(n, qid))) {
+ if (unlikely(!qid || !nvme_used_sqid(n, qid))) {
trace_nvme_err_invalid_del_sq(qid);
return NVME_INVALID_QID | NVME_DNR;
}
@@ -446,7 +456,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
assert(req->aiocb);
blk_aio_cancel(req->aiocb);
}
- if (!nvme_check_cqid(n, sq->cqid)) {
+ if (nvme_used_cqid(n, sq->cqid)) {
cq = n->cq[sq->cqid];
QTAILQ_REMOVE(&cq->sq_list, sq, entry);
@@ -504,11 +514,11 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
trace_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
- if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
+ if (unlikely(!cqid || !nvme_used_cqid(n, cqid))) {
trace_nvme_err_invalid_create_sq_cqid(cqid);
return NVME_INVALID_CQID | NVME_DNR;
}
- if (unlikely(!sqid || !nvme_check_sqid(n, sqid))) {
+ if (unlikely(!sqid || !nvme_valid_sqid(n, sqid) || nvme_used_sqid(n, sqid))) {
trace_nvme_err_invalid_create_sq_sqid(sqid);
return NVME_INVALID_QID | NVME_DNR;
}
@@ -546,9 +556,9 @@ static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
NvmeCQueue *cq;
uint16_t qid = le16_to_cpu(c->qid);
- if (unlikely(!qid || nvme_check_cqid(n, qid))) {
+ if (unlikely(!qid || !nvme_used_cqid(n, qid))) {
trace_nvme_err_invalid_del_cq_cqid(qid);
- return NVME_INVALID_CQID | NVME_DNR;
+ return NVME_INVALID_QID | NVME_DNR;
}
cq = n->cq[qid];
@@ -592,9 +602,9 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
trace_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
NVME_CQ_FLAGS_IEN(qflags) != 0);
- if (unlikely(!cqid || !nvme_check_cqid(n, cqid))) {
+ if (unlikely(!cqid || !nvme_valid_cqid(n, cqid) || nvme_used_cqid(n, cqid))) {
trace_nvme_err_invalid_create_cq_cqid(cqid);
- return NVME_INVALID_CQID | NVME_DNR;
+ return NVME_INVALID_QID | NVME_DNR;
}
if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
trace_nvme_err_invalid_create_cq_size(qsize);
@@ -604,7 +614,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
trace_nvme_err_invalid_create_cq_addr(prp1);
return NVME_INVALID_FIELD | NVME_DNR;
}
- if (unlikely(vector > n->num_queues)) {
+ if (unlikely(vector >= n->num_queues)) {
trace_nvme_err_invalid_create_cq_vector(vector);
return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
}
@@ -1091,7 +1101,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
NvmeCQueue *cq;
qid = (addr - (0x1000 + (1 << 2))) >> 3;
- if (unlikely(nvme_check_cqid(n, qid))) {
+ if (unlikely(!nvme_used_cqid(n, qid))) {
NVME_GUEST_ERR(nvme_ub_db_wr_invalid_cq,
"completion queue doorbell write"
" for nonexistent queue,"
@@ -1129,7 +1139,7 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
NvmeSQueue *sq;
qid = (addr - 0x1000) >> 3;
- if (unlikely(nvme_check_sqid(n, qid))) {
+ if (unlikely(!nvme_used_sqid(n, qid))) {
NVME_GUEST_ERR(nvme_ub_db_wr_invalid_sq,
"submission queue doorbell write"
" for nonexistent queue,"
--
2.17.1
* [Qemu-devel] [PATCH 3/5] nvme: Proper state handling on enable/disable
2018-06-22 11:22 [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification Shimi Gersner
2018-06-22 11:22 ` [Qemu-devel] [PATCH 2/5] nvme: CQ/SQ proper validation & status code Shimi Gersner
@ 2018-06-22 11:22 ` Shimi Gersner
2018-06-22 11:22 ` [Qemu-devel] [PATCH 4/5] nvme: Fix phantom irq raise Shimi Gersner
` (3 subsequent siblings)
5 siblings, 0 replies; 12+ messages in thread
From: Shimi Gersner @ 2018-06-22 11:22 UTC (permalink / raw)
To: qemu-block, qemu-devel
Cc: Keith Busch, Kevin Wolf, Max Reitz, David Sariel, Shimi Gersner
When the device transitions from CC.EN=1 to CC.EN=0 (Controller Reset) it
should properly reset its state; currently the device only partially clears it.
The following actions were added to the controller reset phase:
- Clear CC and CSTS completely.
- Clear and deassert irqs.
- On shutdown, leave the state as is and only signal shutdown completion.
Change-Id: Ia0bc29775b12586c3aab2ca217603051062c3efe
Signed-off-by: Shimi Gersner <gersner@gmail.com>
---
hw/block/nvme.c | 67 ++++++++++++++++++++++++++------------------
include/block/nvme.h | 5 ++++
2 files changed, 44 insertions(+), 28 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 24a51d33ea..206d8428fd 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -805,7 +805,7 @@ static void nvme_process_sq(void *opaque)
}
}
-static void nvme_clear_ctrl(NvmeCtrl *n)
+static void nvme_clear_ctrl(NvmeCtrl *n, bool reset)
{
int i;
@@ -820,8 +820,18 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
}
}
+ if (reset) {
+ // Reset clears all except for AWA, ASW, ACQ
+ n->bar.cc = 0;
+ n->bar.csts = 0;
+ }
+
+ // Update the IRQ status
+ n->bar.intmc = n->bar.intms = 0;
+ n->irq_status = 0;
+ nvme_irq_check(n);
+
blk_flush(n->conf.blk);
- n->bar.cc = 0;
}
static int nvme_start_ctrl(NvmeCtrl *n)
@@ -963,16 +973,14 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
nvme_irq_check(n);
break;
case 0x14: /* CC */
- trace_nvme_mmio_cfg(data & 0xffffffff);
- /* Windows first sends data, then sends enable bit */
- if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
- !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
- {
- n->bar.cc = data;
- }
+ trace_nvme_mmio_cfg(data & NVME_CC_WR_MASK);
+
+ uint32_t previous_cc = n->bar.cc;
+
+ // CC is all writeable
+ n->bar.cc = data & NVME_CC_WR_MASK;
- if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
- n->bar.cc = data;
+ if (NVME_CC_EN(data) && !NVME_CC_EN(previous_cc)) {
if (unlikely(nvme_start_ctrl(n))) {
trace_nvme_err_startfail();
n->bar.csts = NVME_CSTS_FAILED;
@@ -980,21 +988,24 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
trace_nvme_mmio_start_success();
n->bar.csts = NVME_CSTS_READY;
}
- } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
+
+ } else if (!NVME_CC_EN(data) && NVME_CC_EN(previous_cc)) {
trace_nvme_mmio_stopped();
- nvme_clear_ctrl(n);
- n->bar.csts &= ~NVME_CSTS_READY;
+ nvme_clear_ctrl(n, true);
+
}
- if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
+
+ if (NVME_CC_SHN(data) && !(NVME_CC_SHN(previous_cc))) {
trace_nvme_mmio_shutdown_set();
- nvme_clear_ctrl(n);
- n->bar.cc = data;
+ nvme_clear_ctrl(n, false);
n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
- } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
+
+ } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(previous_cc)) {
trace_nvme_mmio_shutdown_cleared();
n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
- n->bar.cc = data;
+
}
+
break;
case 0x1C: /* CSTS */
if (data & (1 << 4)) {
@@ -1016,23 +1027,23 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
}
break;
case 0x24: /* AQA */
- n->bar.aqa = data & 0xffffffff;
- trace_nvme_mmio_aqattr(data & 0xffffffff);
+ n->bar.aqa = data & NVME_AQA_WR_MASK;
+ trace_nvme_mmio_aqattr(n->bar.aqa);
break;
case 0x28: /* ASQ */
- n->bar.asq = data;
- trace_nvme_mmio_asqaddr(data);
+ n->bar.asq = data & NVME_ASQ_WR_MASK;
+ trace_nvme_mmio_asqaddr(n->bar.asq);
break;
case 0x2c: /* ASQ hi */
- n->bar.asq |= data << 32;
+ n->bar.asq |= (data << 32) & NVME_ASQ_WR_MASK;
trace_nvme_mmio_asqaddr_hi(data, n->bar.asq);
break;
case 0x30: /* ACQ */
- trace_nvme_mmio_acqaddr(data);
- n->bar.acq = data;
+ n->bar.acq = data & NVME_ACQ_WR_MASK;
+ trace_nvme_mmio_acqaddr(n->bar.acq);
break;
case 0x34: /* ACQ hi */
- n->bar.acq |= data << 32;
+ n->bar.acq |= (data << 32) & NVME_ACQ_WR_MASK;
trace_nvme_mmio_acqaddr_hi(data, n->bar.acq);
break;
case 0x38: /* CMBLOC */
@@ -1390,7 +1401,7 @@ static void nvme_exit(PCIDevice *pci_dev)
{
NvmeCtrl *n = NVME(pci_dev);
- nvme_clear_ctrl(n);
+ nvme_clear_ctrl(n, true);
g_free(n->namespaces);
g_free(n->cq);
g_free(n->sq);
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 849a6f3fa3..4da6740543 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -90,6 +90,11 @@ enum NvmeCcMask {
CC_IOCQES_MASK = 0xf,
};
+#define NVME_CC_WR_MASK (0x00fffff1)
+#define NVME_AQA_WR_MASK (0x0fff0fff)
+#define NVME_ASQ_WR_MASK (0xfffffffffffff000)
+#define NVME_ACQ_WR_MASK (0xfffffffffffff000)
+
#define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK)
#define NVME_CC_CSS(cc) ((cc >> CC_CSS_SHIFT) & CC_CSS_MASK)
#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK)
--
2.17.1
* [Qemu-devel] [PATCH 4/5] nvme: Fix phantom irq raise
2018-06-22 11:22 [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification Shimi Gersner
2018-06-22 11:22 ` [Qemu-devel] [PATCH 2/5] nvme: CQ/SQ proper validation & status code Shimi Gersner
2018-06-22 11:22 ` [Qemu-devel] [PATCH 3/5] nvme: Proper state handling on enable/disable Shimi Gersner
@ 2018-06-22 11:22 ` Shimi Gersner
2018-06-22 11:22 ` [Qemu-devel] [PATCH 5/5] nvme: Missing MSI message upon partial CQ read Shimi Gersner
` (2 subsequent siblings)
5 siblings, 0 replies; 12+ messages in thread
From: Shimi Gersner @ 2018-06-22 11:22 UTC (permalink / raw)
To: qemu-block, qemu-devel
Cc: Keith Busch, Kevin Wolf, Max Reitz, David Sariel, Shimi Gersner
The CQ irq is asserted when requests are ready for the guest to
collect. In edge cases, the irq is raised when there is no actual
request pending on the queue.
The irq is raised for a CQ from nvme_post_cqes unconditionally,
whether or not any request was processed. nvme_post_cqes is
triggered through a timer in two cases:
- Completion of a sync or async request.
- The CQ was emptied by the guest and pending responses may be queued.
Consider the following flow:
- An async request is made while the CQ is full.
- The guest reads the entire CQ and rings the doorbell. nvme_post_cqes is
scheduled and executed through the timer.
- The async request completes and nvme_post_cqes is scheduled
and executed through the timer again.
As nvme_post_cqes raises the irq unconditionally, this leads to the irq
being raised while no pending CQ entry is ready.
The issue is fixed by adding a fast-path exit to nvme_post_cqes when it
has nothing to process, so no irq is raised.
Change-Id: Ib7af6c1bcb63d03022d9b57e079fdb2cf954e7dc
Signed-off-by: Shimi Gersner <gersner@gmail.com>
---
hw/block/nvme.c | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 206d8428fd..f639d7ae73 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -265,6 +265,11 @@ static void nvme_post_cqes(void *opaque)
NvmeCtrl *n = cq->ctrl;
NvmeRequest *req, *next;
+ // Fast path if nothing to be processed
+ if (QTAILQ_EMPTY(&cq->req_list)) {
+ return;
+ }
+
QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
NvmeSQueue *sq;
hwaddr addr;
@@ -1130,14 +1135,21 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
return;
}
+ /*
+ CLARIFICATION
+ When CQ was full *before* the db write, nvme_post_cqes skipped processing of responses and
+ as a side effect SQ was unable to process new requests (As they are limited by size).
+ When CQ is cleared, spawn both processing of all CQs and SQs. As call to timer_mod
+ is serial, first handle the CQ to clear any pending requests and then clear the associated SQs.
+ */
start_sqs = nvme_cq_full(cq) ? 1 : 0;
cq->head = new_head;
if (start_sqs) {
NvmeSQueue *sq;
+ timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
- timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
if (cq->tail == cq->head) {
--
2.17.1
* [Qemu-devel] [PATCH 5/5] nvme: Missing MSI message upon partial CQ read
2018-06-22 11:22 [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification Shimi Gersner
` (2 preceding siblings ...)
2018-06-22 11:22 ` [Qemu-devel] [PATCH 4/5] nvme: Fix phantom irq raise Shimi Gersner
@ 2018-06-22 11:22 ` Shimi Gersner
2018-07-12 11:47 ` [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification Kevin Wolf
2018-07-15 6:20 ` Daniel Verkamp
5 siblings, 0 replies; 12+ messages in thread
From: Shimi Gersner @ 2018-06-22 11:22 UTC (permalink / raw)
To: qemu-block, qemu-devel
Cc: Keith Busch, Kevin Wolf, Max Reitz, David Sariel, Shimi Gersner
When MSI-X interrupts are enabled, each CQ is associated with
a vector which is expected to fire when data is ready in the
CQ buffer. After the guest reads from the buffer, it updates
the consumed position using the CQ's dedicated doorbell.
When only part of the data is read from the CQ buffer, the device
fails to send a new MSI message indicating the buffer still has
unread data. The fix ensures a new message is sent.
Change-Id: I10190be127b8dcbd89732cfb95ea37faf8c5779b
Signed-off-by: Shimi Gersner <gersner@gmail.com>
---
hw/block/nvme.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index f639d7ae73..e2dca6e57f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1152,8 +1152,12 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
}
}
+ // When we have more messages, we should make sure irq is asserted. When MSIx is used
+ // this will make sure another notification is sent to the guest.
if (cq->tail == cq->head) {
nvme_irq_deassert(n, cq);
+ } else {
+ nvme_irq_assert(n, cq);
}
} else {
/* Submission queue doorbell write */
--
2.17.1
* Re: [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification
2018-06-22 11:22 [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification Shimi Gersner
` (3 preceding siblings ...)
2018-06-22 11:22 ` [Qemu-devel] [PATCH 5/5] nvme: Missing MSI message upon partial CQ read Shimi Gersner
@ 2018-07-12 11:47 ` Kevin Wolf
2018-07-13 7:40 ` David Sariel
2018-07-15 6:20 ` Daniel Verkamp
5 siblings, 1 reply; 12+ messages in thread
From: Kevin Wolf @ 2018-07-12 11:47 UTC (permalink / raw)
To: Shimi Gersner
Cc: qemu-block, qemu-devel, Keith Busch, Max Reitz, David Sariel
Am 22.06.2018 um 13:22 hat Shimi Gersner geschrieben:
> PCI/e configuration currently does not meets specifications.
>
> Patch includes various configuration changes to support specifications
> - BAR2 to return zero when read and CMD.IOSE is not set.
> - Expose NVME configuration through IO space (Optional).
> - PCI Power Management v1.2.
> - PCIe Function Level Reset.
> - Disable QEMUs default use of PCIe Link Status (DLLLA).
> - PCIe missing AOC compliance flag.
> - Mask PCIe End-to-End TLP as RO (Unspecified by specification).
>
> Change-Id: I9057f56266db16b013fa194730c998d4779cede3
What is the meaning of this Change-Id tag?
> Signed-off-by: Shimi Gersner <gersner@gmail.com>
Keith, can you have a look at this series?
Kevin
* Re: [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification
2018-07-12 11:47 ` [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification Kevin Wolf
@ 2018-07-13 7:40 ` David Sariel
0 siblings, 0 replies; 12+ messages in thread
From: David Sariel @ 2018-07-13 7:40 UTC (permalink / raw)
To: Kevin Wolf; +Cc: Shimi Gersner, qemu-block, qemu-devel, Keith Busch, Max Reitz
Our bad. The Change-Id tag snuck into these from Gerrit
(https://review.gerrithub.io/c/davidsaOpenu/qemu/+/415434).
Took a note to drop this line in "[PATCH v2]", but I guess it makes sense
to wait in case additional comments follow, right?
Thanks for taking a look.
On 12 July 2018 at 14:47, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 22.06.2018 um 13:22 hat Shimi Gersner geschrieben:
> > PCI/e configuration currently does not meets specifications.
> >
> > Patch includes various configuration changes to support specifications
> > - BAR2 to return zero when read and CMD.IOSE is not set.
> > - Expose NVME configuration through IO space (Optional).
> > - PCI Power Management v1.2.
> > - PCIe Function Level Reset.
> > - Disable QEMUs default use of PCIe Link Status (DLLLA).
> > - PCIe missing AOC compliance flag.
> > - Mask PCIe End-to-End TLP as RO (Unspecified by specification).
> >
> > Change-Id: I9057f56266db16b013fa194730c998d4779cede3
>
> What is the meaning of this Change-Id tag?
>
> > Signed-off-by: Shimi Gersner <gersner@gmail.com>
>
> Keith, can you have a look at this series?
>
> Kevin
>
--
CS Dept, Open University of Israel
* Re: [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification
2018-06-22 11:22 [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification Shimi Gersner
` (4 preceding siblings ...)
2018-07-12 11:47 ` [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification Kevin Wolf
@ 2018-07-15 6:20 ` Daniel Verkamp
2018-08-26 21:49 ` Gersner
5 siblings, 1 reply; 12+ messages in thread
From: Daniel Verkamp @ 2018-07-15 6:20 UTC (permalink / raw)
To: Shimi Gersner
Cc: qemu-block, qemu-devel, Keith Busch, Kevin Wolf, David Sariel,
Max Reitz
On Fri, Jun 22, 2018 at 4:22 AM, Shimi Gersner <gersner@gmail.com> wrote:
> PCI/e configuration currently does not meets specifications.
>
> Patch includes various configuration changes to support specifications
> - BAR2 to return zero when read and CMD.IOSE is not set.
> - Expose NVME configuration through IO space (Optional).
> - PCI Power Management v1.2.
> - PCIe Function Level Reset.
> - Disable QEMUs default use of PCIe Link Status (DLLLA).
> - PCIe missing AOC compliance flag.
> - Mask PCIe End-to-End TLP as RO (Unspecified by specification).
[...]
> n->num_namespaces = 1;
> n->num_queues = 64;
> - n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
> + n->reg_size = pow2ceil(0x2000 + 2 * (n->num_queues + 1) * 4);
The existing math looks OK to me (maybe already 4 bytes larger than
necessary?). The controller registers should be 0x1000 bytes for the
fixed register set, then 2 * 4 bytes for each doorbell pair (+ 1 is
for the admin queue, and CAP.DSTRD is set to 0, so there is no extra
padding between queues). The result is also rounded up to a power of
two, so the result will typically be 8 KB. What is the rationale for
this change?
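For reference, a rough worked example of that computation with the default
num_queues = 64 (assuming CAP.DSTRD = 0, as noted above):
    0x1000                           fixed register set
  + (64 + 1) * 2 * 4 = 0x208         one doorbell pair per queue, incl. admin
  = 0x1208, and pow2ceil(0x1208) = 0x2000, i.e. 8 KB.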
> n->ns_size = bs_size / (uint64_t)n->num_namespaces;
>
> n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
> @@ -1245,6 +1295,10 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> pci_register_bar(&n->parent_obj, 0,
> PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
> &n->iomem);
> +
> + // Expose the NVME memory through Address Space IO (Optional by spec)
> + pci_register_bar(&n->parent_obj, 2, PCI_BASE_ADDRESS_SPACE_IO, &n->iomem);
This looks like it will register the whole 4KB+ NVMe register set
(n->iomem) as an I/O space BAR, but this doesn't match the spec (see
NVMe 1.3c section 3.2, "Index/Data Pair registers"). The I/O BAR (if
implemented) should just contain two 4-byte registers, Index and Data,
that can be used to indirectly access the NVMe register set. (Just
for curiosity, do you know of any software that uses this feature? It
could be useful for testing the implementation.)
Other minor nitpicks:
- Comment style is inconsistent with the rest of the file (// vs /* */)
- PCIe and NVMe are incorrectly capitalized a few times in comments
Thanks,
-- Daniel
* Re: [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification
2018-07-15 6:20 ` Daniel Verkamp
@ 2018-08-26 21:49 ` Gersner
2018-08-30 15:45 ` Daniel Verkamp
0 siblings, 1 reply; 12+ messages in thread
From: Gersner @ 2018-08-26 21:49 UTC (permalink / raw)
To: daniel; +Cc: qemu-block, qemu-devel, Keith Busch, Kevin Wolf, David S,
Max Reitz
Hi Daniel,
Thanks for taking a look. Comments are inline.
Gersner.
On Sun, Jul 15, 2018 at 9:21 AM Daniel Verkamp <daniel@drv.nu> wrote:
> On Fri, Jun 22, 2018 at 4:22 AM, Shimi Gersner <gersner@gmail.com> wrote:
> > PCI/e configuration currently does not meets specifications.
> >
> > Patch includes various configuration changes to support specifications
> > - BAR2 to return zero when read and CMD.IOSE is not set.
> > - Expose NVME configuration through IO space (Optional).
> > - PCI Power Management v1.2.
> > - PCIe Function Level Reset.
> > - Disable QEMUs default use of PCIe Link Status (DLLLA).
> > - PCIe missing AOC compliance flag.
> > - Mask PCIe End-to-End TLP as RO (Unspecified by specification).
> [...]
> > n->num_namespaces = 1;
> > n->num_queues = 64;
> > - n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
> > + n->reg_size = pow2ceil(0x2000 + 2 * (n->num_queues + 1) * 4);
>
> The existing math looks OK to me (maybe already 4 bytes larger than
> necessary?). The controller registers should be 0x1000 bytes for the
> fixed register set, then 2 * 4 bytes for each doorbell pair (+ 1 is
> for the admin queue, and CAP.DSTRD is set to 0, so there is no extra
> padding between queues). The result is also rounded up to a power of
> two, so the result will typically be 8 KB. What is the rationale for
> this change?
>
You are correct, this change shouldn't be here. It was made while running
the NVMe compliance suite, which failed here.
The specification states that bits 0 to 13 are RO, which implicitly suggests
that the minimal size of this BAR should be at least 16K, as this is the
standard way to determine the BAR size on PCI (AFAIK). On the other hand it
doesn't fit very well with the memory map laid out in section 3.1 (Register
Definition) of the NVMe 1.3b spec. Any idea?
Additionally, it does look like it has an extra 4 bytes.
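For context, a minimal sketch of the standard BAR sizing probe referred to
above (illustrative driver-side pseudocode, not QEMU code;
pci_cfg_read32/pci_cfg_write32 are placeholder config-space accessors):

    uint32_t orig, sized;

    orig = pci_cfg_read32(dev, PCI_BASE_ADDRESS_0);        /* MLBAR */
    pci_cfg_write32(dev, PCI_BASE_ADDRESS_0, 0xffffffff);
    sized = pci_cfg_read32(dev, PCI_BASE_ADDRESS_0);
    pci_cfg_write32(dev, PCI_BASE_ADDRESS_0, orig);        /* restore */

    /* Bits that read back as 0 are read-only; masking off the low 4
     * type/flag bits, the region size is ~(sized & ~0xfu) + 1.
     * With bits 0-13 RO, that is at least 0x4000, i.e. 16 KB. */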
>
> > n->ns_size = bs_size / (uint64_t)n->num_namespaces;
> >
> > n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
> > @@ -1245,6 +1295,10 @@ static void nvme_realize(PCIDevice *pci_dev,
> Error **errp)
> > pci_register_bar(&n->parent_obj, 0,
> > PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
> > &n->iomem);
> > +
> > + // Expose the NVME memory through Address Space IO (Optional by
> spec)
> > + pci_register_bar(&n->parent_obj, 2, PCI_BASE_ADDRESS_SPACE_IO,
> &n->iomem);
>
> This looks like it will register the whole 4KB+ NVMe register set
> (n->iomem) as an I/O space BAR, but this doesn't match the spec (see
> NVMe 1.3c section 3.2, "Index/Data Pair registers"). The I/O BAR (if
> implemented) should just contain two 4-byte registers, Index and Data,
> that can be used to indirectly access the NVMe register set. (Just
> for curiosity, do you know of any software that uses this feature? It
> could be useful for testing the implementation.)
>
Very nice catch. We indeed totally misinterpreted the specification here.
We use the compliance suite for sanity, but it doesn't actually use this
feature, it just verifies the configuration of the registers.
We will go with rolling back this feature as it is optional.
Question - I'm having a hard time determining this from the specification:
as this is optional, how do device drivers determine whether it is
available? Is it simply the CMD.IOSE flag from PCI config space? And if so,
what is the standard way in QEMU to disable the flag completely?
>
> Other minor nitpicks:
> - Comment style is inconsistent with the rest of the file (// vs /* */)
> - PCIe and NVMe are incorrectly capitalized a few times in comments
>
Thanks.
>
> Thanks,
> -- Daniel
>
* Re: [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification
2018-08-26 21:49 ` Gersner
@ 2018-08-30 15:45 ` Daniel Verkamp
2018-09-12 19:53 ` Gersner
0 siblings, 1 reply; 12+ messages in thread
From: Daniel Verkamp @ 2018-08-30 15:45 UTC (permalink / raw)
To: Shimi Gersner
Cc: qemu-block, qemu-devel, Keith Busch, Kevin Wolf, David Sariel,
Max Reitz
Hi Shimi,
On Sun, Aug 26, 2018 at 2:50 PM Gersner <gersner@gmail.com> wrote:
>
> Hi Daniel,
> Thanks for taking a look. Comments are inline.
>
> Gersner.
>
> On Sun, Jul 15, 2018 at 9:21 AM Daniel Verkamp <daniel@drv.nu> wrote:
>>
>> On Fri, Jun 22, 2018 at 4:22 AM, Shimi Gersner <gersner@gmail.com> wrote:
>> > PCI/e configuration currently does not meets specifications.
>> >
>> > Patch includes various configuration changes to support specifications
>> > - BAR2 to return zero when read and CMD.IOSE is not set.
>> > - Expose NVME configuration through IO space (Optional).
>> > - PCI Power Management v1.2.
>> > - PCIe Function Level Reset.
>> > - Disable QEMUs default use of PCIe Link Status (DLLLA).
>> > - PCIe missing AOC compliance flag.
>> > - Mask PCIe End-to-End TLP as RO (Unspecified by specification).
>> [...]
>> > n->num_namespaces = 1;
>> > n->num_queues = 64;
>> > - n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
>> > + n->reg_size = pow2ceil(0x2000 + 2 * (n->num_queues + 1) * 4);
>>
>> The existing math looks OK to me (maybe already 4 bytes larger than
>> necessary?). The controller registers should be 0x1000 bytes for the
>> fixed register set, then 2 * 4 bytes for each doorbell pair (+ 1 is
>> for the admin queue, and CAP.DSTRD is set to 0, so there is no extra
>> padding between queues). The result is also rounded up to a power of
>> two, so the result will typically be 8 KB. What is the rationale for
>> this change?
>
> You are correct, this change shouldn't be here. It was made during tests against the
> nvme compliance which failed here.
If the NVMe compliance test requires it, that is probably sufficient
reason to change it - although it would be interesting to hear their
rationale.
> The specification states that bits 0 to 13 are RO, which implicitly suggests
> that the minimal size of this BAR should be at least 16K as this is a standard
> way to determine the BAR size on PCI (AFAIK). On the contrary it doesn't
> fit very well with the memory mapped laid out on 3.1 Register Definition
> of the 1.3b nvme spec. Any idea?
You're right, the MLBAR section of the NVMe spec does seem to indicate
that up to bit 13 should be reserved/RO. This is also probably a good
enough reason to make the change, as long as this reason is provided.
>
> Additionally it does look like it has an extra 4 bytes.
>
>>
>>
>> > n->ns_size = bs_size / (uint64_t)n->num_namespaces;
>> >
>> > n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
>> > @@ -1245,6 +1295,10 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
>> > pci_register_bar(&n->parent_obj, 0,
>> > PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
>> > &n->iomem);
>> > +
>> > + // Expose the NVME memory through Address Space IO (Optional by spec)
>> > + pci_register_bar(&n->parent_obj, 2, PCI_BASE_ADDRESS_SPACE_IO, &n->iomem);
>>
>> This looks like it will register the whole 4KB+ NVMe register set
>> (n->iomem) as an I/O space BAR, but this doesn't match the spec (see
>> NVMe 1.3c section 3.2, "Index/Data Pair registers"). The I/O BAR (if
>> implemented) should just contain two 4-byte registers, Index and Data,
>> that can be used to indirectly access the NVMe register set. (Just
>> for curiosity, do you know of any software that uses this feature? It
>> could be useful for testing the implementation.)
>
> Very nice catch. We indeed totally miss-interpreted the specification here.
>
> We use the compliance suit for sanity, but it doesn't actually use this feature,
> just verifies the configuration of the registers.
>
> We will go with rolling back this feature as this is optional.
This is probably the right move; I don't know of any real hardware
devices that implement it (and therefore I doubt any software will
require it). However, it should not be too difficult to add an
implementation of this feature; if you are interested, take a look at
the ahci_idp_read/ahci_idp_write functions - the NVMe index/data pair
should be very similar.
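To make that concrete, here is a rough sketch of what the index/data pair
handlers could look like (untested, just an illustration; idp_index would be
a new, hypothetical field in NvmeCtrl, and nvme_mmio_read/nvme_mmio_write
are the existing BAR0 MMIO handlers in hw/block/nvme.c):

    static uint64_t nvme_idp_read(void *opaque, hwaddr addr, unsigned size)
    {
        NvmeCtrl *n = opaque;

        if (addr == 0) {
            return n->idp_index;                  /* Index register */
        }
        /* Data register: redirect to the selected controller register */
        return nvme_mmio_read(n, n->idp_index, size);
    }

    static void nvme_idp_write(void *opaque, hwaddr addr, uint64_t data,
                               unsigned size)
    {
        NvmeCtrl *n = opaque;

        if (addr == 0) {
            n->idp_index = data & ~0x3;           /* dword-aligned index */
        } else {
            nvme_mmio_write(n, n->idp_index, data, size);
        }
    }

These would be wired up through a small MemoryRegionOps registered as an
I/O space BAR2, analogous to the AHCI idp registers.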
> Question - I'm having hard time to determine from the specification - As
> this is optional, how device drivers determine if it is available? Does it
> simply the CMD.IOSE flag from the PCI? And if so, what is
> the standard way in QEMU to disable the flag completely?
Based on the wording in NVMe 1.3 section 3.2, it seems like the only
indication of support for Index/Data Pair is whether BAR2 is filled
out. I believe PCI defines unused BARs to be all 0s, so software
should be able to use this to determine whether the feature is
provided by a particular NVMe PCIe device, and removing the
pci_register_bar() call should be enough to accomplish this from the
QEMU side.
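As a tiny illustration of that check (hypothetical driver-side pseudocode,
simplified - a real driver would also do the usual sizing probe - with
pci_cfg_read32 again a placeholder config-space accessor):

    uint32_t bar2 = pci_cfg_read32(dev, PCI_BASE_ADDRESS_2);

    if (bar2 == 0) {
        /* unimplemented BAR: optional Index/Data pair not supported */
    } else if (bar2 & PCI_BASE_ADDRESS_SPACE_IO) {
        /* I/O BAR present: Index/Data pair registers are available */
    }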
Thanks,
-- Daniel
* Re: [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification
2018-08-30 15:45 ` Daniel Verkamp
@ 2018-09-12 19:53 ` Gersner
2018-09-12 21:21 ` Eric Blake
0 siblings, 1 reply; 12+ messages in thread
From: Gersner @ 2018-09-12 19:53 UTC (permalink / raw)
To: daniel; +Cc: qemu-block, qemu-devel, Keith Busch, Kevin Wolf, David S,
Max Reitz
Hi Daniel,
Sorry for the long round-trips, we had a busy month. We have implemented
all the changes. Waiting for a final clarification.
Should the new patches be posted on this thread or a new one?
Thanks for your time.
Gersner.
On Thu, Aug 30, 2018 at 6:45 PM Daniel Verkamp <daniel@drv.nu> wrote:
> Hi Shimi,
>
> On Sun, Aug 26, 2018 at 2:50 PM Gersner <gersner@gmail.com> wrote:
> >
> > Hi Daniel,
> > Thanks for taking a look. Comments are inline.
> >
> > Gersner.
> >
> > On Sun, Jul 15, 2018 at 9:21 AM Daniel Verkamp <daniel@drv.nu> wrote:
> >>
> >> On Fri, Jun 22, 2018 at 4:22 AM, Shimi Gersner <gersner@gmail.com>
> wrote:
> >> > PCI/e configuration currently does not meets specifications.
> >> >
> >> > Patch includes various configuration changes to support specifications
> >> > - BAR2 to return zero when read and CMD.IOSE is not set.
> >> > - Expose NVME configuration through IO space (Optional).
> >> > - PCI Power Management v1.2.
> >> > - PCIe Function Level Reset.
> >> > - Disable QEMUs default use of PCIe Link Status (DLLLA).
> >> > - PCIe missing AOC compliance flag.
> >> > - Mask PCIe End-to-End TLP as RO (Unspecified by specification).
> >> [...]
> >> > n->num_namespaces = 1;
> >> > n->num_queues = 64;
> >> > - n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
> >> > + n->reg_size = pow2ceil(0x2000 + 2 * (n->num_queues + 1) * 4);
> >>
> >> The existing math looks OK to me (maybe already 4 bytes larger than
> >> necessary?). The controller registers should be 0x1000 bytes for the
> >> fixed register set, then 2 * 4 bytes for each doorbell pair (+ 1 is
> >> for the admin queue, and CAP.DSTRD is set to 0, so there is no extra
> >> padding between queues). The result is also rounded up to a power of
> >> two, so the result will typically be 8 KB. What is the rationale for
> >> this change?
> >
> > You are correct, this change shouldn't be here. It was made during tests
> against the
> > nvme compliance which failed here.
>
> If the NVMe compliance test requires it, that is probably sufficient
> reason to change it - although it would be interesting to hear their
> rationale.
>
> > The specification states that bits 0 to 13 are RO, which implicitly
> suggests
> > that the minimal size of this BAR should be at least 16K as this is a
> standard
> > way to determine the BAR size on PCI (AFAIK). On the contrary it doesn't
> > fit very well with the memory mapped laid out on 3.1 Register Definition
> > of the 1.3b nvme spec. Any idea?
>
> You're right, the MLBAR section of the NVMe spec does seem to indicate
> that up to bit 13 should be reserved/RO. This is also probably a good
> enough reason to make the change, as long as this reason is provided.
>
> >
> > Additionally it does look like it has an extra 4 bytes.
> >
> >>
> >>
> >> > n->ns_size = bs_size / (uint64_t)n->num_namespaces;
> >> >
> >> > n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
> >> > @@ -1245,6 +1295,10 @@ static void nvme_realize(PCIDevice *pci_dev,
> Error **errp)
> >> > pci_register_bar(&n->parent_obj, 0,
> >> > PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
> >> > &n->iomem);
> >> > +
> >> > + // Expose the NVME memory through Address Space IO (Optional by
> spec)
> >> > + pci_register_bar(&n->parent_obj, 2, PCI_BASE_ADDRESS_SPACE_IO,
> &n->iomem);
> >>
> >> This looks like it will register the whole 4KB+ NVMe register set
> >> (n->iomem) as an I/O space BAR, but this doesn't match the spec (see
> >> NVMe 1.3c section 3.2, "Index/Data Pair registers"). The I/O BAR (if
> >> implemented) should just contain two 4-byte registers, Index and Data,
> >> that can be used to indirectly access the NVMe register set. (Just
> >> for curiosity, do you know of any software that uses this feature? It
> >> could be useful for testing the implementation.)
> >
> > Very nice catch. We indeed totally miss-interpreted the specification
> here.
> >
> > We use the compliance suit for sanity, but it doesn't actually use this
> feature,
> > just verifies the configuration of the registers.
> >
> > We will go with rolling back this feature as this is optional.
>
> This is probably the right move; I don't know of any real hardware
> devices that implement it (and therefore I doubt any software will
> require it). However, it should not be too difficult to add an
> implementation of this feature; if you are interested, take a look at
> the ahci_idp_read/ahci_idp_write functions - the NVMe index/data pair
> should be very similar.
>
Nice, it does seem easy to implement. We decided to go without it due to the
lack of real-world software we could test the new facility against.
>
> > Question - I'm having hard time to determine from the specification - As
> > this is optional, how device drivers determine if it is available? Does
> it
> > simply the CMD.IOSE flag from the PCI? And if so, what is
> > the standard way in QEMU to disable the flag completely?
>
> Based on the wording in NVMe 1.3 section 3.2, it seems like the only
> indication of support for Index/Data Pair is whether BAR2 is filled
> out. I believe PCI defines unused BARs to be all 0s, so software
> should be able to use this to determine whether the feature is
> provided by a particular NVMe PCIe device, and removing the
> pci_register_bar() call should be enough to accomplish this from the
> QEMU side.
>
Agreed, that is also what I understood from the spec. On the other hand,
for some reason the compliance tests
<https://github.com/nvmecompliance/tnvme/blob/30a19c6829b571b2c7bdf854daf0351bfe038032/GrpPciRegisters/allPciRegs_r10b.cpp#L382>
assume that IOSE==1 means that the Index/Data pair is enabled. They
probably had a good reason.
I'm not even sure what turns the IOSE flag on; I found no relevant code in
QEMU.
>
> Thanks,
> -- Daniel
>
* Re: [Qemu-devel] [PATCH 1/5] nvme: PCI/e configuration from specification
2018-09-12 19:53 ` Gersner
@ 2018-09-12 21:21 ` Eric Blake
0 siblings, 0 replies; 12+ messages in thread
From: Eric Blake @ 2018-09-12 21:21 UTC (permalink / raw)
To: Gersner, daniel
Cc: Kevin Wolf, qemu-block, qemu-devel, Max Reitz, Keith Busch,
David S
On 9/12/18 2:53 PM, Gersner wrote:
> Hi Daniel,
>
> Sorry for the long round-trips, we had a busy month. We have implemented
> all the changes. Waiting for a final clarification.
>
> Should the new patches be posted on this thread or a new one?
Best to post a v2 as a new top-level thread (our CI tools don't spot
patch revisions buried in reply to an existing thread).
Also, it's best to avoid top-posting on technical lists, as it's harder
to follow the flow of your email.
--
Eric Blake, Principal Software Engineer
Red Hat, Inc. +1-919-301-3266
Virtualization: qemu.org | libvirt.org