From: Klaus Jensen <its@irrelevant.dk>
To: qemu-devel@nongnu.org
Cc: Keith Busch <kbusch@kernel.org>, Klaus Jensen <its@irrelevant.dk>,
qemu-block@nongnu.org, Jinhao Fan <fanjinhao21s@ict.ac.cn>,
stefanha@gmail.com, Klaus Jensen <k.jensen@samsung.com>
Subject: [PATCH v3 2/2] hw/nvme: use KVM irqfd when available
Date: Thu, 25 Aug 2022 22:14:54 +0200 [thread overview]
Message-ID: <20220825201454.259190-3-its@irrelevant.dk> (raw)
In-Reply-To: <20220825201454.259190-1-its@irrelevant.dk>
From: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Use KVM's irqfd to send interrupts when possible. This approach is
thread safe. Moreover, it does not have the inter-thread communication
overhead of plain event notifiers since handler callbacks are called
in the same system call as the irqfd write.
Signed-off-by: Jinhao Fan <fanjinhao21s@ict.ac.cn>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
hw/nvme/ctrl.c | 145 ++++++++++++++++++++++++++++++++++++++++++-
hw/nvme/nvme.h | 3 +
hw/nvme/trace-events | 3 +
3 files changed, 149 insertions(+), 2 deletions(-)
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 51792f395597..396f3f0cddbd 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -192,6 +192,7 @@
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/sysemu.h"
+#include "sysemu/kvm.h"
#include "sysemu/block-backend.h"
#include "sysemu/hostmem.h"
#include "hw/pci/msix.h"
@@ -1377,8 +1378,115 @@ static void nvme_deassert_notifier_read(EventNotifier *e)
}
}
+/*
+ * Allocate a KVM MSI route for @vector of the controller's PCI device and
+ * record the resulting virq in @cq.
+ *
+ * Returns 0 on success. On failure the negative value reported by
+ * kvm_irqchip_add_msi_route() is returned, nothing is committed and @cq is
+ * left untouched.
+ */
+static int nvme_kvm_vector_use(NvmeCtrl *n, NvmeCQueue *cq, uint32_t vector)
+{
+    KVMRouteChange changes = kvm_irqchip_begin_route_changes(kvm_state);
+    int virq = kvm_irqchip_add_msi_route(&changes, vector, &n->parent_obj);
+
+    if (virq >= 0) {
+        kvm_irqchip_commit_route_changes(&changes);
+        cq->virq = virq;
+
+        return 0;
+    }
+
+    return virq;
+}
+
+/*
+ * MSI-X "vector unmasked" notifier.
+ *
+ * For every allocated completion queue in cq[1..max_ioqpairs] that uses
+ * @vector: refresh its KVM MSI route if the cached message differs from
+ * @msg, then attach the queue's assert notifier to its virq as an irqfd.
+ *
+ * Returns 0 on success, otherwise the negative value of the first failing
+ * KVM call. Nothing is rolled back on failure, so queues handled before the
+ * failing one stay attached.
+ */
+static int nvme_kvm_vector_unmask(PCIDevice *pci_dev, unsigned vector,
+                                  MSIMessage msg)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+
+    trace_pci_nvme_irq_unmask(vector, msg.address, msg.data);
+
+    for (uint32_t qid = 1; qid <= n->params.max_ioqpairs; qid++) {
+        NvmeCQueue *cq = n->cq[qid];
+        int rc;
+
+        if (!cq || cq->vector != vector) {
+            continue;
+        }
+
+        /* only touch the routing table when the message actually changed */
+        if (cq->msg.address != msg.address || cq->msg.data != msg.data) {
+            rc = kvm_irqchip_update_msi_route(kvm_state, cq->virq, msg,
+                                              pci_dev);
+            if (rc < 0) {
+                return rc;
+            }
+
+            kvm_irqchip_commit_routes(kvm_state);
+
+            cq->msg = msg;
+        }
+
+        rc = kvm_irqchip_add_irqfd_notifier_gsi(kvm_state,
+                                                &cq->assert_notifier, NULL,
+                                                cq->virq);
+        if (rc < 0) {
+            return rc;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * MSI-X "vector masked" notifier: detach the assert notifier from its virq
+ * for every allocated completion queue in cq[1..max_ioqpairs] that uses
+ * @vector.
+ */
+static void nvme_kvm_vector_mask(PCIDevice *pci_dev, unsigned vector)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+
+    trace_pci_nvme_irq_mask(vector);
+
+    for (uint32_t qid = 1; qid <= n->params.max_ioqpairs; qid++) {
+        NvmeCQueue *cq = n->cq[qid];
+
+        if (!cq || cq->vector != vector) {
+            continue;
+        }
+
+        kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &cq->assert_notifier,
+                                              cq->virq);
+    }
+}
+
+/*
+ * MSI-X poll notifier for the vectors in [@vector_start, @vector_end].
+ *
+ * For each allocated completion queue in cq[1..max_ioqpairs] whose vector is
+ * currently masked and lies within the range, consume a signalled assert
+ * notifier and record it as a pending interrupt on that vector.
+ */
+static void nvme_kvm_vector_poll(PCIDevice *pci_dev, unsigned int vector_start,
+                                 unsigned int vector_end)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+
+    trace_pci_nvme_irq_poll(vector_start, vector_end);
+
+    for (uint32_t i = 1; i <= n->params.max_ioqpairs; i++) {
+        NvmeCQueue *cq = n->cq[i];
+
+        if (!cq) {
+            continue;
+        }
+
+        if (!msix_is_masked(pci_dev, cq->vector)) {
+            continue;
+        }
+
+        if (cq->vector >= vector_start && cq->vector <= vector_end) {
+            if (event_notifier_test_and_clear(&cq->assert_notifier)) {
+                /*
+                 * The pending bit is indexed by MSI-X vector; the queue
+                 * index i is unrelated to the vector the queue uses.
+                 */
+                msix_set_pending(pci_dev, cq->vector);
+            }
+        }
+    }
+}
+
+
static void nvme_init_irq_notifier(NvmeCtrl *n, NvmeCQueue *cq)
{
+ bool with_irqfd = msix_enabled(&n->parent_obj) &&
+ kvm_msi_via_irqfd_enabled();
int ret;
ret = event_notifier_init(&cq->assert_notifier, 0);
@@ -1386,12 +1494,27 @@ static void nvme_init_irq_notifier(NvmeCtrl *n, NvmeCQueue *cq)
return;
}
- event_notifier_set_handler(&cq->assert_notifier,
- nvme_assert_notifier_read);
+ if (with_irqfd) {
+ ret = nvme_kvm_vector_use(n, cq, cq->vector);
+ if (ret < 0) {
+ event_notifier_cleanup(&cq->assert_notifier);
+
+ return;
+ }
+ } else {
+ event_notifier_set_handler(&cq->assert_notifier,
+ nvme_assert_notifier_read);
+ }
if (!msix_enabled(&n->parent_obj)) {
ret = event_notifier_init(&cq->deassert_notifier, 0);
if (ret < 0) {
+ if (with_irqfd) {
+ kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state,
+ &cq->assert_notifier,
+ cq->virq);
+ }
+
event_notifier_set_handler(&cq->assert_notifier, NULL);
event_notifier_cleanup(&cq->assert_notifier);
@@ -4764,6 +4887,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
+ bool with_irqfd = msix_enabled(&n->parent_obj) &&
+ kvm_msi_via_irqfd_enabled();
uint16_t offset = (cq->cqid << 3) + (1 << 2);
n->cq[cq->cqid] = NULL;
@@ -4775,6 +4900,12 @@ static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
event_notifier_cleanup(&cq->notifier);
}
if (cq->assert_notifier.initialized) {
+ if (with_irqfd) {
+ kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state,
+ &cq->assert_notifier,
+ cq->virq);
+ kvm_irqchip_release_virq(kvm_state, cq->virq);
+ }
event_notifier_set_handler(&cq->assert_notifier, NULL);
event_notifier_cleanup(&cq->assert_notifier);
}
@@ -6528,6 +6659,9 @@ static int nvme_start_ctrl(NvmeCtrl *n)
uint32_t page_size = 1 << page_bits;
NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
+ bool with_irqfd = msix_enabled(&n->parent_obj) &&
+ kvm_msi_via_irqfd_enabled();
+
if (pci_is_vf(&n->parent_obj) && !sctrl->scs) {
trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
le16_to_cpu(sctrl->nvq),
@@ -6617,6 +6751,12 @@ static int nvme_start_ctrl(NvmeCtrl *n)
nvme_select_iocs(n);
+ if (with_irqfd) {
+ return msix_set_vector_notifiers(PCI_DEVICE(n), nvme_kvm_vector_unmask,
+ nvme_kvm_vector_mask,
+ nvme_kvm_vector_poll);
+ }
+
return 0;
}
@@ -7734,6 +7874,7 @@ static void nvme_exit(PCIDevice *pci_dev)
pcie_sriov_pf_exit(pci_dev);
}
+ msix_unset_vector_notifiers(pci_dev);
msix_uninit(pci_dev, &n->bar0, &n->bar0);
memory_region_del_subregion(&n->bar0, &n->iomem);
}
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 4850d3e9653a..b0b986b02426 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -20,6 +20,7 @@
#include "qemu/uuid.h"
#include "hw/pci/pci.h"
+#include "hw/pci/msi.h"
#include "hw/block/block.h"
#include "block/nvme.h"
@@ -396,10 +397,12 @@ typedef struct NvmeCQueue {
uint64_t dma_addr;
uint64_t db_addr;
uint64_t ei_addr;
+ int virq;
QEMUTimer *timer;
EventNotifier notifier;
EventNotifier assert_notifier;
EventNotifier deassert_notifier;
+ MSIMessage msg;
bool ioeventfd_enabled;
QTAILQ_HEAD(, NvmeSQueue) sq_list;
QTAILQ_HEAD(, NvmeRequest) req_list;
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index fccb79f48973..b11fcf4a651d 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -2,6 +2,9 @@
pci_nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
pci_nvme_irq_pin(void) "pulsing IRQ pin"
pci_nvme_irq_masked(void) "IRQ is masked"
+pci_nvme_irq_mask(uint32_t vector) "IRQ %u gets masked"
+pci_nvme_irq_unmask(uint32_t vector, uint64_t addr, uint32_t data) "IRQ %u gets unmasked, addr=0x%"PRIx64" data=0x%"PRIx32""
+pci_nvme_irq_poll(uint32_t vector_start, uint32_t vector_end) "IRQ poll, start=%"PRIu32" end=%"PRIu32""
pci_nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
pci_nvme_dbbuf_config(uint64_t dbs_addr, uint64_t eis_addr) "dbs_addr=0x%"PRIx64" eis_addr=0x%"PRIx64""
pci_nvme_map_addr(uint64_t addr, uint64_t len) "addr 0x%"PRIx64" len %"PRIu64""
--
2.37.2
next prev parent reply other threads:[~2022-08-25 20:21 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-08-25 20:14 [PATCH v3 0/2] hw/nvme: add irqfd support Klaus Jensen
2022-08-25 20:14 ` [PATCH v3 1/2] hw/nvme: support irq(de)assertion with eventfd Klaus Jensen
2022-08-25 20:14 ` Klaus Jensen [this message]
2022-08-26 2:03 ` [PATCH v3 0/2] hw/nvme: add irqfd support Jinhao Fan
2022-08-26 6:50 ` Klaus Jensen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220825201454.259190-3-its@irrelevant.dk \
--to=its@irrelevant.dk \
--cc=fanjinhao21s@ict.ac.cn \
--cc=k.jensen@samsung.com \
--cc=kbusch@kernel.org \
--cc=qemu-block@nongnu.org \
--cc=qemu-devel@nongnu.org \
--cc=stefanha@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.