* [PATCH] KVM: optionally post write on ioeventfd write
@ 2026-03-06 12:56 Thanos Makatos
2026-03-12 15:02 ` David Woodhouse
2026-03-23 15:01 ` Thanos Makatos
0 siblings, 2 replies; 12+ messages in thread
From: Thanos Makatos @ 2026-03-06 12:56 UTC (permalink / raw)
To: seanjc@google.com
Cc: pbonzini@redhat.com, John Levon, kvm@vger.kernel.org,
Thanos Makatos
Add a new flag, KVM_IOEVENTFD_FLAG_POST_WRITE, when assigning an
ioeventfd that results in the value written by the guest to be copied
to user-supplied memory instead of being discarded.
The goal of this new mechanism is to speed up doorbell writes on NVMe
controllers emulated outside of the VMM. Currently, a doorbell write to
an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the
VMM communicating the event, along with the doorbell value, to the NVMe
controller emulation task. With POST_WRITE, the NVMe emulation task is
directly notified of the doorbell write and can find the doorbell value
in a known location, without involving the VMM.
Add tests for this new functionality.
LLM (claude-4.6-opus-high) was used mainly for the tests and to a
lesser extent for pre-reviewing this patch.
Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
---
Documentation/virt/kvm/api.rst | 13 +-
include/uapi/linux/kvm.h | 6 +-
tools/testing/selftests/kvm/Makefile.kvm | 1 +
tools/testing/selftests/kvm/ioeventfd_test.c | 624 +++++++++++++++++++
virt/kvm/eventfd.c | 23 +
virt/kvm/kvm_main.c | 1 +
6 files changed, 666 insertions(+), 2 deletions(-)
create mode 100644 tools/testing/selftests/kvm/ioeventfd_test.c
diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 6f85e1b321dd..b8d030f03101 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -2109,7 +2109,8 @@ provided event instead of triggering an exit.
__u32 len; /* 0, 1, 2, 4, or 8 bytes */
__s32 fd;
__u32 flags;
- __u8 pad[36];
+ __aligned_u64 post_addr; /* address to write to if POST_WRITE is set */
+ __u8 pad[24];
};
For the special case of virtio-ccw devices on s390, the ioevent is matched
@@ -2122,6 +2123,7 @@ The following flags are defined::
#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign)
#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
(1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
+ #define KVM_IOEVENTFD_FLAG_POST_WRITE (1 << kvm_ioeventfd_flag_nr_post_write)
If datamatch flag is set, the event will be signaled only if the written value
to the registered address is equal to datamatch in struct kvm_ioeventfd.
@@ -2134,6 +2136,15 @@ the kernel will ignore the length of guest write and may get a faster vmexit.
The speedup may only apply to specific architectures, but the ioeventfd will
work anyway.
+With KVM_IOEVENTFD_FLAG_POST_WRITE, the value being written is copied to the
+userspace address specified by post_addr, and the eventfd is signaled. The
+copy is guaranteed to complete before the eventfd is signaled, so a userspace
+reader that wakes on the eventfd will observe the written value. When multiple
+vCPUs write to the same ioeventfd concurrently, the value at post_addr reflects
+one of the writes. If the copy to post_addr fails (e.g. the memory has been
+unmapped), the eventfd is not signaled and the write is reported to userspace
+as a regular MMIO/PIO exit.
+
4.60 KVM_DIRTY_TLB
------------------
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 65500f5db379..55b8683a856f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -639,6 +639,7 @@ enum {
kvm_ioeventfd_flag_nr_deassign,
kvm_ioeventfd_flag_nr_virtio_ccw_notify,
kvm_ioeventfd_flag_nr_fast_mmio,
+ kvm_ioeventfd_flag_nr_post_write,
kvm_ioeventfd_flag_nr_max,
};
@@ -647,6 +648,7 @@ enum {
#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign)
#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
(1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
+#define KVM_IOEVENTFD_FLAG_POST_WRITE (1 << kvm_ioeventfd_flag_nr_post_write)
#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
@@ -656,7 +658,8 @@ struct kvm_ioeventfd {
__u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
__s32 fd;
__u32 flags;
- __u8 pad[36];
+ __aligned_u64 post_addr; /* address to write to if POST_WRITE is set */
+ __u8 pad[24];
};
#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
@@ -985,6 +988,7 @@ struct kvm_enable_cap {
#define KVM_CAP_ARM_SEA_TO_USER 245
#define KVM_CAP_S390_USER_OPEREXEC 246
#define KVM_CAP_S390_KEYOP 247
+#define KVM_CAP_IOEVENTFD_POST_WRITE 248
struct kvm_irq_routing_irqchip {
__u32 irqchip;
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index fdec90e85467..7ab470981c31 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -64,6 +64,7 @@ TEST_GEN_PROGS_COMMON += kvm_binary_stats_test
TEST_GEN_PROGS_COMMON += kvm_create_max_vcpus
TEST_GEN_PROGS_COMMON += kvm_page_table_test
TEST_GEN_PROGS_COMMON += set_memory_region_test
+TEST_GEN_PROGS_COMMON += ioeventfd_test
# Compiled test targets
TEST_GEN_PROGS_x86 = $(TEST_GEN_PROGS_COMMON)
diff --git a/tools/testing/selftests/kvm/ioeventfd_test.c b/tools/testing/selftests/kvm/ioeventfd_test.c
new file mode 100644
index 000000000000..24875a2562d4
--- /dev/null
+++ b/tools/testing/selftests/kvm/ioeventfd_test.c
@@ -0,0 +1,624 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ioeventfd_test.c - Tests for KVM_IOEVENTFD_FLAG_POST_WRITE.
+ *
+ * Tests that when KVM_IOEVENTFD_FLAG_POST_WRITE is set and the MMIO/PIO
+ * address is written to, the value is copied to the user-provided address
+ * and the eventfd is signaled. Also tests negative cases and interactions
+ * with DATAMATCH.
+ *
+ * Copyright Nutanix, 2026
+ *
+ * Author: Thanos Makatos <thanos.makatos@nutanix.com>
+ */
+
+#include <errno.h>
+#include <poll.h>
+#include <string.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "ucall_common.h"
+
+#define MMIO_GPA (1UL << 30)
+#define PIO_PORT 0xe000
+#define TEST_VAL 0xDEADBEEFCAFEBABEULL
+#define MATCH_VAL 0x42U
+#define NOMATCH_VAL (MATCH_VAL + 1)
+#define POISON_VAL 0xFFFFFFFFU
+
+/*
+ * Check that the most recent vCPU exit is a ucall (delivered as KVM_EXIT_IO
+ * on x86) matching @expected_cmd. The caller must have already called
+ * vcpu_run().
+ *
+ * @expected_cmd: UCALL_SYNC, UCALL_DONE, etc.
+ * @expected_stage: for UCALL_SYNC, the stage number passed by GUEST_SYNC().
+ * Ignored for other ucall types.
+ *
+ * Aborts the test on UCALL_ABORT (a guest-side assertion failure).
+ */
+static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_cmd,
+ uint64_t expected_stage)
+{
+ struct ucall uc;
+
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+ switch (get_ucall(vcpu, &uc)) {
+ case UCALL_ABORT:
+ REPORT_GUEST_ASSERT(uc);
+ break;
+ case UCALL_SYNC:
+ TEST_ASSERT(expected_cmd == UCALL_SYNC,
+ "got UCALL_SYNC, expected %lu",
+ expected_cmd);
+ TEST_ASSERT(uc.args[1] == expected_stage,
+ "expected stage %lu, got %lu",
+ expected_stage, uc.args[1]);
+ break;
+ case UCALL_DONE:
+ TEST_ASSERT(expected_cmd == UCALL_DONE,
+ "got UCALL_DONE, expected %lu",
+ expected_cmd);
+ break;
+ default:
+ TEST_FAIL("unexpected ucall %lu", uc.cmd);
+ }
+}
+
+/*
+ * Verify that KVM_IOEVENTFD rejects invalid POST_WRITE configurations:
+ * - len=0: the kernel needs a non-zero length to know how many bytes to copy.
+ * - post_addr=NULL: there is no destination for the copy.
+ * - post_addr outside the process address space: access_ok() rejects it.
+ * All three must fail with EINVAL.
+ */
+static void test_post_write_negative(void)
+{
+ struct kvm_ioeventfd ioeventfd;
+ struct kvm_vm *vm;
+ uint64_t dummy;
+ int ret;
+ int fd;
+
+ vm = vm_create_barebones();
+ fd = kvm_new_eventfd();
+
+ /* length cannot be zero */
+ ioeventfd = (struct kvm_ioeventfd) {
+ .addr = MMIO_GPA,
+ .len = 0,
+ .fd = fd,
+ .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
+ .post_addr = (u64)&dummy,
+ };
+ ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+ TEST_ASSERT(ret && errno == EINVAL,
+ "len=0: expected EINVAL, got ret=%d errno=%d", ret, errno);
+
+ /* post_addr cannot be NULL */
+ ioeventfd.len = 4;
+ ioeventfd.post_addr = 0ULL;
+ ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+ TEST_ASSERT(ret && errno == EINVAL,
+ "NULL post_addr: expected EINVAL, got ret=%d errno=%d",
+ ret, errno);
+
+ /* bogus post_addr */
+ ioeventfd.post_addr = (u64)0xdeaddeaddeaddeadULL;
+ ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+ TEST_ASSERT(ret && errno == EINVAL,
+ "bad post_addr: expected EINVAL, got ret=%d errno=%d",
+ ret, errno);
+
+ close(fd);
+ kvm_vm_free(vm);
+}
+
+#define DEFINE_GUEST_WRITE_FN(suffix, type) \
+static void guest_code_w##suffix(void) { \
+ *(volatile type *)MMIO_GPA = (type)TEST_VAL; \
+ GUEST_DONE(); \
+}
+
+DEFINE_GUEST_WRITE_FN(1, uint8_t)
+DEFINE_GUEST_WRITE_FN(2, uint16_t)
+DEFINE_GUEST_WRITE_FN(4, uint32_t)
+DEFINE_GUEST_WRITE_FN(8, uint64_t)
+
+/*
+ * Verify that ioeventfd_write copies exactly @width bytes to post_addr for
+ * each supported MMIO write width (1, 2, 4, 8). The guest writes the low
+ * @width bytes of TEST_VAL; the host checks that exactly those bytes land
+ * at post_addr and the eventfd is signaled.
+ */
+static void test_post_write_width(int width, void (*guest_fn)(void))
+{
+ uint64_t actual, expected, count;
+ struct kvm_ioeventfd ioeventfd;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ int fd, ret;
+
+ /* need to initialize to 0 because the guest writes the low @width bytes */
+ actual = 0;
+ expected = 0;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_fn);
+ virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
+
+ fd = kvm_new_eventfd();
+
+ ioeventfd = (struct kvm_ioeventfd) {
+ .addr = MMIO_GPA,
+ .len = width,
+ .fd = fd,
+ .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
+ .post_addr = (u64)&actual,
+ };
+
+ ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+ TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_DONE, 0);
+
+ ret = read(fd, &count, sizeof(count));
+ TEST_ASSERT(ret == sizeof(count),
+ "eventfd read failed: ret=%d errno=%d", ret, errno);
+
+ memcpy(&expected, &(uint64_t){TEST_VAL}, width);
+ TEST_ASSERT_EQ(actual, expected);
+
+ close(fd);
+ kvm_vm_free(vm);
+}
+
+static void guest_code_datamatch(void)
+{
+ *(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
+ GUEST_SYNC(1);
+ *(volatile uint32_t *)MMIO_GPA = NOMATCH_VAL;
+ GUEST_SYNC(2);
+ GUEST_DONE();
+}
+
+/*
+ * Test the interaction between DATAMATCH and POST_WRITE. When both flags are
+ * set, ioeventfd_write should only fire (signal eventfd + copy value) when the
+ * written value matches datamatch. A non-matching write must leave the eventfd
+ * unsignaled and post_addr untouched, and fall through to KVM_EXIT_MMIO.
+ */
+static void test_post_write_datamatch(void)
+{
+ struct kvm_ioeventfd ioeventfd;
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+ struct pollfd pfd;
+ uint64_t count;
+ uint32_t actual;
+ int fd, ret;
+
+ actual = POISON_VAL;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code_datamatch);
+ virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
+ run = vcpu->run;
+
+ fd = kvm_new_eventfd();
+ pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
+
+ ioeventfd = (struct kvm_ioeventfd) {
+ .datamatch = MATCH_VAL,
+ .addr = MMIO_GPA,
+ .len = 4,
+ .fd = fd,
+ .flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
+ KVM_IOEVENTFD_FLAG_DATAMATCH,
+ .post_addr = (u64)&actual,
+ };
+
+ ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+ TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+ /*
+ * Guest writes MATCH_VAL → ioeventfd fires (value copied, eventfd
+ * signaled), vCPU continues, then GUEST_SYNC(1).
+ */
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_SYNC, 1);
+ TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
+ "eventfd read failed: errno=%d", errno);
+ TEST_ASSERT_EQ(actual, MATCH_VAL);
+
+ actual = POISON_VAL;
+
+ /*
+ * Guest writes NOMATCH_VAL → ioeventfd_in_range() returns false, bus
+ * returns -EOPNOTSUPP → KVM_EXIT_MMIO to userspace.
+ */
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO);
+ TEST_ASSERT(run->mmio.is_write, "expected MMIO write");
+ TEST_ASSERT(run->mmio.phys_addr == MMIO_GPA,
+ "expected MMIO at 0x%lx, got 0x%llx",
+ MMIO_GPA, run->mmio.phys_addr);
+
+ /* Re-enter: KVM completes the MMIO, guest runs to GUEST_SYNC(2). */
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_SYNC, 2);
+
+ TEST_ASSERT(poll(&pfd, 1, 0) == 0,
+ "eventfd should not be signaled after non-matching write");
+ TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
+
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_DONE, 0);
+
+ close(fd);
+ kvm_vm_free(vm);
+}
+
+static void guest_code_multi(void)
+{
+ *(volatile uint32_t *)MMIO_GPA = 0x11111111;
+ GUEST_SYNC(1);
+ *(volatile uint32_t *)MMIO_GPA = 0x22222222;
+ GUEST_SYNC(2);
+ *(volatile uint32_t *)MMIO_GPA = 0x33333333;
+ GUEST_SYNC(3);
+ GUEST_DONE();
+}
+
+/*
+ * Verify that post_addr is updated on every MMIO write, not just the first.
+ * The guest writes three distinct values in sequence; the host checks after
+ * each one that post_addr holds the latest value and the eventfd is signaled
+ * each time.
+ */
+static void test_post_write_multi(void)
+{
+ static const uint32_t expected[] = {
+ 0x11111111, 0x22222222, 0x33333333,
+ };
+ struct kvm_ioeventfd ioeventfd;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ uint64_t count;
+ uint32_t actual;
+ int fd, ret, i;
+
+ actual = POISON_VAL;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code_multi);
+ virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
+
+ fd = kvm_new_eventfd();
+
+ ioeventfd = (struct kvm_ioeventfd) {
+ .addr = MMIO_GPA,
+ .len = 4,
+ .fd = fd,
+ .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
+ .post_addr = (u64)&actual,
+ };
+
+ ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+ TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+ for (i = 0; i < ARRAY_SIZE(expected); i++) {
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_SYNC, i + 1);
+ TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
+ "eventfd read failed: errno=%d", errno);
+ TEST_ASSERT_EQ(actual, expected[i]);
+ }
+
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_DONE, 0);
+
+ close(fd);
+ kvm_vm_free(vm);
+}
+
+static void guest_code_multi_nosync(void)
+{
+ *(volatile uint32_t *)MMIO_GPA = 0x11111111;
+ *(volatile uint32_t *)MMIO_GPA = 0x22222222;
+ *(volatile uint32_t *)MMIO_GPA = 0x33333333;
+ GUEST_DONE();
+}
+
+/*
+ * Variant of the multi-write test where the guest performs three consecutive
+ * MMIO writes with no GUEST_SYNC in between. All three are handled in-kernel
+ * by ioeventfd before the vCPU exits at GUEST_DONE. Verify that:
+ * - post_addr reflects the last written value (0x33333333).
+ * - A single eventfd read() returns a counter of 3 (one signal per write).
+ */
+static void test_post_write_multi_nosync(void)
+{
+ struct kvm_ioeventfd ioeventfd;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ uint64_t count;
+ uint32_t actual;
+ int fd, ret;
+
+ actual = POISON_VAL;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code_multi_nosync);
+ virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
+
+ fd = kvm_new_eventfd();
+
+ ioeventfd = (struct kvm_ioeventfd) {
+ .addr = MMIO_GPA,
+ .len = 4,
+ .fd = fd,
+ .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
+ .post_addr = (u64)&actual,
+ };
+
+ ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+ TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_DONE, 0);
+
+ ret = read(fd, &count, sizeof(count));
+ TEST_ASSERT(ret == sizeof(count),
+ "eventfd read failed: ret=%d errno=%d", ret, errno);
+ TEST_ASSERT_EQ(count, (uint64_t)3);
+ TEST_ASSERT_EQ(actual, (uint32_t)0x33333333);
+
+ close(fd);
+ kvm_vm_free(vm);
+}
+
+static void guest_code_deassign(void)
+{
+ *(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
+ GUEST_SYNC(1);
+ *(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
+ GUEST_DONE();
+}
+
+/*
+ * Verify that deassigning an ioeventfd with POST_WRITE fully removes it from
+ * the I/O bus.
+ */
+static void test_post_write_deassign(void)
+{
+ struct kvm_ioeventfd ioeventfd;
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+ struct pollfd pfd;
+ uint64_t count;
+ uint32_t actual;
+ int fd, ret;
+
+ actual = POISON_VAL;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code_deassign);
+ virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
+ run = vcpu->run;
+
+ fd = kvm_new_eventfd();
+ pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
+
+ ioeventfd = (struct kvm_ioeventfd) {
+ .addr = MMIO_GPA,
+ .len = 4,
+ .fd = fd,
+ .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
+ .post_addr = (u64)&actual,
+ };
+
+ ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+ TEST_ASSERT(!ret, "KVM_IOEVENTFD assign failed: %s", strerror(errno));
+
+ /*
+ * Guest writes MATCH_VAL → ioeventfd fires, then GUEST_SYNC(1).
+ */
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_SYNC, 1);
+ TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
+ "eventfd read failed: errno=%d", errno);
+ TEST_ASSERT_EQ(actual, MATCH_VAL);
+
+ /* Deassign the ioeventfd. */
+ ioeventfd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
+ ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+ TEST_ASSERT(!ret, "KVM_IOEVENTFD deassign failed: %s", strerror(errno));
+
+ actual = POISON_VAL;
+
+ /*
+ * Guest writes MATCH_VAL again → no handler on the bus →
+ * KVM_EXIT_MMIO to userspace.
+ */
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO);
+ TEST_ASSERT(run->mmio.is_write, "expected MMIO write");
+ TEST_ASSERT(run->mmio.phys_addr == MMIO_GPA,
+ "expected MMIO at 0x%lx, got 0x%llx",
+ MMIO_GPA, run->mmio.phys_addr);
+
+ /* Re-enter: KVM completes MMIO, guest runs to GUEST_DONE. */
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_DONE, 0);
+
+ TEST_ASSERT(poll(&pfd, 1, 0) == 0,
+ "eventfd should not be signaled after deassign");
+ TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
+
+ close(fd);
+ kvm_vm_free(vm);
+}
+
+#ifdef __x86_64__
+static void guest_code_pio(void)
+{
+ outl(PIO_PORT, (uint32_t)TEST_VAL);
+ GUEST_DONE();
+}
+
+/*
+ * Verify that POST_WRITE works on the PIO bus (KVM_PIO_BUS), not just MMIO.
+ * The guest does an outl to PIO_PORT; the host checks that the written value
+ * is copied to post_addr and the eventfd is signaled.
+ */
+static void test_post_write_pio(void)
+{
+ struct kvm_ioeventfd ioeventfd;
+ struct kvm_vcpu *vcpu;
+ struct kvm_vm *vm;
+ uint64_t count;
+ uint32_t actual;
+ int fd, ret;
+
+ actual = POISON_VAL;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code_pio);
+
+ fd = kvm_new_eventfd();
+
+ ioeventfd = (struct kvm_ioeventfd) {
+ .addr = PIO_PORT,
+ .len = 4,
+ .fd = fd,
+ .flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
+ KVM_IOEVENTFD_FLAG_PIO,
+ .post_addr = (u64)&actual,
+ };
+
+ ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+ TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_DONE, 0);
+
+ ret = read(fd, &count, sizeof(count));
+ TEST_ASSERT(ret == sizeof(count),
+ "eventfd read failed: ret=%d errno=%d", ret, errno);
+
+ TEST_ASSERT_EQ(actual, (uint32_t)TEST_VAL);
+
+ close(fd);
+ kvm_vm_free(vm);
+}
+
+static void guest_code_pio_datamatch(void)
+{
+ outl(PIO_PORT, MATCH_VAL);
+ GUEST_SYNC(1);
+ outl(PIO_PORT, NOMATCH_VAL);
+ GUEST_SYNC(2);
+ GUEST_DONE();
+}
+
+/*
+ * Test POST_WRITE + PIO + DATAMATCH together. When all three flags are set,
+ * the ioeventfd should only fire when the outl value matches datamatch.
+ * A non-matching outl must fall through to KVM_EXIT_IO (port I/O exit),
+ * leaving the eventfd unsignaled and post_addr untouched.
+ */
+static void test_post_write_pio_datamatch(void)
+{
+ struct kvm_ioeventfd ioeventfd;
+ struct kvm_vcpu *vcpu;
+ struct kvm_run *run;
+ struct kvm_vm *vm;
+ struct pollfd pfd;
+ uint64_t count;
+ uint32_t actual;
+ int fd, ret;
+
+ actual = POISON_VAL;
+
+ vm = vm_create_with_one_vcpu(&vcpu, guest_code_pio_datamatch);
+ run = vcpu->run;
+
+ fd = kvm_new_eventfd();
+ pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
+
+ ioeventfd = (struct kvm_ioeventfd) {
+ .datamatch = MATCH_VAL,
+ .addr = PIO_PORT,
+ .len = 4,
+ .fd = fd,
+ .flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
+ KVM_IOEVENTFD_FLAG_PIO |
+ KVM_IOEVENTFD_FLAG_DATAMATCH,
+ .post_addr = (u64)&actual,
+ };
+
+ ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+ TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+ /*
+ * Guest does outl MATCH_VAL → ioeventfd fires, then GUEST_SYNC(1).
+ */
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_SYNC, 1);
+ TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
+ "eventfd read failed: errno=%d", errno);
+ TEST_ASSERT_EQ(actual, MATCH_VAL);
+
+ actual = POISON_VAL;
+
+ /*
+ * Guest does outl NOMATCH_VAL → no match → KVM_EXIT_IO.
+ */
+ vcpu_run(vcpu);
+ TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+ TEST_ASSERT(run->io.direction == KVM_EXIT_IO_OUT,
+ "expected PIO write");
+ TEST_ASSERT(run->io.port == PIO_PORT,
+ "expected PIO at 0x%x, got 0x%x",
+ PIO_PORT, run->io.port);
+
+ /* Re-enter: guest continues to GUEST_SYNC(2). */
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_SYNC, 2);
+
+ TEST_ASSERT(poll(&pfd, 1, 0) == 0,
+ "eventfd should not be signaled after non-matching PIO write");
+ TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
+
+ /* GUEST_DONE */
+ vcpu_run(vcpu);
+ assert_ucall(vcpu, UCALL_DONE, 0);
+
+ close(fd);
+ kvm_vm_free(vm);
+}
+#endif
+
+int main(void)
+{
+ TEST_REQUIRE(kvm_check_cap(KVM_CAP_IOEVENTFD_POST_WRITE));
+
+ test_post_write_negative();
+
+ test_post_write_width(1, guest_code_w1);
+ test_post_write_width(2, guest_code_w2);
+ test_post_write_width(4, guest_code_w4);
+ test_post_write_width(8, guest_code_w8);
+
+ test_post_write_datamatch();
+ test_post_write_multi();
+ test_post_write_multi_nosync();
+ test_post_write_deassign();
+
+#ifdef __x86_64__
+ test_post_write_pio();
+ test_post_write_pio_datamatch();
+#endif
+
+ return 0;
+}
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 0e8b8a2c5b79..22bc49a41503 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -741,6 +741,7 @@ struct _ioeventfd {
struct kvm_io_device dev;
u8 bus_idx;
bool wildcard;
+ void __user *post_addr;
};
static inline struct _ioeventfd *
@@ -812,6 +813,9 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
if (!ioeventfd_in_range(p, addr, len, val))
return -EOPNOTSUPP;
+ if (p->post_addr && len > 0 && __copy_to_user(p->post_addr, val, len))
+ return -EFAULT;
+
eventfd_signal(p->eventfd);
return 0;
}
@@ -866,6 +870,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
{
struct eventfd_ctx *eventfd;
+ void __user *post_addr;
struct _ioeventfd *p;
int ret;
@@ -873,6 +878,16 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
+ post_addr = u64_to_user_ptr(args->post_addr);
+ if ((args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) &&
+ (!args->len || !post_addr ||
+ args->post_addr != untagged_addr(args->post_addr) ||
+ !access_ok(post_addr, args->len))) {
+ /* In KVM's ABI, post_addr must be non-NULL. */
+ ret = -EINVAL;
+ goto fail;
+ }
+
p = kzalloc_obj(*p, GFP_KERNEL_ACCOUNT);
if (!p) {
ret = -ENOMEM;
@@ -891,6 +906,9 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
else
p->wildcard = true;
+ if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE)
+ p->post_addr = post_addr;
+
mutex_lock(&kvm->slots_lock);
/* Verify that there isn't a match already */
@@ -942,6 +960,11 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
mutex_lock(&kvm->slots_lock);
list_for_each_entry(p, &kvm->ioeventfds, list) {
+ /*
+ * No need to match post_addr, ioeventfd_check_collision
+ * prevents duplicate registrations that only differ by
+ * post_addr.
+ */
if (p->bus_idx != bus_idx ||
p->eventfd != eventfd ||
p->addr != args->addr ||
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1bc1da66b4b0..02abca5c49df 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4883,6 +4883,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
case KVM_CAP_IRQFD:
#endif
case KVM_CAP_IOEVENTFD_ANY_LENGTH:
+ case KVM_CAP_IOEVENTFD_POST_WRITE:
case KVM_CAP_CHECK_EXTENSION_VM:
case KVM_CAP_ENABLE_CAP_VM:
case KVM_CAP_HALT_POLL:
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH] KVM: optionally post write on ioeventfd write
2026-03-06 12:56 [PATCH] KVM: optionally post write on ioeventfd write Thanos Makatos
@ 2026-03-12 15:02 ` David Woodhouse
2026-03-12 16:12 ` Thanos Makatos
2026-03-23 15:01 ` Thanos Makatos
1 sibling, 1 reply; 12+ messages in thread
From: David Woodhouse @ 2026-03-12 15:02 UTC (permalink / raw)
To: Thanos Makatos, seanjc@google.com, Stamatis, Ilias, Paul Durrant,
graf
Cc: pbonzini@redhat.com, John Levon, kvm@vger.kernel.org
[-- Attachment #1: Type: text/plain, Size: 1685 bytes --]
On Fri, 2026-03-06 at 12:56 +0000, Thanos Makatos wrote:
> Add a new flag, KVM_IOEVENTFD_FLAG_POST_WRITE, when assigning an
> ioeventfd that results in the value written by the guest to be copied
> to user-supplied memory instead of being discarded.
>
> The goal of this new mechanism is to speed up doorbell writes on NVMe
> controllers emulated outside of the VMM. Currently, a doorbell write to
> an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the
> VMM communicating the event, along with the doorbell value, to the NVMe
> controller emulation task. With POST_WRITE, the NVMe emulation task is
> directly notified of the doorbell write and can find the doorbell value
> in a known location, without involving VMM.
>
> Add tests for this new functionality.
>
> LLM (claude-4.6-opus-high) was used mainly for the tests and to a
> lesser extent for pre-reviewing this patch.
>
> Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
Interesting. Ilias posted a coalesced MMIO patch series¹ a couple of
years ago which addressed similar requirements. IIRC his primary focus
back then was emulating an Intel i82559 NIC, and he identified NVMe
doorbells as another potential use case.
More recently, I think Alex is looking at ways to enable doorbells
directly between guests with as little interaction from the L0
hypervisor as possible.
I'd love to see if this KVM_IOEVENTFD_FLAG_POST_WRITE works for the
i82559 emulation use case. I think back-to-back writes are discarded
with this model, while Ilias's patches would convey each one?
¹ https://lore.kernel.org/all/20240820133333.1724191-1-ilstam@amazon.com/
[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH] KVM: optionally post write on ioeventfd write
2026-03-12 15:02 ` David Woodhouse
@ 2026-03-12 16:12 ` Thanos Makatos
2026-04-21 14:44 ` Sean Christopherson
0 siblings, 1 reply; 12+ messages in thread
From: Thanos Makatos @ 2026-03-12 16:12 UTC (permalink / raw)
To: David Woodhouse, seanjc@google.com, Stamatis, Ilias, Paul Durrant,
graf@amazon.de
Cc: pbonzini@redhat.com, John Levon, kvm@vger.kernel.org
> -----Original Message-----
> From: David Woodhouse <dwmw2@infradead.org>
> Sent: 12 March 2026 15:02
> To: Thanos Makatos <thanos.makatos@nutanix.com>; seanjc@google.com;
> Stamatis, Ilias <ilstam@amazon.com>; Paul Durrant <paul@xen.org>;
> graf@amazon.de
> Cc: pbonzini@redhat.com; John Levon <john.levon@nutanix.com>;
> kvm@vger.kernel.org
> Subject: Re: [PATCH] KVM: optionally post write on ioeventfd write
>
> On Fri, 2026-03-06 at 12:56 +0000, Thanos Makatos wrote:
> > Add a new flag, KVM_IOEVENTFD_FLAG_POST_WRITE, when assigning an
> > ioeventfd that results in the value written by the guest to be copied
> > to user-supplied memory instead of being discarded.
> >
> > The goal of this new mechanism is to speed up doorbell writes on NVMe
> > controllers emulated outside of the VMM. Currently, a doorbell write to
> > an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the
> > VMM communicating the event, along with the doorbell value, to the NVMe
> > controller emulation task. With POST_WRITE, the NVMe emulation task is
> > directly notified of the doorbell write and can find the doorbell value
> > in a known location, without involving VMM.
> >
> > Add tests for this new functionality.
> >
> > LLM (claude-4.6-opus-high) was used mainly for the tests and to a
> > lesser extent for pre-reviewing this patch.
> >
> > Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
>
> Interesting. Ilias posted a coalesced MMIO patch series¹ a couple of
> years ago which addressed similar requirements. IIRC his primary focus
> back then was emulating an Intel i82559 NIC, and he identified NVMe
> doorbells as another potential use case.
>
> More recently, I think Alex is looking at ways to enable doorbells
> directly between guests with as little interaction from the L0
> hypervisor as possible.
Interesting, are there any patches / pointers to this work?
>
> I'd love to see if this KVM_IOEVENTFD_FLAG_POST_WRITE works for the
> i82559 emulation use case. I think back-to-back writes are discarded
> with this model, while Ilias's patches would convey each one?
Yes, they're discarded, only the last write is visible, and this is by design to fit the NVMe doorbell use case.
ioregionfd, https://lore.kernel.org/all/88ca79d2e378dcbfb3988b562ad2c16c4f929ac7.camel@gmail.com, was a similar proposal for forwarding MMIO writes without discarding back-to-back writes.
>
>
> ¹ https://lore.kernel.org/all/20240820133333.1724191-1-
> ilstam@amazon.com/
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] KVM: optionally post write on ioeventfd write
2026-03-12 16:12 ` Thanos Makatos
@ 2026-04-21 14:44 ` Sean Christopherson
0 siblings, 0 replies; 12+ messages in thread
From: Sean Christopherson @ 2026-04-21 14:44 UTC (permalink / raw)
To: Thanos Makatos
Cc: David Woodhouse, Ilias Stamatis, Paul Durrant, graf@amazon.de,
pbonzini@redhat.com, John Levon, kvm@vger.kernel.org
On Thu, Mar 12, 2026, Thanos Makatos wrote:
> > From: David Woodhouse <dwmw2@infradead.org>
> > On Fri, 2026-03-06 at 12:56 +0000, Thanos Makatos wrote:
> > > Add a new flag, KVM_IOEVENTFD_FLAG_POST_WRITE, when assigning an
> > > ioeventfd that results in the value written by the guest to be copied
> > > to user-supplied memory instead of being discarded.
> > >
> > > The goal of this new mechanism is to speed up doorbell writes on NVMe
> > > controllers emulated outside of the VMM. Currently, a doorbell write to
> > > an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the
> > > VMM communicating the event, along with the doorbell value, to the NVMe
> > > controller emulation task. With POST_WRITE, the NVMe emulation task is
> > > directly notified of the doorbell write and can find the doorbell value
> > > in a known location, without involving VMM.
...
> > I'd love to see if this KVM_IOEVENTFD_FLAG_POST_WRITE works for the
> > i82559 emulation use case. I think back-to-back writes are discarded
> > with this model, while Ilias's patches would convey each one?
Do you happen to know the requirements for i82559 emulation? I tried reading the
spec and QEMU's code, and that was just a waste of ~20 minutes :-)
> Yes, they're discarded, only the last write is visible, and this is by design
> to fit the NVMe doorbell use case.
I wouldn't say the design is specifically to fit the NVMe doorbell use case,
rather that KVM doesn't need to convey each write to support NVMe doorbells, and
forwarding only the most recent value is a massive "win" for complexity.
Which, for me, is also the argument for accepting KVM_IOEVENTFD_FLAG_POST_WRITE
even if there are a limited number of use cases: it's simple (and performant)
enough that it's probably worth supporting even if similar functionality can be
implemented via polling on coalesced I/O buffers. I.e. maintaining both doesn't
seem too onerous, if that's where we end up.
> ioregionfd,
> https://lore.kernel.org/all/88ca79d2e378dcbfb3988b562ad2c16c4f929ac7.camel@gmail.com,
> was a similar proposal for forwarding MMIO writes without discarding
> back-to-back writes.
And if you were curious what the code looked like:
https://lore.kernel.org/kvm/cover.1613828726.git.eafanasova@gmail.com
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH] KVM: optionally post write on ioeventfd write
2026-03-06 12:56 [PATCH] KVM: optionally post write on ioeventfd write Thanos Makatos
2026-03-12 15:02 ` David Woodhouse
@ 2026-03-23 15:01 ` Thanos Makatos
2026-04-10 19:11 ` Thanos Makatos
1 sibling, 1 reply; 12+ messages in thread
From: Thanos Makatos @ 2026-03-23 15:01 UTC (permalink / raw)
To: seanjc@google.com, Paolo Bonzini; +Cc: John Levon, kvm@vger.kernel.org
Hi Paolo, Sean,
Just a gentle ping on this, happy to follow up if needed.
> -----Original Message-----
> From: Thanos Makatos <thanos.makatos@nutanix.com>
> Sent: 06 March 2026 12:57
> To: seanjc@google.com
> Cc: pbonzini@redhat.com; John Levon <john.levon@nutanix.com>;
> kvm@vger.kernel.org; Thanos Makatos <thanos.makatos@nutanix.com>
> Subject: [PATCH] KVM: optionally post write on ioeventfd write
>
> Add a new flag, KVM_IOEVENTFD_FLAG_POST_WRITE, when assigning an
> ioeventfd that results in the value written by the guest to be copied
> to user-supplied memory instead of being discarded.
>
> The goal of this new mechanism is to speed up doorbell writes on NVMe
> controllers emulated outside of the VMM. Currently, a doorbell write to
> an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the
> VMM communicating the event, along with the doorbell value, to the NVMe
> controller emulation task. With POST_WRITE, the NVMe emulation task is
> directly notified of the doorbell write and can find the doorbell value
> in a known location, without involving VMM.
>
> Add tests for this new functionality.
>
> LLM (claude-4.6-opus-high) was used mainly for the tests and to a
> lesser extent for pre-reviewing this patch.
>
> Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
> ---
> Documentation/virt/kvm/api.rst | 13 +-
> include/uapi/linux/kvm.h | 6 +-
> tools/testing/selftests/kvm/Makefile.kvm | 1 +
> tools/testing/selftests/kvm/ioeventfd_test.c | 624 +++++++++++++++++++
> virt/kvm/eventfd.c | 23 +
> virt/kvm/kvm_main.c | 1 +
> 6 files changed, 666 insertions(+), 2 deletions(-)
> create mode 100644 tools/testing/selftests/kvm/ioeventfd_test.c
>
> diff --git a/Documentation/virt/kvm/api.rst
> b/Documentation/virt/kvm/api.rst
> index 6f85e1b321dd..b8d030f03101 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -2109,7 +2109,8 @@ provided event instead of triggering an exit.
> __u32 len; /* 0, 1, 2, 4, or 8 bytes */
> __s32 fd;
> __u32 flags;
> - __u8 pad[36];
> + __aligned_u64 post_addr; /* address to write to if POST_WRITE is set
> */
> + __u8 pad[24];
> };
>
> For the special case of virtio-ccw devices on s390, the ioevent is matched
> @@ -2122,6 +2123,7 @@ The following flags are defined::
> #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 <<
> kvm_ioeventfd_flag_nr_deassign)
> #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
> (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
> + #define KVM_IOEVENTFD_FLAG_POST_WRITE (1 <<
> kvm_ioeventfd_flag_nr_post_write)
>
> If datamatch flag is set, the event will be signaled only if the written value
> to the registered address is equal to datamatch in struct kvm_ioeventfd.
> @@ -2134,6 +2136,15 @@ the kernel will ignore the length of guest write
> and may get a faster vmexit.
> The speedup may only apply to specific architectures, but the ioeventfd will
> work anyway.
>
> +With KVM_IOEVENTFD_FLAG_POST_WRITE, the value being written is copied
> to the
> +userspace address specified by post_addr, and the eventfd is signaled. The
> +copy is guaranteed to complete before the eventfd is signaled, so a userspace
> +reader that wakes on the eventfd will observe the written value. When
> multiple
> +vCPUs write to the same ioeventfd concurrently, the value at post_addr
> reflects
> +one of the writes. If the copy to post_addr fails (e.g. the memory has been
> +unmapped), the eventfd is not signaled and the write is reported to
> userspace
> +as a regular MMIO/PIO exit.
> +
> 4.60 KVM_DIRTY_TLB
> ------------------
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 65500f5db379..55b8683a856f 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -639,6 +639,7 @@ enum {
> kvm_ioeventfd_flag_nr_deassign,
> kvm_ioeventfd_flag_nr_virtio_ccw_notify,
> kvm_ioeventfd_flag_nr_fast_mmio,
> + kvm_ioeventfd_flag_nr_post_write,
> kvm_ioeventfd_flag_nr_max,
> };
>
> @@ -647,6 +648,7 @@ enum {
> #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 <<
> kvm_ioeventfd_flag_nr_deassign)
> #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
> (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
> +#define KVM_IOEVENTFD_FLAG_POST_WRITE (1 <<
> kvm_ioeventfd_flag_nr_post_write)
>
> #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 <<
> kvm_ioeventfd_flag_nr_max) - 1)
>
> @@ -656,7 +658,8 @@ struct kvm_ioeventfd {
> __u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
> __s32 fd;
> __u32 flags;
> - __u8 pad[36];
> + __aligned_u64 post_addr; /* address to write to if POST_WRITE is set
> */
> + __u8 pad[24];
> };
>
> #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
> @@ -985,6 +988,7 @@ struct kvm_enable_cap {
> #define KVM_CAP_ARM_SEA_TO_USER 245
> #define KVM_CAP_S390_USER_OPEREXEC 246
> #define KVM_CAP_S390_KEYOP 247
> +#define KVM_CAP_IOEVENTFD_POST_WRITE 248
>
> struct kvm_irq_routing_irqchip {
> __u32 irqchip;
> diff --git a/tools/testing/selftests/kvm/Makefile.kvm
> b/tools/testing/selftests/kvm/Makefile.kvm
> index fdec90e85467..7ab470981c31 100644
> --- a/tools/testing/selftests/kvm/Makefile.kvm
> +++ b/tools/testing/selftests/kvm/Makefile.kvm
> @@ -64,6 +64,7 @@ TEST_GEN_PROGS_COMMON += kvm_binary_stats_test
> TEST_GEN_PROGS_COMMON += kvm_create_max_vcpus
> TEST_GEN_PROGS_COMMON += kvm_page_table_test
> TEST_GEN_PROGS_COMMON += set_memory_region_test
> +TEST_GEN_PROGS_COMMON += ioeventfd_test
>
> # Compiled test targets
> TEST_GEN_PROGS_x86 = $(TEST_GEN_PROGS_COMMON)
> diff --git a/tools/testing/selftests/kvm/ioeventfd_test.c
> b/tools/testing/selftests/kvm/ioeventfd_test.c
> new file mode 100644
> index 000000000000..24875a2562d4
> --- /dev/null
> +++ b/tools/testing/selftests/kvm/ioeventfd_test.c
> @@ -0,0 +1,624 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * ioeventfd_test.c - Tests for KVM_IOEVENTFD_FLAG_POST_WRITE.
> + *
> + * Tests that when KVM_IOEVENTFD_FLAG_POST_WRITE is set and the
> MMIO/PIO
> + * address is written to, the value is copied to the user-provided address
> + * and the eventfd is signaled. Also tests negative cases and interactions
> + * with DATAMATCH.
> + *
> + * Copyright Nutanix, 2026
> + *
> + * Author: Thanos Makatos <thanos.makatos@nutanix.com>
> + */
> +
> +#include <errno.h>
> +#include <poll.h>
> +#include <string.h>
> +
> +#include "kvm_util.h"
> +#include "processor.h"
> +#include "ucall_common.h"
> +
> +#define MMIO_GPA (1UL << 30)
> +#define PIO_PORT 0xe000
> +#define TEST_VAL 0xDEADBEEFCAFEBABEULL
> +#define MATCH_VAL 0x42U
> +#define NOMATCH_VAL (MATCH_VAL + 1)
> +#define POISON_VAL 0xFFFFFFFFU
> +
> +/*
> + * Check that the most recent vCPU exit is a ucall (delivered as KVM_EXIT_IO
> + * on x86) matching @expected_cmd. The caller must have already called
> + * vcpu_run().
> + *
> + * @expected_cmd: UCALL_SYNC, UCALL_DONE, etc.
> + * @expected_stage: for UCALL_SYNC, the stage number passed by
> GUEST_SYNC().
> + * Ignored for other ucall types.
> + *
> + * Aborts the test on UCALL_ABORT (a guest-side assertion failure).
> + */
> +static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_cmd,
> + uint64_t expected_stage)
> +{
> + struct ucall uc;
> +
> + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> +
> + switch (get_ucall(vcpu, &uc)) {
> + case UCALL_ABORT:
> + REPORT_GUEST_ASSERT(uc);
> + break;
> + case UCALL_SYNC:
> + TEST_ASSERT(expected_cmd == UCALL_SYNC,
> + "got UCALL_SYNC, expected %lu",
> + expected_cmd);
> + TEST_ASSERT(uc.args[1] == expected_stage,
> + "expected stage %lu, got %lu",
> + expected_stage, uc.args[1]);
> + break;
> + case UCALL_DONE:
> + TEST_ASSERT(expected_cmd == UCALL_DONE,
> + "got UCALL_DONE, expected %lu",
> + expected_cmd);
> + break;
> + default:
> + TEST_FAIL("unexpected ucall %lu", uc.cmd);
> + }
> +}
> +
> +/*
> + * Verify that KVM_IOEVENTFD rejects invalid POST_WRITE configurations:
> + * - len=0: the kernel needs a non-zero length to know how many bytes to
> copy.
> + * - post_addr=NULL: there is no destination for the copy.
> + * - post_addr outside the process address space: access_ok() rejects it.
> + * All three must fail with EINVAL.
> + */
> +static void test_post_write_negative(void)
> +{
> + struct kvm_ioeventfd ioeventfd;
> + struct kvm_vm *vm;
> + uint64_t dummy;
> + int ret;
> + int fd;
> +
> + vm = vm_create_barebones();
> + fd = kvm_new_eventfd();
> +
> + /* length cannot be zero */
> + ioeventfd = (struct kvm_ioeventfd) {
> + .addr = MMIO_GPA,
> + .len = 0,
> + .fd = fd,
> + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
> + .post_addr = (u64)&dummy,
> + };
> + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> + TEST_ASSERT(ret && errno == EINVAL,
> + "len=0: expected EINVAL, got ret=%d errno=%d", ret, errno);
> +
> + /* post_addr cannot be NULL */
> + ioeventfd.len = 4;
> + ioeventfd.post_addr = 0ULL;
> + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> + TEST_ASSERT(ret && errno == EINVAL,
> + "NULL post_addr: expected EINVAL, got ret=%d errno=%d",
> + ret, errno);
> +
> + /* bogus post_addr */
> + ioeventfd.post_addr = (u64)0xdeaddeaddeaddeadULL;
> + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> + TEST_ASSERT(ret && errno == EINVAL,
> + "bad post_addr: expected EINVAL, got ret=%d errno=%d",
> + ret, errno);
> +
> + close(fd);
> + kvm_vm_free(vm);
> +}
> +
> +#define DEFINE_GUEST_WRITE_FN(suffix, type) \
> +static void guest_code_w##suffix(void) { \
> + *(volatile type *)MMIO_GPA = (type)TEST_VAL; \
> + GUEST_DONE(); \
> +}
> +
> +DEFINE_GUEST_WRITE_FN(1, uint8_t)
> +DEFINE_GUEST_WRITE_FN(2, uint16_t)
> +DEFINE_GUEST_WRITE_FN(4, uint32_t)
> +DEFINE_GUEST_WRITE_FN(8, uint64_t)
> +
> +/*
> + * Verify that ioeventfd_write copies exactly @width bytes to post_addr for
> + * each supported MMIO write width (1, 2, 4, 8). The guest writes the low
> + * @width bytes of TEST_VAL; the host checks that exactly those bytes land
> + * at post_addr and the eventfd is signaled.
> + */
> +static void test_post_write_width(int width, void (*guest_fn)(void))
> +{
> + uint64_t actual, expected, count;
> + struct kvm_ioeventfd ioeventfd;
> + struct kvm_vcpu *vcpu;
> + struct kvm_vm *vm;
> + int fd, ret;
> +
> + /* need to initialize to 0 because the guest writes the low @width
> bytes */
> + actual = 0;
> + expected = 0;
> +
> + vm = vm_create_with_one_vcpu(&vcpu, guest_fn);
> + virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
> +
> + fd = kvm_new_eventfd();
> +
> + ioeventfd = (struct kvm_ioeventfd) {
> + .addr = MMIO_GPA,
> + .len = width,
> + .fd = fd,
> + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
> + .post_addr = (u64)&actual,
> + };
> +
> + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> +
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_DONE, 0);
> +
> + ret = read(fd, &count, sizeof(count));
> + TEST_ASSERT(ret == sizeof(count),
> + "eventfd read failed: ret=%d errno=%d", ret, errno);
> +
> + memcpy(&expected, &(uint64_t){TEST_VAL}, width);
> + TEST_ASSERT_EQ(actual, expected);
> +
> + close(fd);
> + kvm_vm_free(vm);
> +}
> +
> +static void guest_code_datamatch(void)
> +{
> + *(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
> + GUEST_SYNC(1);
> + *(volatile uint32_t *)MMIO_GPA = NOMATCH_VAL;
> + GUEST_SYNC(2);
> + GUEST_DONE();
> +}
> +
> +/*
> + * Test the interaction between DATAMATCH and POST_WRITE. When both
> flags are
> + * set, ioeventfd_write should only fire (signal eventfd + copy value) when the
> + * written value matches datamatch. A non-matching write must leave the
> eventfd
> + * unsignaled and post_addr untouched, and fall through to
> KVM_EXIT_MMIO.
> + */
> +static void test_post_write_datamatch(void)
> +{
> + struct kvm_ioeventfd ioeventfd;
> + struct kvm_vcpu *vcpu;
> + struct kvm_run *run;
> + struct kvm_vm *vm;
> + struct pollfd pfd;
> + uint64_t count;
> + uint32_t actual;
> + int fd, ret;
> +
> + actual = POISON_VAL;
> +
> + vm = vm_create_with_one_vcpu(&vcpu, guest_code_datamatch);
> + virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
> + run = vcpu->run;
> +
> + fd = kvm_new_eventfd();
> + pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
> +
> + ioeventfd = (struct kvm_ioeventfd) {
> + .datamatch = MATCH_VAL,
> + .addr = MMIO_GPA,
> + .len = 4,
> + .fd = fd,
> + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
> + KVM_IOEVENTFD_FLAG_DATAMATCH,
> + .post_addr = (u64)&actual,
> + };
> +
> + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> +
> + /*
> + * Guest writes MATCH_VAL → ioeventfd fires (value copied, eventfd
> + * signaled), vCPU continues, then GUEST_SYNC(1).
> + */
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_SYNC, 1);
> + TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
> + "eventfd read failed: errno=%d", errno);
> + TEST_ASSERT_EQ(actual, MATCH_VAL);
> +
> + actual = POISON_VAL;
> +
> + /*
> + * Guest writes NOMATCH_VAL → ioeventfd_in_range() returns false,
> bus
> + * returns -EOPNOTSUPP → KVM_EXIT_MMIO to userspace.
> + */
> + vcpu_run(vcpu);
> + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO);
> + TEST_ASSERT(run->mmio.is_write, "expected MMIO write");
> + TEST_ASSERT(run->mmio.phys_addr == MMIO_GPA,
> + "expected MMIO at 0x%lx, got 0x%llx",
> + MMIO_GPA, run->mmio.phys_addr);
> +
> + /* Re-enter: KVM completes the MMIO, guest runs to
> GUEST_SYNC(2). */
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_SYNC, 2);
> +
> + TEST_ASSERT(poll(&pfd, 1, 0) == 0,
> + "eventfd should not be signaled after non-matching write");
> + TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
> +
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_DONE, 0);
> +
> + close(fd);
> + kvm_vm_free(vm);
> +}
> +
> +static void guest_code_multi(void)
> +{
> + *(volatile uint32_t *)MMIO_GPA = 0x11111111;
> + GUEST_SYNC(1);
> + *(volatile uint32_t *)MMIO_GPA = 0x22222222;
> + GUEST_SYNC(2);
> + *(volatile uint32_t *)MMIO_GPA = 0x33333333;
> + GUEST_SYNC(3);
> + GUEST_DONE();
> +}
> +
> +/*
> + * Verify that post_addr is updated on every MMIO write, not just the first.
> + * The guest writes three distinct values in sequence; the host checks after
> + * each one that post_addr holds the latest value and the eventfd is signaled
> + * each time.
> + */
> +static void test_post_write_multi(void)
> +{
> + static const uint32_t expected[] = {
> + 0x11111111, 0x22222222, 0x33333333,
> + };
> + struct kvm_ioeventfd ioeventfd;
> + struct kvm_vcpu *vcpu;
> + struct kvm_vm *vm;
> + uint64_t count;
> + uint32_t actual;
> + int fd, ret, i;
> +
> + actual = POISON_VAL;
> +
> + vm = vm_create_with_one_vcpu(&vcpu, guest_code_multi);
> + virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
> +
> + fd = kvm_new_eventfd();
> +
> + ioeventfd = (struct kvm_ioeventfd) {
> + .addr = MMIO_GPA,
> + .len = 4,
> + .fd = fd,
> + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
> + .post_addr = (u64)&actual,
> + };
> +
> + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> +
> + for (i = 0; i < ARRAY_SIZE(expected); i++) {
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_SYNC, i + 1);
> + TEST_ASSERT(read(fd, &count, sizeof(count)) ==
> sizeof(count),
> + "eventfd read failed: errno=%d", errno);
> + TEST_ASSERT_EQ(actual, expected[i]);
> + }
> +
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_DONE, 0);
> +
> + close(fd);
> + kvm_vm_free(vm);
> +}
> +
> +static void guest_code_multi_nosync(void)
> +{
> + *(volatile uint32_t *)MMIO_GPA = 0x11111111;
> + *(volatile uint32_t *)MMIO_GPA = 0x22222222;
> + *(volatile uint32_t *)MMIO_GPA = 0x33333333;
> + GUEST_DONE();
> +}
> +
> +/*
> + * Variant of the multi-write test where the guest performs three consecutive
> + * MMIO writes with no GUEST_SYNC in between. All three are handled in-
> kernel
> + * by ioeventfd before the vCPU exits at GUEST_DONE. Verify that:
> + * - post_addr reflects the last written value (0x33333333).
> + * - A single eventfd read() returns a counter of 3 (one signal per write).
> + */
> +static void test_post_write_multi_nosync(void)
> +{
> + struct kvm_ioeventfd ioeventfd;
> + struct kvm_vcpu *vcpu;
> + struct kvm_vm *vm;
> + uint64_t count;
> + uint32_t actual;
> + int fd, ret;
> +
> + actual = POISON_VAL;
> +
> + vm = vm_create_with_one_vcpu(&vcpu, guest_code_multi_nosync);
> + virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
> +
> + fd = kvm_new_eventfd();
> +
> + ioeventfd = (struct kvm_ioeventfd) {
> + .addr = MMIO_GPA,
> + .len = 4,
> + .fd = fd,
> + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
> + .post_addr = (u64)&actual,
> + };
> +
> + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> +
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_DONE, 0);
> +
> + ret = read(fd, &count, sizeof(count));
> + TEST_ASSERT(ret == sizeof(count),
> + "eventfd read failed: ret=%d errno=%d", ret, errno);
> + TEST_ASSERT_EQ(count, (uint64_t)3);
> + TEST_ASSERT_EQ(actual, (uint32_t)0x33333333);
> +
> + close(fd);
> + kvm_vm_free(vm);
> +}
> +
> +static void guest_code_deassign(void)
> +{
> + *(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
> + GUEST_SYNC(1);
> + *(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
> + GUEST_DONE();
> +}
> +
> +/*
> + * Verify that deassigning an ioeventfd with POST_WRITE fully removes it
> from
> + * the I/O bus.
> + */
> +static void test_post_write_deassign(void)
> +{
> + struct kvm_ioeventfd ioeventfd;
> + struct kvm_vcpu *vcpu;
> + struct kvm_run *run;
> + struct kvm_vm *vm;
> + struct pollfd pfd;
> + uint64_t count;
> + uint32_t actual;
> + int fd, ret;
> +
> + actual = POISON_VAL;
> +
> + vm = vm_create_with_one_vcpu(&vcpu, guest_code_deassign);
> + virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
> + run = vcpu->run;
> +
> + fd = kvm_new_eventfd();
> + pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
> +
> + ioeventfd = (struct kvm_ioeventfd) {
> + .addr = MMIO_GPA,
> + .len = 4,
> + .fd = fd,
> + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
> + .post_addr = (u64)&actual,
> + };
> +
> + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> + TEST_ASSERT(!ret, "KVM_IOEVENTFD assign failed: %s",
> strerror(errno));
> +
> + /*
> + * Guest writes MATCH_VAL → ioeventfd fires, then GUEST_SYNC(1).
> + */
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_SYNC, 1);
> + TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
> + "eventfd read failed: errno=%d", errno);
> + TEST_ASSERT_EQ(actual, MATCH_VAL);
> +
> + /* Deassign the ioeventfd. */
> + ioeventfd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
> + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> + TEST_ASSERT(!ret, "KVM_IOEVENTFD deassign failed: %s",
> strerror(errno));
> +
> + actual = POISON_VAL;
> +
> + /*
> + * Guest writes MATCH_VAL again → no handler on the bus →
> + * KVM_EXIT_MMIO to userspace.
> + */
> + vcpu_run(vcpu);
> + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO);
> + TEST_ASSERT(run->mmio.is_write, "expected MMIO write");
> + TEST_ASSERT(run->mmio.phys_addr == MMIO_GPA,
> + "expected MMIO at 0x%lx, got 0x%llx",
> + MMIO_GPA, run->mmio.phys_addr);
> +
> + /* Re-enter: KVM completes MMIO, guest runs to GUEST_DONE. */
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_DONE, 0);
> +
> + TEST_ASSERT(poll(&pfd, 1, 0) == 0,
> + "eventfd should not be signaled after deassign");
> + TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
> +
> + close(fd);
> + kvm_vm_free(vm);
> +}
> +
> +#ifdef __x86_64__
> +static void guest_code_pio(void)
> +{
> + outl(PIO_PORT, (uint32_t)TEST_VAL);
> + GUEST_DONE();
> +}
> +
> +/*
> + * Verify that POST_WRITE works on the PIO bus (KVM_PIO_BUS), not just
> MMIO.
> + * The guest does an outl to PIO_PORT; the host checks that the written value
> + * is copied to post_addr and the eventfd is signaled.
> + */
> +static void test_post_write_pio(void)
> +{
> + struct kvm_ioeventfd ioeventfd;
> + struct kvm_vcpu *vcpu;
> + struct kvm_vm *vm;
> + uint64_t count;
> + uint32_t actual;
> + int fd, ret;
> +
> + actual = POISON_VAL;
> +
> + vm = vm_create_with_one_vcpu(&vcpu, guest_code_pio);
> +
> + fd = kvm_new_eventfd();
> +
> + ioeventfd = (struct kvm_ioeventfd) {
> + .addr = PIO_PORT,
> + .len = 4,
> + .fd = fd,
> + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
> + KVM_IOEVENTFD_FLAG_PIO,
> + .post_addr = (u64)&actual,
> + };
> +
> + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> +
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_DONE, 0);
> +
> + ret = read(fd, &count, sizeof(count));
> + TEST_ASSERT(ret == sizeof(count),
> + "eventfd read failed: ret=%d errno=%d", ret, errno);
> +
> + TEST_ASSERT_EQ(actual, (uint32_t)TEST_VAL);
> +
> + close(fd);
> + kvm_vm_free(vm);
> +}
> +
> +static void guest_code_pio_datamatch(void)
> +{
> + outl(PIO_PORT, MATCH_VAL);
> + GUEST_SYNC(1);
> + outl(PIO_PORT, NOMATCH_VAL);
> + GUEST_SYNC(2);
> + GUEST_DONE();
> +}
> +
> +/*
> + * Test POST_WRITE + PIO + DATAMATCH together. When all three flags are
> set,
> + * the ioeventfd should only fire when the outl value matches datamatch.
> + * A non-matching outl must fall through to KVM_EXIT_IO (port I/O exit),
> + * leaving the eventfd unsignaled and post_addr untouched.
> + */
> +static void test_post_write_pio_datamatch(void)
> +{
> + struct kvm_ioeventfd ioeventfd;
> + struct kvm_vcpu *vcpu;
> + struct kvm_run *run;
> + struct kvm_vm *vm;
> + struct pollfd pfd;
> + uint64_t count;
> + uint32_t actual;
> + int fd, ret;
> +
> + actual = POISON_VAL;
> +
> + vm = vm_create_with_one_vcpu(&vcpu, guest_code_pio_datamatch);
> + run = vcpu->run;
> +
> + fd = kvm_new_eventfd();
> + pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
> +
> + ioeventfd = (struct kvm_ioeventfd) {
> + .datamatch = MATCH_VAL,
> + .addr = PIO_PORT,
> + .len = 4,
> + .fd = fd,
> + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
> + KVM_IOEVENTFD_FLAG_PIO |
> + KVM_IOEVENTFD_FLAG_DATAMATCH,
> + .post_addr = (u64)&actual,
> + };
> +
> + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> +
> + /*
> + * Guest does outl MATCH_VAL → ioeventfd fires, then
> GUEST_SYNC(1).
> + */
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_SYNC, 1);
> + TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
> + "eventfd read failed: errno=%d", errno);
> + TEST_ASSERT_EQ(actual, MATCH_VAL);
> +
> + actual = POISON_VAL;
> +
> + /*
> + * Guest does outl NOMATCH_VAL → no match → KVM_EXIT_IO.
> + */
> + vcpu_run(vcpu);
> + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> + TEST_ASSERT(run->io.direction == KVM_EXIT_IO_OUT,
> + "expected PIO write");
> + TEST_ASSERT(run->io.port == PIO_PORT,
> + "expected PIO at 0x%x, got 0x%x",
> + PIO_PORT, run->io.port);
> +
> + /* Re-enter: guest continues to GUEST_SYNC(2). */
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_SYNC, 2);
> +
> + TEST_ASSERT(poll(&pfd, 1, 0) == 0,
> + "eventfd should not be signaled after non-matching PIO
> write");
> + TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
> +
> + /* GUEST_DONE */
> + vcpu_run(vcpu);
> + assert_ucall(vcpu, UCALL_DONE, 0);
> +
> + close(fd);
> + kvm_vm_free(vm);
> +}
> +#endif
> +
> +int main(void)
> +{
> +
> TEST_REQUIRE(kvm_check_cap(KVM_CAP_IOEVENTFD_POST_WRITE
> ));
> +
> + test_post_write_negative();
> +
> + test_post_write_width(1, guest_code_w1);
> + test_post_write_width(2, guest_code_w2);
> + test_post_write_width(4, guest_code_w4);
> + test_post_write_width(8, guest_code_w8);
> +
> + test_post_write_datamatch();
> + test_post_write_multi();
> + test_post_write_multi_nosync();
> + test_post_write_deassign();
> +
> +#ifdef __x86_64__
> + test_post_write_pio();
> + test_post_write_pio_datamatch();
> +#endif
> +
> + return 0;
> +}
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 0e8b8a2c5b79..22bc49a41503 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -741,6 +741,7 @@ struct _ioeventfd {
> struct kvm_io_device dev;
> u8 bus_idx;
> bool wildcard;
> + void __user *post_addr;
> };
>
> static inline struct _ioeventfd *
> @@ -812,6 +813,9 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct
> kvm_io_device *this, gpa_t addr,
> if (!ioeventfd_in_range(p, addr, len, val))
> return -EOPNOTSUPP;
>
> + if (p->post_addr && len > 0 && __copy_to_user(p->post_addr, val,
> len))
> + return -EFAULT;
> +
> eventfd_signal(p->eventfd);
> return 0;
> }
> @@ -866,6 +870,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
> {
>
> struct eventfd_ctx *eventfd;
> + void __user *post_addr;
> struct _ioeventfd *p;
> int ret;
>
> @@ -873,6 +878,16 @@ static int kvm_assign_ioeventfd_idx(struct kvm
> *kvm,
> if (IS_ERR(eventfd))
> return PTR_ERR(eventfd);
>
> + post_addr = u64_to_user_ptr(args->post_addr);
> + if ((args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) &&
> + (!args->len || !post_addr ||
> + args->post_addr != untagged_addr(args->post_addr) ||
> + !access_ok(post_addr, args->len))) {
> + /* In KVM's ABI, post_addr must be non-NULL. */
> + ret = -EINVAL;
> + goto fail;
> + }
> +
> p = kzalloc_obj(*p, GFP_KERNEL_ACCOUNT);
> if (!p) {
> ret = -ENOMEM;
> @@ -891,6 +906,9 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
> else
> p->wildcard = true;
>
> + if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE)
> + p->post_addr = post_addr;
> +
> mutex_lock(&kvm->slots_lock);
>
> /* Verify that there isn't a match already */
> @@ -942,6 +960,11 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm,
> enum kvm_bus bus_idx,
> mutex_lock(&kvm->slots_lock);
>
> list_for_each_entry(p, &kvm->ioeventfds, list) {
> + /*
> + * No need to match post_addr, ioeventfd_check_collision
> + * prevents duplicate registrations that only differ by
> + * post_addr.
> + */
> if (p->bus_idx != bus_idx ||
> p->eventfd != eventfd ||
> p->addr != args->addr ||
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 1bc1da66b4b0..02abca5c49df 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -4883,6 +4883,7 @@ static int
> kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
> case KVM_CAP_IRQFD:
> #endif
> case KVM_CAP_IOEVENTFD_ANY_LENGTH:
> + case KVM_CAP_IOEVENTFD_POST_WRITE:
> case KVM_CAP_CHECK_EXTENSION_VM:
> case KVM_CAP_ENABLE_CAP_VM:
> case KVM_CAP_HALT_POLL:
> --
> 2.47.3
^ permalink raw reply [flat|nested] 12+ messages in thread
* RE: [PATCH] KVM: optionally post write on ioeventfd write
2026-03-23 15:01 ` Thanos Makatos
@ 2026-04-10 19:11 ` Thanos Makatos
2026-04-21 14:45 ` Sean Christopherson
0 siblings, 1 reply; 12+ messages in thread
From: Thanos Makatos @ 2026-04-10 19:11 UTC (permalink / raw)
To: Thanos Makatos, seanjc@google.com, Paolo Bonzini
Cc: John Levon, kvm@vger.kernel.org
Hi Paolo, Sean,
Following up on this, please let me know whether this is something I
should revise/resend.
> -----Original Message-----
> From: Thanos Makatos <thanos.makatos@nutanix.com>
> Sent: 23 March 2026 17:02
> To: seanjc@google.com; Paolo Bonzini <pbonzini@redhat.com>
> Cc: John Levon <john.levon@nutanix.com>; kvm@vger.kernel.org
> Subject: RE: [PATCH] KVM: optionally post write on ioeventfd write
>
> Hi Paolo, Sean,
>
> Just a gentle ping on this, happy to follow up if needed.
>
> > -----Original Message-----
> > From: Thanos Makatos <thanos.makatos@nutanix.com>
> > Sent: 06 March 2026 12:57
> > To: seanjc@google.com
> > Cc: pbonzini@redhat.com; John Levon <john.levon@nutanix.com>;
> > kvm@vger.kernel.org; Thanos Makatos <thanos.makatos@nutanix.com>
> > Subject: [PATCH] KVM: optionally post write on ioeventfd write
> >
> > Add a new flag, KVM_IOEVENTFD_FLAG_POST_WRITE, when assigning an
> > ioeventfd that results in the value written by the guest to be copied
> > to user-supplied memory instead of being discarded.
> >
> > The goal of this new mechanism is to speed up doorbell writes on NVMe
> > controllers emulated outside of the VMM. Currently, a doorbell write to
> > an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the
> > VMM communicating the event, along with the doorbell value, to the NVMe
> > controller emulation task. With POST_WRITE, the NVMe emulation task is
> > directly notified of the doorbell write and can find the doorbell value
> > in a known location, without involving VMM.
> >
> > Add tests for this new functionality.
> >
> > LLM (claude-4.6-opus-high) was used mainly for the tests and to a
> > lesser extent for pre-reviewing this patch.
> >
> > Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
> > ---
> > Documentation/virt/kvm/api.rst | 13 +-
> > include/uapi/linux/kvm.h | 6 +-
> > tools/testing/selftests/kvm/Makefile.kvm | 1 +
> > tools/testing/selftests/kvm/ioeventfd_test.c | 624 +++++++++++++++++++
> > virt/kvm/eventfd.c | 23 +
> > virt/kvm/kvm_main.c | 1 +
> > 6 files changed, 666 insertions(+), 2 deletions(-)
> > create mode 100644 tools/testing/selftests/kvm/ioeventfd_test.c
> >
> > diff --git a/Documentation/virt/kvm/api.rst
> > b/Documentation/virt/kvm/api.rst
> > index 6f85e1b321dd..b8d030f03101 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -2109,7 +2109,8 @@ provided event instead of triggering an exit.
> > __u32 len; /* 0, 1, 2, 4, or 8 bytes */
> > __s32 fd;
> > __u32 flags;
> > - __u8 pad[36];
> > + __aligned_u64 post_addr; /* address to write to if POST_WRITE is set
> > */
> > + __u8 pad[24];
> > };
> >
> > For the special case of virtio-ccw devices on s390, the ioevent is matched
> > @@ -2122,6 +2123,7 @@ The following flags are defined::
> > #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 <<
> > kvm_ioeventfd_flag_nr_deassign)
> > #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
> > (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
> > + #define KVM_IOEVENTFD_FLAG_POST_WRITE (1 <<
> > kvm_ioeventfd_flag_nr_post_write)
> >
> > If datamatch flag is set, the event will be signaled only if the written value
> > to the registered address is equal to datamatch in struct kvm_ioeventfd.
> > @@ -2134,6 +2136,15 @@ the kernel will ignore the length of guest write
> > and may get a faster vmexit.
> > The speedup may only apply to specific architectures, but the ioeventfd will
> > work anyway.
> >
> > +With KVM_IOEVENTFD_FLAG_POST_WRITE, the value being written is
> copied
> > to the
> > +userspace address specified by post_addr, and the eventfd is signaled. The
> > +copy is guaranteed to complete before the eventfd is signaled, so a
> userspace
> > +reader that wakes on the eventfd will observe the written value. When
> > multiple
> > +vCPUs write to the same ioeventfd concurrently, the value at post_addr
> > reflects
> > +one of the writes. If the copy to post_addr fails (e.g. the memory has been
> > +unmapped), the eventfd is not signaled and the write is reported to
> > userspace
> > +as a regular MMIO/PIO exit.
> > +
> > 4.60 KVM_DIRTY_TLB
> > ------------------
> >
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 65500f5db379..55b8683a856f 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -639,6 +639,7 @@ enum {
> > kvm_ioeventfd_flag_nr_deassign,
> > kvm_ioeventfd_flag_nr_virtio_ccw_notify,
> > kvm_ioeventfd_flag_nr_fast_mmio,
> > + kvm_ioeventfd_flag_nr_post_write,
> > kvm_ioeventfd_flag_nr_max,
> > };
> >
> > @@ -647,6 +648,7 @@ enum {
> > #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 <<
> > kvm_ioeventfd_flag_nr_deassign)
> > #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
> > (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
> > +#define KVM_IOEVENTFD_FLAG_POST_WRITE (1 <<
> > kvm_ioeventfd_flag_nr_post_write)
> >
> > #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 <<
> > kvm_ioeventfd_flag_nr_max) - 1)
> >
> > @@ -656,7 +658,8 @@ struct kvm_ioeventfd {
> > __u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
> > __s32 fd;
> > __u32 flags;
> > - __u8 pad[36];
> > + __aligned_u64 post_addr; /* address to write to if POST_WRITE is set
> > */
> > + __u8 pad[24];
> > };
> >
> > #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
> > @@ -985,6 +988,7 @@ struct kvm_enable_cap {
> > #define KVM_CAP_ARM_SEA_TO_USER 245
> > #define KVM_CAP_S390_USER_OPEREXEC 246
> > #define KVM_CAP_S390_KEYOP 247
> > +#define KVM_CAP_IOEVENTFD_POST_WRITE 248
> >
> > struct kvm_irq_routing_irqchip {
> > __u32 irqchip;
> > diff --git a/tools/testing/selftests/kvm/Makefile.kvm
> > b/tools/testing/selftests/kvm/Makefile.kvm
> > index fdec90e85467..7ab470981c31 100644
> > --- a/tools/testing/selftests/kvm/Makefile.kvm
> > +++ b/tools/testing/selftests/kvm/Makefile.kvm
> > @@ -64,6 +64,7 @@ TEST_GEN_PROGS_COMMON +=
> kvm_binary_stats_test
> > TEST_GEN_PROGS_COMMON += kvm_create_max_vcpus
> > TEST_GEN_PROGS_COMMON += kvm_page_table_test
> > TEST_GEN_PROGS_COMMON += set_memory_region_test
> > +TEST_GEN_PROGS_COMMON += ioeventfd_test
> >
> > # Compiled test targets
> > TEST_GEN_PROGS_x86 = $(TEST_GEN_PROGS_COMMON)
> > diff --git a/tools/testing/selftests/kvm/ioeventfd_test.c
> > b/tools/testing/selftests/kvm/ioeventfd_test.c
> > new file mode 100644
> > index 000000000000..24875a2562d4
> > --- /dev/null
> > +++ b/tools/testing/selftests/kvm/ioeventfd_test.c
> > @@ -0,0 +1,624 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * ioeventfd_test.c - Tests for KVM_IOEVENTFD_FLAG_POST_WRITE.
> > + *
> > + * Tests that when KVM_IOEVENTFD_FLAG_POST_WRITE is set and the
> > MMIO/PIO
> > + * address is written to, the value is copied to the user-provided address
> > + * and the eventfd is signaled. Also tests negative cases and interactions
> > + * with DATAMATCH.
> > + *
> > + * Copyright Nutanix, 2026
> > + *
> > + * Author: Thanos Makatos <thanos.makatos@nutanix.com>
> > + */
> > +
> > +#include <errno.h>
> > +#include <poll.h>
> > +#include <string.h>
> > +
> > +#include "kvm_util.h"
> > +#include "processor.h"
> > +#include "ucall_common.h"
> > +
> > +#define MMIO_GPA (1UL << 30)
> > +#define PIO_PORT 0xe000
> > +#define TEST_VAL 0xDEADBEEFCAFEBABEULL
> > +#define MATCH_VAL 0x42U
> > +#define NOMATCH_VAL (MATCH_VAL + 1)
> > +#define POISON_VAL 0xFFFFFFFFU
> > +
> > +/*
> > + * Check that the most recent vCPU exit is a ucall (delivered as
> KVM_EXIT_IO
> > + * on x86) matching @expected_cmd. The caller must have already called
> > + * vcpu_run().
> > + *
> > + * @expected_cmd: UCALL_SYNC, UCALL_DONE, etc.
> > + * @expected_stage: for UCALL_SYNC, the stage number passed by
> > GUEST_SYNC().
> > + * Ignored for other ucall types.
> > + *
> > + * Aborts the test on UCALL_ABORT (a guest-side assertion failure).
> > + */
> > +static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_cmd,
> > + uint64_t expected_stage)
> > +{
> > + struct ucall uc;
> > +
> > + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> > +
> > + switch (get_ucall(vcpu, &uc)) {
> > + case UCALL_ABORT:
> > + REPORT_GUEST_ASSERT(uc);
> > + break;
> > + case UCALL_SYNC:
> > + TEST_ASSERT(expected_cmd == UCALL_SYNC,
> > + "got UCALL_SYNC, expected %lu",
> > + expected_cmd);
> > + TEST_ASSERT(uc.args[1] == expected_stage,
> > + "expected stage %lu, got %lu",
> > + expected_stage, uc.args[1]);
> > + break;
> > + case UCALL_DONE:
> > + TEST_ASSERT(expected_cmd == UCALL_DONE,
> > + "got UCALL_DONE, expected %lu",
> > + expected_cmd);
> > + break;
> > + default:
> > + TEST_FAIL("unexpected ucall %lu", uc.cmd);
> > + }
> > +}
> > +
> > +/*
> > + * Verify that KVM_IOEVENTFD rejects invalid POST_WRITE configurations:
> > + * - len=0: the kernel needs a non-zero length to know how many bytes to
> > copy.
> > + * - post_addr=NULL: there is no destination for the copy.
> > + * - post_addr outside the process address space: access_ok() rejects it.
> > + * All three must fail with EINVAL.
> > + */
> > +static void test_post_write_negative(void)
> > +{
> > + struct kvm_ioeventfd ioeventfd;
> > + struct kvm_vm *vm;
> > + uint64_t dummy;
> > + int ret;
> > + int fd;
> > +
> > + vm = vm_create_barebones();
> > + fd = kvm_new_eventfd();
> > +
> > + /* length cannot be zero */
> > + ioeventfd = (struct kvm_ioeventfd) {
> > + .addr = MMIO_GPA,
> > + .len = 0,
> > + .fd = fd,
> > + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
> > + .post_addr = (u64)&dummy,
> > + };
> > + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> > + TEST_ASSERT(ret && errno == EINVAL,
> > + "len=0: expected EINVAL, got ret=%d errno=%d", ret,
> errno);
> > +
> > + /* post_addr cannot be NULL */
> > + ioeventfd.len = 4;
> > + ioeventfd.post_addr = 0ULL;
> > + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> > + TEST_ASSERT(ret && errno == EINVAL,
> > + "NULL post_addr: expected EINVAL, got ret=%d errno=%d",
> > + ret, errno);
> > +
> > + /* bogus post_addr */
> > + ioeventfd.post_addr = (u64)0xdeaddeaddeaddeadULL;
> > + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> > + TEST_ASSERT(ret && errno == EINVAL,
> > + "bad post_addr: expected EINVAL, got ret=%d errno=%d",
> > + ret, errno);
> > +
> > + close(fd);
> > + kvm_vm_free(vm);
> > +}
> > +
> > +#define DEFINE_GUEST_WRITE_FN(suffix, type) \
> > +static void guest_code_w##suffix(void) { \
> > + *(volatile type *)MMIO_GPA = (type)TEST_VAL; \
> > + GUEST_DONE(); \
> > +}
> > +
> > +DEFINE_GUEST_WRITE_FN(1, uint8_t)
> > +DEFINE_GUEST_WRITE_FN(2, uint16_t)
> > +DEFINE_GUEST_WRITE_FN(4, uint32_t)
> > +DEFINE_GUEST_WRITE_FN(8, uint64_t)
> > +
> > +/*
> > + * Verify that ioeventfd_write copies exactly @width bytes to post_addr for
> > + * each supported MMIO write width (1, 2, 4, 8). The guest writes the low
> > + * @width bytes of TEST_VAL; the host checks that exactly those bytes land
> > + * at post_addr and the eventfd is signaled.
> > + */
> > +static void test_post_write_width(int width, void (*guest_fn)(void))
> > +{
> > + uint64_t actual, expected, count;
> > + struct kvm_ioeventfd ioeventfd;
> > + struct kvm_vcpu *vcpu;
> > + struct kvm_vm *vm;
> > + int fd, ret;
> > +
> > + /* need to initialize to 0 because the guest writes the low @width
> > bytes */
> > + actual = 0;
> > + expected = 0;
> > +
> > + vm = vm_create_with_one_vcpu(&vcpu, guest_fn);
> > + virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
> > +
> > + fd = kvm_new_eventfd();
> > +
> > + ioeventfd = (struct kvm_ioeventfd) {
> > + .addr = MMIO_GPA,
> > + .len = width,
> > + .fd = fd,
> > + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
> > + .post_addr = (u64)&actual,
> > + };
> > +
> > + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> > + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> > +
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_DONE, 0);
> > +
> > + ret = read(fd, &count, sizeof(count));
> > + TEST_ASSERT(ret == sizeof(count),
> > + "eventfd read failed: ret=%d errno=%d", ret, errno);
> > +
> > + memcpy(&expected, &(uint64_t){TEST_VAL}, width);
> > + TEST_ASSERT_EQ(actual, expected);
> > +
> > + close(fd);
> > + kvm_vm_free(vm);
> > +}
> > +
> > +static void guest_code_datamatch(void)
> > +{
> > + *(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
> > + GUEST_SYNC(1);
> > + *(volatile uint32_t *)MMIO_GPA = NOMATCH_VAL;
> > + GUEST_SYNC(2);
> > + GUEST_DONE();
> > +}
> > +
> > +/*
> > + * Test the interaction between DATAMATCH and POST_WRITE. When both
> > flags are
> > + * set, ioeventfd_write should only fire (signal eventfd + copy value) when
> the
> > + * written value matches datamatch. A non-matching write must leave the
> > eventfd
> > + * unsignaled and post_addr untouched, and fall through to
> > KVM_EXIT_MMIO.
> > + */
> > +static void test_post_write_datamatch(void)
> > +{
> > + struct kvm_ioeventfd ioeventfd;
> > + struct kvm_vcpu *vcpu;
> > + struct kvm_run *run;
> > + struct kvm_vm *vm;
> > + struct pollfd pfd;
> > + uint64_t count;
> > + uint32_t actual;
> > + int fd, ret;
> > +
> > + actual = POISON_VAL;
> > +
> > + vm = vm_create_with_one_vcpu(&vcpu, guest_code_datamatch);
> > + virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
> > + run = vcpu->run;
> > +
> > + fd = kvm_new_eventfd();
> > + pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
> > +
> > + ioeventfd = (struct kvm_ioeventfd) {
> > + .datamatch = MATCH_VAL,
> > + .addr = MMIO_GPA,
> > + .len = 4,
> > + .fd = fd,
> > + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
> > + KVM_IOEVENTFD_FLAG_DATAMATCH,
> > + .post_addr = (u64)&actual,
> > + };
> > +
> > + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> > + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> > +
> > + /*
> > + * Guest writes MATCH_VAL → ioeventfd fires (value copied, eventfd
> > + * signaled), vCPU continues, then GUEST_SYNC(1).
> > + */
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_SYNC, 1);
> > + TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
> > + "eventfd read failed: errno=%d", errno);
> > + TEST_ASSERT_EQ(actual, MATCH_VAL);
> > +
> > + actual = POISON_VAL;
> > +
> > + /*
> > + * Guest writes NOMATCH_VAL → ioeventfd_in_range() returns false,
> > bus
> > + * returns -EOPNOTSUPP → KVM_EXIT_MMIO to userspace.
> > + */
> > + vcpu_run(vcpu);
> > + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO);
> > + TEST_ASSERT(run->mmio.is_write, "expected MMIO write");
> > + TEST_ASSERT(run->mmio.phys_addr == MMIO_GPA,
> > + "expected MMIO at 0x%lx, got 0x%llx",
> > + MMIO_GPA, run->mmio.phys_addr);
> > +
> > + /* Re-enter: KVM completes the MMIO, guest runs to
> > GUEST_SYNC(2). */
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_SYNC, 2);
> > +
> > + TEST_ASSERT(poll(&pfd, 1, 0) == 0,
> > + "eventfd should not be signaled after non-matching write");
> > + TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
> > +
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_DONE, 0);
> > +
> > + close(fd);
> > + kvm_vm_free(vm);
> > +}
> > +
> > +static void guest_code_multi(void)
> > +{
> > + *(volatile uint32_t *)MMIO_GPA = 0x11111111;
> > + GUEST_SYNC(1);
> > + *(volatile uint32_t *)MMIO_GPA = 0x22222222;
> > + GUEST_SYNC(2);
> > + *(volatile uint32_t *)MMIO_GPA = 0x33333333;
> > + GUEST_SYNC(3);
> > + GUEST_DONE();
> > +}
> > +
> > +/*
> > + * Verify that post_addr is updated on every MMIO write, not just the first.
> > + * The guest writes three distinct values in sequence; the host checks after
> > + * each one that post_addr holds the latest value and the eventfd is
> signaled
> > + * each time.
> > + */
> > +static void test_post_write_multi(void)
> > +{
> > + static const uint32_t expected[] = {
> > + 0x11111111, 0x22222222, 0x33333333,
> > + };
> > + struct kvm_ioeventfd ioeventfd;
> > + struct kvm_vcpu *vcpu;
> > + struct kvm_vm *vm;
> > + uint64_t count;
> > + uint32_t actual;
> > + int fd, ret, i;
> > +
> > + actual = POISON_VAL;
> > +
> > + vm = vm_create_with_one_vcpu(&vcpu, guest_code_multi);
> > + virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
> > +
> > + fd = kvm_new_eventfd();
> > +
> > + ioeventfd = (struct kvm_ioeventfd) {
> > + .addr = MMIO_GPA,
> > + .len = 4,
> > + .fd = fd,
> > + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
> > + .post_addr = (u64)&actual,
> > + };
> > +
> > + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> > + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> > +
> > + for (i = 0; i < ARRAY_SIZE(expected); i++) {
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_SYNC, i + 1);
> > + TEST_ASSERT(read(fd, &count, sizeof(count)) ==
> > sizeof(count),
> > + "eventfd read failed: errno=%d", errno);
> > + TEST_ASSERT_EQ(actual, expected[i]);
> > + }
> > +
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_DONE, 0);
> > +
> > + close(fd);
> > + kvm_vm_free(vm);
> > +}
> > +
> > +static void guest_code_multi_nosync(void)
> > +{
> > + *(volatile uint32_t *)MMIO_GPA = 0x11111111;
> > + *(volatile uint32_t *)MMIO_GPA = 0x22222222;
> > + *(volatile uint32_t *)MMIO_GPA = 0x33333333;
> > + GUEST_DONE();
> > +}
> > +
> > +/*
> > + * Variant of the multi-write test where the guest performs three
> consecutive
> > + * MMIO writes with no GUEST_SYNC in between. All three are handled
> in-
> > kernel
> > + * by ioeventfd before the vCPU exits at GUEST_DONE. Verify that:
> > + * - post_addr reflects the last written value (0x33333333).
> > + * - A single eventfd read() returns a counter of 3 (one signal per write).
> > + */
> > +static void test_post_write_multi_nosync(void)
> > +{
> > + struct kvm_ioeventfd ioeventfd;
> > + struct kvm_vcpu *vcpu;
> > + struct kvm_vm *vm;
> > + uint64_t count;
> > + uint32_t actual;
> > + int fd, ret;
> > +
> > + actual = POISON_VAL;
> > +
> > + vm = vm_create_with_one_vcpu(&vcpu, guest_code_multi_nosync);
> > + virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
> > +
> > + fd = kvm_new_eventfd();
> > +
> > + ioeventfd = (struct kvm_ioeventfd) {
> > + .addr = MMIO_GPA,
> > + .len = 4,
> > + .fd = fd,
> > + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
> > + .post_addr = (u64)&actual,
> > + };
> > +
> > + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> > + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> > +
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_DONE, 0);
> > +
> > + ret = read(fd, &count, sizeof(count));
> > + TEST_ASSERT(ret == sizeof(count),
> > + "eventfd read failed: ret=%d errno=%d", ret, errno);
> > + TEST_ASSERT_EQ(count, (uint64_t)3);
> > + TEST_ASSERT_EQ(actual, (uint32_t)0x33333333);
> > +
> > + close(fd);
> > + kvm_vm_free(vm);
> > +}
> > +
> > +static void guest_code_deassign(void)
> > +{
> > + *(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
> > + GUEST_SYNC(1);
> > + *(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
> > + GUEST_DONE();
> > +}
> > +
> > +/*
> > + * Verify that deassigning an ioeventfd with POST_WRITE fully removes it
> > from
> > + * the I/O bus.
> > + */
> > +static void test_post_write_deassign(void)
> > +{
> > + struct kvm_ioeventfd ioeventfd;
> > + struct kvm_vcpu *vcpu;
> > + struct kvm_run *run;
> > + struct kvm_vm *vm;
> > + struct pollfd pfd;
> > + uint64_t count;
> > + uint32_t actual;
> > + int fd, ret;
> > +
> > + actual = POISON_VAL;
> > +
> > + vm = vm_create_with_one_vcpu(&vcpu, guest_code_deassign);
> > + virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
> > + run = vcpu->run;
> > +
> > + fd = kvm_new_eventfd();
> > + pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
> > +
> > + ioeventfd = (struct kvm_ioeventfd) {
> > + .addr = MMIO_GPA,
> > + .len = 4,
> > + .fd = fd,
> > + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
> > + .post_addr = (u64)&actual,
> > + };
> > +
> > + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> > + TEST_ASSERT(!ret, "KVM_IOEVENTFD assign failed: %s",
> > strerror(errno));
> > +
> > + /*
> > + * Guest writes MATCH_VAL → ioeventfd fires, then GUEST_SYNC(1).
> > + */
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_SYNC, 1);
> > + TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
> > + "eventfd read failed: errno=%d", errno);
> > + TEST_ASSERT_EQ(actual, MATCH_VAL);
> > +
> > + /* Deassign the ioeventfd. */
> > + ioeventfd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
> > + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> > + TEST_ASSERT(!ret, "KVM_IOEVENTFD deassign failed: %s",
> > strerror(errno));
> > +
> > + actual = POISON_VAL;
> > +
> > + /*
> > + * Guest writes MATCH_VAL again → no handler on the bus →
> > + * KVM_EXIT_MMIO to userspace.
> > + */
> > + vcpu_run(vcpu);
> > + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO);
> > + TEST_ASSERT(run->mmio.is_write, "expected MMIO write");
> > + TEST_ASSERT(run->mmio.phys_addr == MMIO_GPA,
> > + "expected MMIO at 0x%lx, got 0x%llx",
> > + MMIO_GPA, run->mmio.phys_addr);
> > +
> > + /* Re-enter: KVM completes MMIO, guest runs to GUEST_DONE. */
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_DONE, 0);
> > +
> > + TEST_ASSERT(poll(&pfd, 1, 0) == 0,
> > + "eventfd should not be signaled after deassign");
> > + TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
> > +
> > + close(fd);
> > + kvm_vm_free(vm);
> > +}
> > +
> > +#ifdef __x86_64__
> > +static void guest_code_pio(void)
> > +{
> > + outl(PIO_PORT, (uint32_t)TEST_VAL);
> > + GUEST_DONE();
> > +}
> > +
> > +/*
> > + * Verify that POST_WRITE works on the PIO bus (KVM_PIO_BUS), not just
> > MMIO.
> > + * The guest does an outl to PIO_PORT; the host checks that the written
> value
> > + * is copied to post_addr and the eventfd is signaled.
> > + */
> > +static void test_post_write_pio(void)
> > +{
> > + struct kvm_ioeventfd ioeventfd;
> > + struct kvm_vcpu *vcpu;
> > + struct kvm_vm *vm;
> > + uint64_t count;
> > + uint32_t actual;
> > + int fd, ret;
> > +
> > + actual = POISON_VAL;
> > +
> > + vm = vm_create_with_one_vcpu(&vcpu, guest_code_pio);
> > +
> > + fd = kvm_new_eventfd();
> > +
> > + ioeventfd = (struct kvm_ioeventfd) {
> > + .addr = PIO_PORT,
> > + .len = 4,
> > + .fd = fd,
> > + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
> > + KVM_IOEVENTFD_FLAG_PIO,
> > + .post_addr = (u64)&actual,
> > + };
> > +
> > + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> > + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> > +
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_DONE, 0);
> > +
> > + ret = read(fd, &count, sizeof(count));
> > + TEST_ASSERT(ret == sizeof(count),
> > + "eventfd read failed: ret=%d errno=%d", ret, errno);
> > +
> > + TEST_ASSERT_EQ(actual, (uint32_t)TEST_VAL);
> > +
> > + close(fd);
> > + kvm_vm_free(vm);
> > +}
> > +
> > +static void guest_code_pio_datamatch(void)
> > +{
> > + outl(PIO_PORT, MATCH_VAL);
> > + GUEST_SYNC(1);
> > + outl(PIO_PORT, NOMATCH_VAL);
> > + GUEST_SYNC(2);
> > + GUEST_DONE();
> > +}
> > +
> > +/*
> > + * Test POST_WRITE + PIO + DATAMATCH together. When all three flags are
> > set,
> > + * the ioeventfd should only fire when the outl value matches datamatch.
> > + * A non-matching outl must fall through to KVM_EXIT_IO (port I/O exit),
> > + * leaving the eventfd unsignaled and post_addr untouched.
> > + */
> > +static void test_post_write_pio_datamatch(void)
> > +{
> > + struct kvm_ioeventfd ioeventfd;
> > + struct kvm_vcpu *vcpu;
> > + struct kvm_run *run;
> > + struct kvm_vm *vm;
> > + struct pollfd pfd;
> > + uint64_t count;
> > + uint32_t actual;
> > + int fd, ret;
> > +
> > + actual = POISON_VAL;
> > +
> > + vm = vm_create_with_one_vcpu(&vcpu, guest_code_pio_datamatch);
> > + run = vcpu->run;
> > +
> > + fd = kvm_new_eventfd();
> > + pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
> > +
> > + ioeventfd = (struct kvm_ioeventfd) {
> > + .datamatch = MATCH_VAL,
> > + .addr = PIO_PORT,
> > + .len = 4,
> > + .fd = fd,
> > + .flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
> > + KVM_IOEVENTFD_FLAG_PIO |
> > + KVM_IOEVENTFD_FLAG_DATAMATCH,
> > + .post_addr = (u64)&actual,
> > + };
> > +
> > + ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
> > + TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
> > +
> > + /*
> > + * Guest does outl MATCH_VAL → ioeventfd fires, then
> > GUEST_SYNC(1).
> > + */
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_SYNC, 1);
> > + TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
> > + "eventfd read failed: errno=%d", errno);
> > + TEST_ASSERT_EQ(actual, MATCH_VAL);
> > +
> > + actual = POISON_VAL;
> > +
> > + /*
> > + * Guest does outl NOMATCH_VAL → no match → KVM_EXIT_IO.
> > + */
> > + vcpu_run(vcpu);
> > + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> > + TEST_ASSERT(run->io.direction == KVM_EXIT_IO_OUT,
> > + "expected PIO write");
> > + TEST_ASSERT(run->io.port == PIO_PORT,
> > + "expected PIO at 0x%x, got 0x%x",
> > + PIO_PORT, run->io.port);
> > +
> > + /* Re-enter: guest continues to GUEST_SYNC(2). */
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_SYNC, 2);
> > +
> > + TEST_ASSERT(poll(&pfd, 1, 0) == 0,
> > + "eventfd should not be signaled after non-matching PIO
> > write");
> > + TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
> > +
> > + /* GUEST_DONE */
> > + vcpu_run(vcpu);
> > + assert_ucall(vcpu, UCALL_DONE, 0);
> > +
> > + close(fd);
> > + kvm_vm_free(vm);
> > +}
> > +#endif
> > +
> > +int main(void)
> > +{
> > +
> > TEST_REQUIRE(kvm_check_cap(KVM_CAP_IOEVENTFD_POST_WRITE
> > ));
> > +
> > + test_post_write_negative();
> > +
> > + test_post_write_width(1, guest_code_w1);
> > + test_post_write_width(2, guest_code_w2);
> > + test_post_write_width(4, guest_code_w4);
> > + test_post_write_width(8, guest_code_w8);
> > +
> > + test_post_write_datamatch();
> > + test_post_write_multi();
> > + test_post_write_multi_nosync();
> > + test_post_write_deassign();
> > +
> > +#ifdef __x86_64__
> > + test_post_write_pio();
> > + test_post_write_pio_datamatch();
> > +#endif
> > +
> > + return 0;
> > +}
> > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > index 0e8b8a2c5b79..22bc49a41503 100644
> > --- a/virt/kvm/eventfd.c
> > +++ b/virt/kvm/eventfd.c
> > @@ -741,6 +741,7 @@ struct _ioeventfd {
> > struct kvm_io_device dev;
> > u8 bus_idx;
> > bool wildcard;
> > + void __user *post_addr;
> > };
> >
> > static inline struct _ioeventfd *
> > @@ -812,6 +813,9 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct
> > kvm_io_device *this, gpa_t addr,
> > if (!ioeventfd_in_range(p, addr, len, val))
> > return -EOPNOTSUPP;
> >
> > + if (p->post_addr && len > 0 && __copy_to_user(p->post_addr, val,
> > len))
> > + return -EFAULT;
> > +
> > eventfd_signal(p->eventfd);
> > return 0;
> > }
> > @@ -866,6 +870,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm
> *kvm,
> > {
> >
> > struct eventfd_ctx *eventfd;
> > + void __user *post_addr;
> > struct _ioeventfd *p;
> > int ret;
> >
> > @@ -873,6 +878,16 @@ static int kvm_assign_ioeventfd_idx(struct kvm
> > *kvm,
> > if (IS_ERR(eventfd))
> > return PTR_ERR(eventfd);
> >
> > + post_addr = u64_to_user_ptr(args->post_addr);
> > + if ((args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) &&
> > + (!args->len || !post_addr ||
> > + args->post_addr != untagged_addr(args->post_addr) ||
> > + !access_ok(post_addr, args->len))) {
> > + /* In KVM's ABI, post_addr must be non-NULL. */
> > + ret = -EINVAL;
> > + goto fail;
> > + }
> > +
> > p = kzalloc_obj(*p, GFP_KERNEL_ACCOUNT);
> > if (!p) {
> > ret = -ENOMEM;
> > @@ -891,6 +906,9 @@ static int kvm_assign_ioeventfd_idx(struct kvm
> *kvm,
> > else
> > p->wildcard = true;
> >
> > + if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE)
> > + p->post_addr = post_addr;
> > +
> > mutex_lock(&kvm->slots_lock);
> >
> > /* Verify that there isn't a match already */
> > @@ -942,6 +960,11 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm,
> > enum kvm_bus bus_idx,
> > mutex_lock(&kvm->slots_lock);
> >
> > list_for_each_entry(p, &kvm->ioeventfds, list) {
> > + /*
> > + * No need to match post_addr, ioeventfd_check_collision
> > + * prevents duplicate registrations that only differ by
> > + * post_addr.
> > + */
> > if (p->bus_idx != bus_idx ||
> > p->eventfd != eventfd ||
> > p->addr != args->addr ||
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 1bc1da66b4b0..02abca5c49df 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -4883,6 +4883,7 @@ static int
> > kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
> > case KVM_CAP_IRQFD:
> > #endif
> > case KVM_CAP_IOEVENTFD_ANY_LENGTH:
> > + case KVM_CAP_IOEVENTFD_POST_WRITE:
> > case KVM_CAP_CHECK_EXTENSION_VM:
> > case KVM_CAP_ENABLE_CAP_VM:
> > case KVM_CAP_HALT_POLL:
> > --
> > 2.47.3
^ permalink raw reply [flat|nested] 12+ messages in thread

* Re: [PATCH] KVM: optionally post write on ioeventfd write
2026-04-10 19:11 ` Thanos Makatos
@ 2026-04-21 14:45 ` Sean Christopherson
0 siblings, 0 replies; 12+ messages in thread
From: Sean Christopherson @ 2026-04-21 14:45 UTC (permalink / raw)
To: Thanos Makatos; +Cc: Paolo Bonzini, John Levon, kvm@vger.kernel.org
On Fri, Apr 10, 2026, Thanos Makatos wrote:
> Hi Paolo, Sean,
>
> Following up on this, please let me know whether this is something I
> should revise/resend.
No action needed on your end. More discussion is needed to make sure this is
uAPI that we want to support long term, but we're not waiting on you for anything.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [RFC PATCH] KVM: optionally commit write on ioeventfd write
@ 2026-01-13 20:00 Sean Christopherson
2026-03-02 12:28 ` [PATCH] KVM: optionally post " Thanos Makatos
0 siblings, 1 reply; 12+ messages in thread
From: Sean Christopherson @ 2026-01-13 20:00 UTC (permalink / raw)
To: Thanos Makatos
Cc: kvm@vger.kernel.org, John Levon, mst@redhat.com,
dinechin@redhat.com, cohuck@redhat.com, jasowang@redhat.com,
stefanha@redhat.com, jag.raman@oracle.com, eafanasova@gmail.com,
elena.ufimtseva@oracle.com, Paolo Bonzini
On Tue, Jan 13, 2026, Thanos Makatos wrote:
> > +Paolo (just realized Paolo isn't on the Cc)
> >
> > On Wed, Dec 03, 2025, Thanos Makatos wrote:
> > > > From: Sean Christopherson <seanjc@google.com>
> > > > Side topic, Paolo had an off-the-cuff idea of adding uAPI to support
> > > > notifications on memslot ranges, as opposed to posting writes via
> > > > ioeventfd. E.g. add a memslot flag, or maybe a memory attribute, that
> > > > causes KVM to write-protect a region, emulate in response to writes,
> > > > and then notify an eventfd after emulating the write. It'd be a lot
> > > > like KVM_MEM_READONLY, except that KVM would commit the write to
> > > > memory and notify, as opposed to exiting to userspace.
> > >
> > > Are you thinking for reusing/adapting the mechanism in this patch for that?
> >
> > Paolo's idea was to forego this patch entirely and instead add a more
> > generic write-notify mechanism. In practice, the only real difference is
> > that the writes would be fully in-place instead of a redirection, which in
> > turn would allow the guest to read without triggering a VM-Exit, and I
> > suppose might save userspace from some dirty logging operations.
> >
> > While I really like the mechanics of the idea, after sketching out the
> > basic gist (see below), I'm not convinced the additional complexity is
> > worth the gains. Unless reading from NVMe submission queues is a common
> > operation, it doesn't seem like eliding VM-Exits on reads buys much.
> >
> > Every arch would need to be updated to handle the new way of handling
> > emulated writes, with varying degrees of complexity. E.g. on x86 I think
> > it would just be teaching the MMU about the new "emulate on write"
> > behavior, but for arm64 (and presumably any other architecture without a
> > generic emulator), it would be that plus new code to actually commit the
> > write to guest memory.
> >
> > The other scary aspect is correctly handling "writable from KVM" and "can't
> > be mapped writable". Getting that correct in all places is non-trivial,
> > and seems like it could be a pain to maintain, which potentially fatal
> > failure modes, e.g. if KVM writes guest memory but fails to notify,
> > tracking down the bug would be "fun".
> >
> > So my vote is to add POST_WRITE functionality to I/O eventfd, and hold off
> > on a generic write-notify mechanism until there's a (really) strong use
> > case.
> >
> > Paolo, thoughts?
>
> In the absence of a response, shall we go ahead with POST_WRITE? I have the
> revised patch ready.
Ya, fire away.
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH] KVM: optionally post write on ioeventfd write
2026-01-13 20:00 [RFC PATCH] KVM: optionally commit " Sean Christopherson
@ 2026-03-02 12:28 ` Thanos Makatos
2026-03-05 1:26 ` Sean Christopherson
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Thanos Makatos @ 2026-03-02 12:28 UTC (permalink / raw)
To: seanjc@google.com
Cc: pbonzini@redhat.com, John Levon, kvm@vger.kernel.org,
Thanos Makatos
This patch is a slightly different take on the ioregionfd mechanism
previously described here:
https://lore.kernel.org/all/88ca79d2e378dcbfb3988b562ad2c16c4f929ac7.camel@gmail.com/
The goal of this new mechanism is to speed up doorbell writes on NVMe
controllers emulated outside of the VMM. Currently, a doorbell write to
an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the
VMM communicating the event, along with the doorbell value, to the NVMe
controller emulation task. With the shadow ioeventfd, the NVMe
emulation task is directly notified of the doorbell write and can find
the doorbell value in a known location, without the interference of the
VMM.
Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
---
include/uapi/linux/kvm.h | 11 ++++++++++-
tools/include/uapi/linux/kvm.h | 2 ++
virt/kvm/eventfd.c | 32 ++++++++++++++++++++++++++++++--
3 files changed, 42 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 65500f5db379..f3ff559de60d 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -639,6 +639,7 @@ enum {
kvm_ioeventfd_flag_nr_deassign,
kvm_ioeventfd_flag_nr_virtio_ccw_notify,
kvm_ioeventfd_flag_nr_fast_mmio,
+ kvm_ioevetnfd_flag_nr_post_write,
kvm_ioeventfd_flag_nr_max,
};
@@ -648,6 +649,12 @@ enum {
#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
(1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
+/*
+ * KVM does not provide any guarantees regarding read-after-write ordering for
+ * such updates.
+ */
+#define KVM_IOEVENTFD_FLAG_POST_WRITE (1 << kvm_ioevetnfd_flag_nr_post_write)
+
#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
struct kvm_ioeventfd {
@@ -656,8 +663,10 @@ struct kvm_ioeventfd {
__u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
__s32 fd;
__u32 flags;
- __u8 pad[36];
+ void __user *post_addr; /* address to write to if POST_WRITE is set */
+ __u8 pad[24];
};
+_Static_assert(sizeof(struct kvm_ioeventfd) == 1 << 6, "bad size");
#define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
#define KVM_X86_DISABLE_EXITS_HLT (1 << 1)
diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
index dddb781b0507..1fb481c90b57 100644
--- a/tools/include/uapi/linux/kvm.h
+++ b/tools/include/uapi/linux/kvm.h
@@ -629,6 +629,7 @@ enum {
kvm_ioeventfd_flag_nr_deassign,
kvm_ioeventfd_flag_nr_virtio_ccw_notify,
kvm_ioeventfd_flag_nr_fast_mmio,
+ kvm_ioevetnfd_flag_nr_commit_write,
kvm_ioeventfd_flag_nr_max,
};
@@ -637,6 +638,7 @@ enum {
#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign)
#define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
(1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
+#define KVM_IOEVENTFD_FLAG_COMMIT_WRITE (1 << kvm_ioevetnfd_flag_nr_commit_write)
#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 0e8b8a2c5b79..019cf3606aef 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -741,6 +741,7 @@ struct _ioeventfd {
struct kvm_io_device dev;
u8 bus_idx;
bool wildcard;
+ void __user *post_addr;
};
static inline struct _ioeventfd *
@@ -812,6 +813,9 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
if (!ioeventfd_in_range(p, addr, len, val))
return -EOPNOTSUPP;
+ if (p->post_addr && len > 0 && __copy_to_user(p->post_addr, val, len))
+ return -EFAULT;
+
eventfd_signal(p->eventfd);
return 0;
}
@@ -879,6 +883,27 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
goto fail;
}
+ if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) {
+ /*
+ * Although a NULL pointer it technically valid for userspace, it's
+ * unlikely that any use case actually cares.
+ */
+ if (!args->len || !args->post_addr ||
+ args->post_addr != untagged_addr(args->post_addr) ||
+ !access_ok((void __user *)(unsigned long)args->post_addr, args->len)) {
+ ret = -EINVAL;
+ goto free_fail;
+ }
+ p->post_addr = args->post_addr;
+ } else if (!args->post_addr) {
+ /*
+ * Ensure that post_addr isn't set without POST_WRITE to avoid accidental
+ * userspace errors.
+ */
+ ret = -EINVAL;
+ goto free_fail;
+ }
+
INIT_LIST_HEAD(&p->list);
p->addr = args->addr;
p->bus_idx = bus_idx;
@@ -915,8 +940,8 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
unlock_fail:
mutex_unlock(&kvm->slots_lock);
+free_fail:
kfree(p);
-
fail:
eventfd_ctx_put(eventfd);
@@ -932,12 +957,14 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_bus *bus;
int ret = -ENOENT;
bool wildcard;
+ void __user *post_addr;
eventfd = eventfd_ctx_fdget(args->fd);
if (IS_ERR(eventfd))
return PTR_ERR(eventfd);
wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
+ post_addr = args->post_addr;
mutex_lock(&kvm->slots_lock);
@@ -946,7 +973,8 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
p->eventfd != eventfd ||
p->addr != args->addr ||
p->length != args->len ||
- p->wildcard != wildcard)
+ p->wildcard != wildcard ||
+ p->post_addr != post_addr)
continue;
if (!p->wildcard && p->datamatch != args->datamatch)
--
2.47.3
^ permalink raw reply related [flat|nested] 12+ messages in thread* Re: [PATCH] KVM: optionally post write on ioeventfd write
2026-03-02 12:28 ` [PATCH] KVM: optionally post " Thanos Makatos
@ 2026-03-05 1:26 ` Sean Christopherson
2026-03-06 11:14 ` Thanos Makatos
2026-03-05 1:49 ` kernel test robot
2026-03-05 9:39 ` kernel test robot
2 siblings, 1 reply; 12+ messages in thread
From: Sean Christopherson @ 2026-03-05 1:26 UTC (permalink / raw)
To: Thanos Makatos; +Cc: pbonzini@redhat.com, John Levon, kvm@vger.kernel.org
Please don't send patches in-reply to the previous version(s), it tends to mess
up b4.
On Mon, Mar 02, 2026, Thanos Makatos wrote:
> This patch is a slightly different take on the ioregionfd mechanism
> previously described here:
> https://lore.kernel.org/all/88ca79d2e378dcbfb3988b562ad2c16c4f929ac7.camel@gmail.com/
>
> The goal of this new mechanism is to speed up doorbell writes on NVMe
> controllers emulated outside of the VMM. Currently, a doorbell write to
> an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the
> VMM communicating the event, along with the doorbell value, to the NVMe
> controller emulation task. With the shadow ioeventfd, the NVMe
> emulation task is directly notified of the doorbell write and can find
> the doorbell value in a known location, without the interference of the
> VMM.
Please add a KVM selftest to verify this works, and to verify that KVM rejects
bad configurations.
> Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
> ---
> include/uapi/linux/kvm.h | 11 ++++++++++-
> tools/include/uapi/linux/kvm.h | 2 ++
> virt/kvm/eventfd.c | 32 ++++++++++++++++++++++++++++++--
> 3 files changed, 42 insertions(+), 3 deletions(-)
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 65500f5db379..f3ff559de60d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -639,6 +639,7 @@ enum {
> kvm_ioeventfd_flag_nr_deassign,
> kvm_ioeventfd_flag_nr_virtio_ccw_notify,
> kvm_ioeventfd_flag_nr_fast_mmio,
> + kvm_ioevetnfd_flag_nr_post_write,
> kvm_ioeventfd_flag_nr_max,
> };
>
> @@ -648,6 +649,12 @@ enum {
> #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
> (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
>
> +/*
> + * KVM does not provide any guarantees regarding read-after-write ordering for
> + * such updates.
Please document this (and more) in Documentation/virt/kvm/api.rst, not here.
> + */
> +#define KVM_IOEVENTFD_FLAG_POST_WRITE (1 << kvm_ioevetnfd_flag_nr_post_write)
> +
> #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
>
> struct kvm_ioeventfd {
> @@ -656,8 +663,10 @@ struct kvm_ioeventfd {
> __u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
> __s32 fd;
> __u32 flags;
> - __u8 pad[36];
> + void __user *post_addr; /* address to write to if POST_WRITE is set */
> + __u8 pad[24];
> };
> +_Static_assert(sizeof(struct kvm_ioeventfd) == 1 << 6, "bad size");
>
> #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
> #define KVM_X86_DISABLE_EXITS_HLT (1 << 1)
> diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
> index dddb781b0507..1fb481c90b57 100644
> --- a/tools/include/uapi/linux/kvm.h
> +++ b/tools/include/uapi/linux/kvm.h
Don't bother updating tools, the copy of uapi headers in tools is maintained by
the perf folks (perf-the-tool needs all of the headers, nothing else does).
> @@ -629,6 +629,7 @@ enum {
> kvm_ioeventfd_flag_nr_deassign,
> kvm_ioeventfd_flag_nr_virtio_ccw_notify,
> kvm_ioeventfd_flag_nr_fast_mmio,
> + kvm_ioevetnfd_flag_nr_commit_write,
Then you won't have amusing mistakes like this :-)
> kvm_ioeventfd_flag_nr_max,
> };
>
> @@ -637,6 +638,7 @@ enum {
> #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign)
> #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
> (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
> +#define KVM_IOEVENTFD_FLAG_COMMIT_WRITE (1 << kvm_ioevetnfd_flag_nr_commit_write)
>
> #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
>
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 0e8b8a2c5b79..019cf3606aef 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -741,6 +741,7 @@ struct _ioeventfd {
> struct kvm_io_device dev;
> u8 bus_idx;
> bool wildcard;
> + void __user *post_addr;
> };
>
> static inline struct _ioeventfd *
> @@ -812,6 +813,9 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
> if (!ioeventfd_in_range(p, addr, len, val))
> return -EOPNOTSUPP;
>
> + if (p->post_addr && len > 0 && __copy_to_user(p->post_addr, val, len))
> + return -EFAULT;
> +
> eventfd_signal(p->eventfd);
> return 0;
> }
> @@ -879,6 +883,27 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
> goto fail;
> }
>
> + if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) {
> + /*
> + * Although a NULL pointer it technically valid for userspace, it's
> + * unlikely that any use case actually cares.
This is fine for a changelog, but for a code comment, simply state that KVM's ABI
is that NULL is disallowed.
> + */
> + if (!args->len || !args->post_addr ||
> + args->post_addr != untagged_addr(args->post_addr) ||
> + !access_ok((void __user *)(unsigned long)args->post_addr, args->len)) {
Align indentation. And use u64_to_user_ptr().
> + ret = -EINVAL;
> + goto free_fail;
This is rather silly. Put the checks before allocating. Then the post-alloc
code can simply be:
if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE)
p->post_addr = args->post_addr;
I.e. you're burning more code to try and save code. E.g.
if ((args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) &&
(!args->len || !args->post_addr ||
args->post_addr != untagged_addr(args->post_addr) ||
!access_ok(u64_to_user_ptr(args->post_addr), args->len)))
return -EINVAL;
p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
if (!p) {
ret = -ENOMEM;
goto fail;
}
INIT_LIST_HEAD(&p->list);
p->addr = args->addr;
p->bus_idx = bus_idx;
p->length = args->len;
p->eventfd = eventfd;
/* The datamatch feature is optional, otherwise this is a wildcard */
if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
p->datamatch = args->datamatch;
else
p->wildcard = true;
if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE)
p->post_addr = args->post_addr;
> + }
> + p->post_addr = args->post_addr;
> + } else if (!args->post_addr) {
This isn't a valid check. KVM didn't/doesn't require args->pad to be zero, so
it would be entirely legal for existing userspace to pass in a non-zero value and
expect success. If this added truly meaningful value, then maybe it would be
worth risking breakage, but in this case trying to help userspace is more likely
to do harm than good.
> + /*
> + * Ensure that post_addr isn't set without POST_WRITE to avoid accidental
Wrap at 80 since the comment carries over to a new line anyways. But as above,
it's moot.
^ permalink raw reply [flat|nested] 12+ messages in thread* RE: [PATCH] KVM: optionally post write on ioeventfd write
2026-03-05 1:26 ` Sean Christopherson
@ 2026-03-06 11:14 ` Thanos Makatos
0 siblings, 0 replies; 12+ messages in thread
From: Thanos Makatos @ 2026-03-06 11:14 UTC (permalink / raw)
To: Sean Christopherson; +Cc: pbonzini@redhat.com, John Levon, kvm@vger.kernel.org
> -----Original Message-----
> From: Sean Christopherson <seanjc@google.com>
> Sent: 05 March 2026 01:27
> To: Thanos Makatos <thanos.makatos@nutanix.com>
> Cc: pbonzini@redhat.com; John Levon <john.levon@nutanix.com>;
> kvm@vger.kernel.org
> Subject: Re: [PATCH] KVM: optionally post write on ioeventfd write
>
> !-------------------------------------------------------------------|
> CAUTION: External Email
>
> |-------------------------------------------------------------------!
>
> Please don't send patches in-reply to the previous version(s), it tends to mess
> up b4.
>
> On Mon, Mar 02, 2026, Thanos Makatos wrote:
> > This patch is a slightly different take on the ioregionfd mechanism
> > previously described here:
> > https://urldefense.proofpoint.com/v2/url?u=https-
> 3A__lore.kernel.org_all_88ca79d2e378dcbfb3988b562ad2c16c4f929ac7.ca
> mel-
> 40gmail.com_&d=DwIBAg&c=s883GpUCOChKOHiocYtGcg&r=XTpYsh5Ps2zJvt
> w6ogtti46atk736SI4vgsJiUKIyDE&m=XKZUVVKO9SGqV_txMzP2_tgrfJrgB2lU
> 50rbshSY1i91mYQgU2LKO23a_If0S6GB&s=kp06dnwO7ESRSZ1iL_VQw0yKD
> OOED0L4jHbNjj4FqgI&e=
> >
> > The goal of this new mechanism is to speed up doorbell writes on NVMe
> > controllers emulated outside of the VMM. Currently, a doorbell write to
> > an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the
> > VMM communicating the event, along with the doorbell value, to the NVMe
> > controller emulation task. With the shadow ioeventfd, the NVMe
> > emulation task is directly notified of the doorbell write and can find
> > the doorbell value in a known location, without the interference of the
> > VMM.
>
> Please add a KVM selftest to verify this works, and to verify that KVM rejects
> bad configurations.
Ack
>
> > Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
> > ---
> > include/uapi/linux/kvm.h | 11 ++++++++++-
> > tools/include/uapi/linux/kvm.h | 2 ++
> > virt/kvm/eventfd.c | 32 ++++++++++++++++++++++++++++++--
> > 3 files changed, 42 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index 65500f5db379..f3ff559de60d 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -639,6 +639,7 @@ enum {
> > kvm_ioeventfd_flag_nr_deassign,
> > kvm_ioeventfd_flag_nr_virtio_ccw_notify,
> > kvm_ioeventfd_flag_nr_fast_mmio,
> > + kvm_ioevetnfd_flag_nr_post_write,
> > kvm_ioeventfd_flag_nr_max,
> > };
> >
> > @@ -648,6 +649,12 @@ enum {
> > #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
> > (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
> >
> > +/*
> > + * KVM does not provide any guarantees regarding read-after-write
> ordering for
> > + * such updates.
>
> Please document this (and more) in Documentation/virt/kvm/api.rst, not
> here.
Ack
>
> > + */
> > +#define KVM_IOEVENTFD_FLAG_POST_WRITE (1 <<
> kvm_ioevetnfd_flag_nr_post_write)
> > +
> > #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 <<
> kvm_ioeventfd_flag_nr_max) - 1)
> >
> > struct kvm_ioeventfd {
> > @@ -656,8 +663,10 @@ struct kvm_ioeventfd {
> > __u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
> > __s32 fd;
> > __u32 flags;
> > - __u8 pad[36];
> > + void __user *post_addr; /* address to write to if POST_WRITE is set */
> > + __u8 pad[24];
> > };
> > +_Static_assert(sizeof(struct kvm_ioeventfd) == 1 << 6, "bad size");
> >
> > #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
> > #define KVM_X86_DISABLE_EXITS_HLT (1 << 1)
> > diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
> > index dddb781b0507..1fb481c90b57 100644
> > --- a/tools/include/uapi/linux/kvm.h
> > +++ b/tools/include/uapi/linux/kvm.h
>
> Don't bother updating tools, the copy of uapi headers in tools is maintained by
> the perf folks (perf-the-tool needs all of the headers, nothing else does).
>
> > @@ -629,6 +629,7 @@ enum {
> > kvm_ioeventfd_flag_nr_deassign,
> > kvm_ioeventfd_flag_nr_virtio_ccw_notify,
> > kvm_ioeventfd_flag_nr_fast_mmio,
> > + kvm_ioevetnfd_flag_nr_commit_write,
>
> Then you won't have amusing mistakes like this :-)
Ack
>
> > kvm_ioeventfd_flag_nr_max,
> > };
> >
> > @@ -637,6 +638,7 @@ enum {
> > #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 <<
> kvm_ioeventfd_flag_nr_deassign)
> > #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
> > (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
> > +#define KVM_IOEVENTFD_FLAG_COMMIT_WRITE (1 <<
> kvm_ioevetnfd_flag_nr_commit_write)
> >
> > #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 <<
> kvm_ioeventfd_flag_nr_max) - 1)
> >
> > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > index 0e8b8a2c5b79..019cf3606aef 100644
> > --- a/virt/kvm/eventfd.c
> > +++ b/virt/kvm/eventfd.c
> > @@ -741,6 +741,7 @@ struct _ioeventfd {
> > struct kvm_io_device dev;
> > u8 bus_idx;
> > bool wildcard;
> > + void __user *post_addr;
> > };
> >
> > static inline struct _ioeventfd *
> > @@ -812,6 +813,9 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct
> kvm_io_device *this, gpa_t addr,
> > if (!ioeventfd_in_range(p, addr, len, val))
> > return -EOPNOTSUPP;
> >
> > + if (p->post_addr && len > 0 && __copy_to_user(p->post_addr, val,
> len))
> > + return -EFAULT;
> > +
> > eventfd_signal(p->eventfd);
> > return 0;
> > }
> > @@ -879,6 +883,27 @@ static int kvm_assign_ioeventfd_idx(struct kvm
> *kvm,
> > goto fail;
> > }
> >
> > + if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) {
> > + /*
> > + * Although a NULL pointer it technically valid for userspace,
> it's
> > + * unlikely that any use case actually cares.
>
> This is fine for a changelog, but for a code comment, simply state that KVM's
> ABI
> is that NULL is disallowed.
Ack
>
> > + */
> > + if (!args->len || !args->post_addr ||
> > + args->post_addr != untagged_addr(args->post_addr)
> ||
> > + !access_ok((void __user *)(unsigned long)args-
> >post_addr, args->len)) {
>
> Align indentation. And use u64_to_user_ptr().
>
> > + ret = -EINVAL;
> > + goto free_fail;
>
> This is rather silly. Put the checks before allocating. Then the post-alloc
> code can simply be:
>
> if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE)
> p->post_addr = args->post_addr;
>
> I.e. your burning more code to try and save code. E.g.
>
> if ((args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) &&
> (!args->len || !args->post_addr ||
> args->post_addr != untagged_addr(args->post_addr) ||
> !access_ok(u64_to_user_ptr(args->post_addr), args->len)))
> return -EINVAL;
>
> p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
> if (!p) {
> ret = -ENOMEM;
> goto fail;
> }
>
> INIT_LIST_HEAD(&p->list);
> p->addr = args->addr;
> p->bus_idx = bus_idx;
> p->length = args->len;
> p->eventfd = eventfd;
>
> /* The datamatch feature is optional, otherwise this is a wildcard */
> if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
> p->datamatch = args->datamatch;
> else
> p->wildcard = true;
>
> if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE)
> p->post_addr = args->post_addr;
Ack
>
>
> > + }
> > + p->post_addr = args->post_addr;
> > + } else if (!args->post_addr) {
>
> This isn't a valid check. KVM didn't/doesn't require args->pad to be zero, so
> it would be entirely legal for existing userspace to pass in a non-zero value and
> expect success. If this added truly meaningful value, then maybe it would be
> worth risking breakage, but in this case trying to help userspace is more likely
> to do harm than good.
Ack
>
> > + /*
> > + * Ensure that post_addr isn't set without POST_WRITE to
> avoid accidental
>
> Wrap at 80 since the comment carries over to a new line anyways. But as
> above,
> it's moot.
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] KVM: optionally post write on ioeventfd write
2026-03-02 12:28 ` [PATCH] KVM: optionally post " Thanos Makatos
2026-03-05 1:26 ` Sean Christopherson
@ 2026-03-05 1:49 ` kernel test robot
2026-03-05 9:39 ` kernel test robot
2 siblings, 0 replies; 12+ messages in thread
From: kernel test robot @ 2026-03-05 1:49 UTC (permalink / raw)
To: Thanos Makatos, seanjc@google.com
Cc: oe-kbuild-all, pbonzini@redhat.com, John Levon,
kvm@vger.kernel.org, Thanos Makatos
Hi Thanos,
kernel test robot noticed the following build errors:
[auto build test ERROR on kvm/queue]
[also build test ERROR on kvm/next linus/master v7.0-rc2 next-20260304]
[cannot apply to kvm/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Thanos-Makatos/KVM-optionally-post-write-on-ioeventfd-write/20260302-204031
base: https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link: https://lore.kernel.org/r/20260302122826.2572-1-thanos.makatos%40nutanix.com
patch subject: [PATCH] KVM: optionally post write on ioeventfd write
config: i386-allnoconfig (https://download.01.org/0day-ci/archive/20260305/202603050920.Lmf80GaE-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260305/202603050920.Lmf80GaE-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603050920.Lmf80GaE-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from include/linux/kvm_host.h:40,
from arch/x86/events/intel/core.c:17:
>> include/uapi/linux/kvm.h:669:1: error: static assertion failed: "bad size"
669 | _Static_assert(sizeof(struct kvm_ioeventfd) == 1 << 6, "bad size");
| ^~~~~~~~~~~~~~
vim +669 include/uapi/linux/kvm.h
659
660 struct kvm_ioeventfd {
661 __u64 datamatch;
662 __u64 addr; /* legal pio/mmio address */
663 __u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
664 __s32 fd;
665 __u32 flags;
666 void __user *post_addr; /* address to write to if POST_WRITE is set */
667 __u8 pad[24];
668 };
> 669 _Static_assert(sizeof(struct kvm_ioeventfd) == 1 << 6, "bad size");
670
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 12+ messages in thread* Re: [PATCH] KVM: optionally post write on ioeventfd write
2026-03-02 12:28 ` [PATCH] KVM: optionally post " Thanos Makatos
2026-03-05 1:26 ` Sean Christopherson
2026-03-05 1:49 ` kernel test robot
@ 2026-03-05 9:39 ` kernel test robot
2 siblings, 0 replies; 12+ messages in thread
From: kernel test robot @ 2026-03-05 9:39 UTC (permalink / raw)
To: Thanos Makatos, seanjc@google.com
Cc: oe-kbuild-all, pbonzini@redhat.com, John Levon,
kvm@vger.kernel.org, Thanos Makatos
Hi Thanos,
kernel test robot noticed the following build errors:
[auto build test ERROR on kvm/queue]
[also build test ERROR on kvm/next linus/master v7.0-rc2 next-20260304]
[cannot apply to kvm/linux-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Thanos-Makatos/KVM-optionally-post-write-on-ioeventfd-write/20260302-204031
base: https://git.kernel.org/pub/scm/virt/kvm/kvm.git queue
patch link: https://lore.kernel.org/r/20260302122826.2572-1-thanos.makatos%40nutanix.com
patch subject: [PATCH] KVM: optionally post write on ioeventfd write
config: i386-randconfig-141-20260305 (https://download.01.org/0day-ci/archive/20260305/202603051704.nmQyEAnO-lkp@intel.com/config)
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
smatch: v0.5.0-9004-gb810ac53
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260305/202603051704.nmQyEAnO-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202603051704.nmQyEAnO-lkp@intel.com/
All errors (new ones prefixed by >>):
In file included from arch/x86/events/intel/core.c:17:
In file included from include/linux/kvm_host.h:40:
>> include/uapi/linux/kvm.h:669:16: error: static assertion failed due to requirement 'sizeof(struct kvm_ioeventfd) == 1 << 6': bad size
669 | _Static_assert(sizeof(struct kvm_ioeventfd) == 1 << 6, "bad size");
| ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/uapi/linux/kvm.h:669:45: note: expression evaluates to '56 == 64'
669 | _Static_assert(sizeof(struct kvm_ioeventfd) == 1 << 6, "bad size");
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~
1 error generated.
vim +669 include/uapi/linux/kvm.h
659
660 struct kvm_ioeventfd {
661 __u64 datamatch;
662 __u64 addr; /* legal pio/mmio address */
663 __u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
664 __s32 fd;
665 __u32 flags;
666 void __user *post_addr; /* address to write to if POST_WRITE is set */
667 __u8 pad[24];
668 };
> 669 _Static_assert(sizeof(struct kvm_ioeventfd) == 1 << 6, "bad size");
670
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2026-04-21 14:45 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-06 12:56 [PATCH] KVM: optionally post write on ioeventfd write Thanos Makatos
2026-03-12 15:02 ` David Woodhouse
2026-03-12 16:12 ` Thanos Makatos
2026-04-21 14:44 ` Sean Christopherson
2026-03-23 15:01 ` Thanos Makatos
2026-04-10 19:11 ` Thanos Makatos
2026-04-21 14:45 ` Sean Christopherson
-- strict thread matches above, loose matches on Subject: below --
2026-01-13 20:00 [RFC PATCH] KVM: optionally commit " Sean Christopherson
2026-03-02 12:28 ` [PATCH] KVM: optionally post " Thanos Makatos
2026-03-05 1:26 ` Sean Christopherson
2026-03-06 11:14 ` Thanos Makatos
2026-03-05 1:49 ` kernel test robot
2026-03-05 9:39 ` kernel test robot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox