public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] KVM: optionally post write on ioeventfd write
@ 2026-03-06 12:56 Thanos Makatos
  2026-03-12 15:02 ` David Woodhouse
  2026-03-23 15:01 ` Thanos Makatos
  0 siblings, 2 replies; 12+ messages in thread
From: Thanos Makatos @ 2026-03-06 12:56 UTC (permalink / raw)
  To: seanjc@google.com
  Cc: pbonzini@redhat.com, John Levon, kvm@vger.kernel.org,
	Thanos Makatos

Add a new flag, KVM_IOEVENTFD_FLAG_POST_WRITE, when assigning an
ioeventfd that results in the value written by the guest to be copied
to user-supplied memory instead of being discarded.

The goal of this new mechanism is to speed up doorbell writes on NVMe
controllers emulated outside of the VMM. Currently, a doorbell write to
an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the
VMM communicating the event, along with the doorbell value, to the NVMe
controller emulation task.  With POST_WRITE, the NVMe emulation task is
directly notified of the doorbell write and can find the doorbell value
in a known location, without involving the VMM.

Add tests for this new functionality.

LLM (claude-4.6-opus-high) was used mainly for the tests and to a
lesser extent for pre-reviewing this patch.

Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
---
 Documentation/virt/kvm/api.rst               |  13 +-
 include/uapi/linux/kvm.h                     |   6 +-
 tools/testing/selftests/kvm/Makefile.kvm     |   1 +
 tools/testing/selftests/kvm/ioeventfd_test.c | 624 +++++++++++++++++++
 virt/kvm/eventfd.c                           |  23 +
 virt/kvm/kvm_main.c                          |   1 +
 6 files changed, 666 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/kvm/ioeventfd_test.c

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 6f85e1b321dd..b8d030f03101 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -2109,7 +2109,8 @@ provided event instead of triggering an exit.
 	__u32 len;         /* 0, 1, 2, 4, or 8 bytes    */
 	__s32 fd;
 	__u32 flags;
-	__u8  pad[36];
+	__aligned_u64 post_addr; /* address to write to if POST_WRITE is set */
+	__u8  pad[24];
   };
 
 For the special case of virtio-ccw devices on s390, the ioevent is matched
@@ -2122,6 +2123,7 @@ The following flags are defined::
   #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
   #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
 	(1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
+  #define KVM_IOEVENTFD_FLAG_POST_WRITE (1 << kvm_ioeventfd_flag_nr_post_write)
 
 If datamatch flag is set, the event will be signaled only if the written value
 to the registered address is equal to datamatch in struct kvm_ioeventfd.
@@ -2134,6 +2136,15 @@ the kernel will ignore the length of guest write and may get a faster vmexit.
 The speedup may only apply to specific architectures, but the ioeventfd will
 work anyway.
 
+With KVM_IOEVENTFD_FLAG_POST_WRITE, the value being written is copied to the
+userspace address specified by post_addr, and the eventfd is signaled.  The
+copy is guaranteed to complete before the eventfd is signaled, so a userspace
+reader that wakes on the eventfd will observe the written value.  When multiple
+vCPUs write to the same ioeventfd concurrently, the value at post_addr reflects
+one of the writes. If the copy to post_addr fails (e.g. the memory has been
+unmapped), the eventfd is not signaled and the write is reported to userspace
+as a regular MMIO/PIO exit.
+
 4.60 KVM_DIRTY_TLB
 ------------------
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 65500f5db379..55b8683a856f 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -639,6 +639,7 @@ enum {
 	kvm_ioeventfd_flag_nr_deassign,
 	kvm_ioeventfd_flag_nr_virtio_ccw_notify,
 	kvm_ioeventfd_flag_nr_fast_mmio,
+	kvm_ioeventfd_flag_nr_post_write,
 	kvm_ioeventfd_flag_nr_max,
 };
 
@@ -647,6 +648,7 @@ enum {
 #define KVM_IOEVENTFD_FLAG_DEASSIGN  (1 << kvm_ioeventfd_flag_nr_deassign)
 #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
 	(1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
+#define KVM_IOEVENTFD_FLAG_POST_WRITE (1 << kvm_ioeventfd_flag_nr_post_write)
 
 #define KVM_IOEVENTFD_VALID_FLAG_MASK  ((1 << kvm_ioeventfd_flag_nr_max) - 1)
 
@@ -656,7 +658,8 @@ struct kvm_ioeventfd {
 	__u32 len;         /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
 	__s32 fd;
 	__u32 flags;
-	__u8  pad[36];
+	__aligned_u64 post_addr; /* address to write to if POST_WRITE is set */
+	__u8  pad[24];
 };
 
 #define KVM_X86_DISABLE_EXITS_MWAIT          (1 << 0)
@@ -985,6 +988,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_ARM_SEA_TO_USER 245
 #define KVM_CAP_S390_USER_OPEREXEC 246
 #define KVM_CAP_S390_KEYOP 247
+#define KVM_CAP_IOEVENTFD_POST_WRITE 248
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index fdec90e85467..7ab470981c31 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -64,6 +64,7 @@ TEST_GEN_PROGS_COMMON += kvm_binary_stats_test
 TEST_GEN_PROGS_COMMON += kvm_create_max_vcpus
 TEST_GEN_PROGS_COMMON += kvm_page_table_test
 TEST_GEN_PROGS_COMMON += set_memory_region_test
+TEST_GEN_PROGS_COMMON += ioeventfd_test
 
 # Compiled test targets
 TEST_GEN_PROGS_x86 = $(TEST_GEN_PROGS_COMMON)
diff --git a/tools/testing/selftests/kvm/ioeventfd_test.c b/tools/testing/selftests/kvm/ioeventfd_test.c
new file mode 100644
index 000000000000..24875a2562d4
--- /dev/null
+++ b/tools/testing/selftests/kvm/ioeventfd_test.c
@@ -0,0 +1,624 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ioeventfd_test.c - Tests for KVM_IOEVENTFD_FLAG_POST_WRITE.
+ *
+ * Tests that when KVM_IOEVENTFD_FLAG_POST_WRITE is set and the MMIO/PIO
+ * address is written to, the value is copied to the user-provided address
+ * and the eventfd is signaled.  Also tests negative cases and interactions
+ * with DATAMATCH.
+ *
+ * Copyright Nutanix, 2026
+ *
+ * Author: Thanos Makatos <thanos.makatos@nutanix.com>
+ */
+
+#include <errno.h>
+#include <poll.h>
+#include <string.h>
+
+#include "kvm_util.h"
+#include "processor.h"
+#include "ucall_common.h"
+
+#define MMIO_GPA	(1UL << 30)
+#define PIO_PORT	0xe000
+#define TEST_VAL	0xDEADBEEFCAFEBABEULL
+#define MATCH_VAL	0x42U
+#define NOMATCH_VAL	(MATCH_VAL + 1)
+#define POISON_VAL	0xFFFFFFFFU
+
+/*
+ * Check that the most recent vCPU exit is a ucall (delivered as KVM_EXIT_IO
+ * on x86) matching @expected_cmd.  The caller must have already called
+ * vcpu_run().
+ *
+ * @expected_cmd:   UCALL_SYNC, UCALL_DONE, etc.
+ * @expected_stage: for UCALL_SYNC, the stage number passed by GUEST_SYNC().
+ *                  Ignored for other ucall types.
+ *
+ * Aborts the test on UCALL_ABORT (a guest-side assertion failure).
+ */
+static void assert_ucall(struct kvm_vcpu *vcpu, uint64_t expected_cmd,
+                         uint64_t expected_stage)
+{
+	struct ucall uc;
+
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+
+	switch (get_ucall(vcpu, &uc)) {
+	case UCALL_ABORT:
+		REPORT_GUEST_ASSERT(uc);
+		break;
+	case UCALL_SYNC:
+		TEST_ASSERT(expected_cmd == UCALL_SYNC,
+			    "got UCALL_SYNC, expected %lu",
+			    expected_cmd);
+		TEST_ASSERT(uc.args[1] == expected_stage,
+			    "expected stage %lu, got %lu",
+			    expected_stage, uc.args[1]);
+		break;
+	case UCALL_DONE:
+		TEST_ASSERT(expected_cmd == UCALL_DONE,
+			    "got UCALL_DONE, expected %lu",
+			    expected_cmd);
+		break;
+	default:
+		TEST_FAIL("unexpected ucall %lu", uc.cmd);
+	}
+}
+
+/*
+ * Verify that KVM_IOEVENTFD rejects invalid POST_WRITE configurations:
+ *   - len=0: the kernel needs a non-zero length to know how many bytes to copy.
+ *   - post_addr=NULL: there is no destination for the copy.
+ *   - post_addr outside the process address space: access_ok() rejects it.
+ * All three must fail with EINVAL.
+ */
+static void test_post_write_negative(void)
+{
+	struct kvm_ioeventfd ioeventfd;
+	struct kvm_vm *vm;
+	uint64_t dummy;
+	int ret;
+	int fd;
+
+	vm = vm_create_barebones();
+	fd = kvm_new_eventfd();
+
+	/* length cannot be zero */
+	ioeventfd = (struct kvm_ioeventfd) {
+		.addr = MMIO_GPA,
+		.len = 0,
+		.fd = fd,
+		.flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
+		.post_addr = (u64)&dummy,
+	};
+	ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+	TEST_ASSERT(ret && errno == EINVAL,
+		    "len=0: expected EINVAL, got ret=%d errno=%d", ret, errno);
+
+	/* post_addr cannot be NULL */
+	ioeventfd.len = 4;
+	ioeventfd.post_addr = 0ULL;
+	ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+	TEST_ASSERT(ret && errno == EINVAL,
+		    "NULL post_addr: expected EINVAL, got ret=%d errno=%d",
+		    ret, errno);
+
+	/* bogus post_addr */
+	ioeventfd.post_addr = (u64)0xdeaddeaddeaddeadULL;
+	ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+	TEST_ASSERT(ret && errno == EINVAL,
+		    "bad post_addr: expected EINVAL, got ret=%d errno=%d",
+		    ret, errno);
+
+	close(fd);
+	kvm_vm_free(vm);
+}
+
+#define DEFINE_GUEST_WRITE_FN(suffix, type)      \
+static void guest_code_w##suffix(void) {         \
+	*(volatile type *)MMIO_GPA = (type)TEST_VAL; \
+	GUEST_DONE();                                \
+}
+
+DEFINE_GUEST_WRITE_FN(1, uint8_t)
+DEFINE_GUEST_WRITE_FN(2, uint16_t)
+DEFINE_GUEST_WRITE_FN(4, uint32_t)
+DEFINE_GUEST_WRITE_FN(8, uint64_t)
+
+/*
+ * Verify that ioeventfd_write copies exactly @width bytes to post_addr for
+ * each supported MMIO write width (1, 2, 4, 8).  The guest writes the low
+ * @width bytes of TEST_VAL; the host checks that exactly those bytes land
+ * at post_addr and the eventfd is signaled.
+ */
+static void test_post_write_width(int width, void (*guest_fn)(void))
+{
+	uint64_t actual, expected, count;
+	struct kvm_ioeventfd ioeventfd;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	int fd, ret;
+
+	/* Initialize to 0 because the guest writes only the low @width bytes. */
+	actual = 0;
+	expected = 0;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_fn);
+	virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
+
+	fd = kvm_new_eventfd();
+
+	ioeventfd = (struct kvm_ioeventfd) {
+		.addr = MMIO_GPA,
+		.len = width,
+		.fd = fd,
+		.flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
+		.post_addr = (u64)&actual,
+	};
+
+	ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+	TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_DONE, 0);
+
+	ret = read(fd, &count, sizeof(count));
+	TEST_ASSERT(ret == sizeof(count),
+		    "eventfd read failed: ret=%d errno=%d", ret, errno);
+
+	memcpy(&expected, &(uint64_t){TEST_VAL}, width);
+	TEST_ASSERT_EQ(actual, expected);
+
+	close(fd);
+	kvm_vm_free(vm);
+}
+
+static void guest_code_datamatch(void)
+{
+	*(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
+	GUEST_SYNC(1);
+	*(volatile uint32_t *)MMIO_GPA = NOMATCH_VAL;
+	GUEST_SYNC(2);
+	GUEST_DONE();
+}
+
+/*
+ * Test the interaction between DATAMATCH and POST_WRITE.  When both flags are
+ * set, ioeventfd_write should only fire (signal eventfd + copy value) when the
+ * written value matches datamatch.  A non-matching write must leave the eventfd
+ * unsignaled and post_addr untouched, and fall through to KVM_EXIT_MMIO.
+ */
+static void test_post_write_datamatch(void)
+{
+	struct kvm_ioeventfd ioeventfd;
+	struct kvm_vcpu *vcpu;
+	struct kvm_run *run;
+	struct kvm_vm *vm;
+	struct pollfd pfd;
+	uint64_t count;
+	uint32_t actual;
+	int fd, ret;
+
+	actual = POISON_VAL;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code_datamatch);
+	virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
+	run = vcpu->run;
+
+	fd = kvm_new_eventfd();
+	pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
+
+	ioeventfd = (struct kvm_ioeventfd) {
+		.datamatch = MATCH_VAL,
+		.addr = MMIO_GPA,
+		.len = 4,
+		.fd = fd,
+		.flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
+			 KVM_IOEVENTFD_FLAG_DATAMATCH,
+		.post_addr = (u64)&actual,
+	};
+
+	ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+	TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+	/*
+	 * Guest writes MATCH_VAL -> ioeventfd fires (value copied, eventfd
+	 * signaled), vCPU continues, then GUEST_SYNC(1).
+	 */
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_SYNC, 1);
+	TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
+	            "eventfd read failed: errno=%d", errno);
+	TEST_ASSERT_EQ(actual, MATCH_VAL);
+
+	actual = POISON_VAL;
+
+	/*
+	 * Guest writes NOMATCH_VAL -> ioeventfd_in_range() returns false, bus
+	 * returns -EOPNOTSUPP -> KVM_EXIT_MMIO to userspace.
+	 */
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO);
+	TEST_ASSERT(run->mmio.is_write, "expected MMIO write");
+	TEST_ASSERT(run->mmio.phys_addr == MMIO_GPA,
+	            "expected MMIO at 0x%lx, got 0x%llx",
+	            MMIO_GPA, run->mmio.phys_addr);
+
+	/* Re-enter: KVM completes the MMIO, guest runs to GUEST_SYNC(2). */
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_SYNC, 2);
+
+	TEST_ASSERT(poll(&pfd, 1, 0) == 0,
+	            "eventfd should not be signaled after non-matching write");
+	TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
+
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_DONE, 0);
+
+	close(fd);
+	kvm_vm_free(vm);
+}
+
+static void guest_code_multi(void)
+{
+	*(volatile uint32_t *)MMIO_GPA = 0x11111111;
+	GUEST_SYNC(1);
+	*(volatile uint32_t *)MMIO_GPA = 0x22222222;
+	GUEST_SYNC(2);
+	*(volatile uint32_t *)MMIO_GPA = 0x33333333;
+	GUEST_SYNC(3);
+	GUEST_DONE();
+}
+
+/*
+ * Verify that post_addr is updated on every MMIO write, not just the first.
+ * The guest writes three distinct values in sequence; the host checks after
+ * each one that post_addr holds the latest value and the eventfd is signaled
+ * each time.
+ */
+static void test_post_write_multi(void)
+{
+	static const uint32_t expected[] = {
+		0x11111111, 0x22222222, 0x33333333,
+	};
+	struct kvm_ioeventfd ioeventfd;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	uint64_t count;
+	uint32_t actual;
+	int fd, ret, i;
+
+	actual = POISON_VAL;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code_multi);
+	virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
+
+	fd = kvm_new_eventfd();
+
+	ioeventfd = (struct kvm_ioeventfd) {
+		.addr = MMIO_GPA,
+		.len = 4,
+		.fd = fd,
+		.flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
+		.post_addr = (u64)&actual,
+	};
+
+	ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+	TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+	for (i = 0; i < ARRAY_SIZE(expected); i++) {
+		vcpu_run(vcpu);
+		assert_ucall(vcpu, UCALL_SYNC, i + 1);
+		TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
+		            "eventfd read failed: errno=%d", errno);
+		TEST_ASSERT_EQ(actual, expected[i]);
+	}
+
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_DONE, 0);
+
+	close(fd);
+	kvm_vm_free(vm);
+}
+
+static void guest_code_multi_nosync(void)
+{
+	*(volatile uint32_t *)MMIO_GPA = 0x11111111;
+	*(volatile uint32_t *)MMIO_GPA = 0x22222222;
+	*(volatile uint32_t *)MMIO_GPA = 0x33333333;
+	GUEST_DONE();
+}
+
+/*
+ * Variant of the multi-write test where the guest performs three consecutive
+ * MMIO writes with no GUEST_SYNC in between.  All three are handled in-kernel
+ * by ioeventfd before the vCPU exits at GUEST_DONE.  Verify that:
+ *   - post_addr reflects the last written value (0x33333333).
+ *   - A single eventfd read() returns a counter of 3 (one signal per write).
+ */
+static void test_post_write_multi_nosync(void)
+{
+	struct kvm_ioeventfd ioeventfd;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	uint64_t count;
+	uint32_t actual;
+	int fd, ret;
+
+	actual = POISON_VAL;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code_multi_nosync);
+	virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
+
+	fd = kvm_new_eventfd();
+
+	ioeventfd = (struct kvm_ioeventfd) {
+		.addr = MMIO_GPA,
+		.len = 4,
+		.fd = fd,
+		.flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
+		.post_addr = (u64)&actual,
+	};
+
+	ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+	TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_DONE, 0);
+
+	ret = read(fd, &count, sizeof(count));
+	TEST_ASSERT(ret == sizeof(count),
+		    "eventfd read failed: ret=%d errno=%d", ret, errno);
+	TEST_ASSERT_EQ(count, (uint64_t)3);
+	TEST_ASSERT_EQ(actual, (uint32_t)0x33333333);
+
+	close(fd);
+	kvm_vm_free(vm);
+}
+
+static void guest_code_deassign(void)
+{
+	*(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
+	GUEST_SYNC(1);
+	*(volatile uint32_t *)MMIO_GPA = MATCH_VAL;
+	GUEST_DONE();
+}
+
+/*
+ * Verify that deassigning an ioeventfd with POST_WRITE fully removes it from
+ * the I/O bus.
+ */
+static void test_post_write_deassign(void)
+{
+	struct kvm_ioeventfd ioeventfd;
+	struct kvm_vcpu *vcpu;
+	struct kvm_run *run;
+	struct kvm_vm *vm;
+	struct pollfd pfd;
+	uint64_t count;
+	uint32_t actual;
+	int fd, ret;
+
+	actual = POISON_VAL;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code_deassign);
+	virt_map(vm, MMIO_GPA, MMIO_GPA, 1);
+	run = vcpu->run;
+
+	fd = kvm_new_eventfd();
+	pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
+
+	ioeventfd = (struct kvm_ioeventfd) {
+		.addr = MMIO_GPA,
+		.len = 4,
+		.fd = fd,
+		.flags = KVM_IOEVENTFD_FLAG_POST_WRITE,
+		.post_addr = (u64)&actual,
+	};
+
+	ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+	TEST_ASSERT(!ret, "KVM_IOEVENTFD assign failed: %s", strerror(errno));
+
+	/*
+	 * Guest writes MATCH_VAL -> ioeventfd fires, then GUEST_SYNC(1).
+	 */
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_SYNC, 1);
+	TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
+	            "eventfd read failed: errno=%d", errno);
+	TEST_ASSERT_EQ(actual, MATCH_VAL);
+
+	/* Deassign the ioeventfd. */
+	ioeventfd.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
+	ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+	TEST_ASSERT(!ret, "KVM_IOEVENTFD deassign failed: %s", strerror(errno));
+
+	actual = POISON_VAL;
+
+	/*
+	 * Guest writes MATCH_VAL again -> no handler on the bus ->
+	 * KVM_EXIT_MMIO to userspace.
+	 */
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_MMIO);
+	TEST_ASSERT(run->mmio.is_write, "expected MMIO write");
+	TEST_ASSERT(run->mmio.phys_addr == MMIO_GPA,
+	            "expected MMIO at 0x%lx, got 0x%llx",
+	            MMIO_GPA, run->mmio.phys_addr);
+
+	/* Re-enter: KVM completes MMIO, guest runs to GUEST_DONE. */
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_DONE, 0);
+
+	TEST_ASSERT(poll(&pfd, 1, 0) == 0,
+		    "eventfd should not be signaled after deassign");
+	TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
+
+	close(fd);
+	kvm_vm_free(vm);
+}
+
+#ifdef __x86_64__
+static void guest_code_pio(void)
+{
+	outl(PIO_PORT, (uint32_t)TEST_VAL);
+	GUEST_DONE();
+}
+
+/*
+ * Verify that POST_WRITE works on the PIO bus (KVM_PIO_BUS), not just MMIO.
+ * The guest does an outl to PIO_PORT; the host checks that the written value
+ * is copied to post_addr and the eventfd is signaled.
+ */
+static void test_post_write_pio(void)
+{
+	struct kvm_ioeventfd ioeventfd;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	uint64_t count;
+	uint32_t actual;
+	int fd, ret;
+
+	actual = POISON_VAL;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code_pio);
+
+	fd = kvm_new_eventfd();
+
+	ioeventfd = (struct kvm_ioeventfd) {
+		.addr = PIO_PORT,
+		.len = 4,
+		.fd = fd,
+		.flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
+			 KVM_IOEVENTFD_FLAG_PIO,
+		.post_addr = (u64)&actual,
+	};
+
+	ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+	TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_DONE, 0);
+
+	ret = read(fd, &count, sizeof(count));
+	TEST_ASSERT(ret == sizeof(count),
+	            "eventfd read failed: ret=%d errno=%d", ret, errno);
+
+	TEST_ASSERT_EQ(actual, (uint32_t)TEST_VAL);
+
+	close(fd);
+	kvm_vm_free(vm);
+}
+
+static void guest_code_pio_datamatch(void)
+{
+	outl(PIO_PORT, MATCH_VAL);
+	GUEST_SYNC(1);
+	outl(PIO_PORT, NOMATCH_VAL);
+	GUEST_SYNC(2);
+	GUEST_DONE();
+}
+
+/*
+ * Test POST_WRITE + PIO + DATAMATCH together.  When all three flags are set,
+ * the ioeventfd should only fire when the outl value matches datamatch.
+ * A non-matching outl must fall through to KVM_EXIT_IO (port I/O exit),
+ * leaving the eventfd unsignaled and post_addr untouched.
+ */
+static void test_post_write_pio_datamatch(void)
+{
+	struct kvm_ioeventfd ioeventfd;
+	struct kvm_vcpu *vcpu;
+	struct kvm_run *run;
+	struct kvm_vm *vm;
+	struct pollfd pfd;
+	uint64_t count;
+	uint32_t actual;
+	int fd, ret;
+
+	actual = POISON_VAL;
+
+	vm = vm_create_with_one_vcpu(&vcpu, guest_code_pio_datamatch);
+	run = vcpu->run;
+
+	fd = kvm_new_eventfd();
+	pfd = (struct pollfd){ .fd = fd, .events = POLLIN };
+
+	ioeventfd = (struct kvm_ioeventfd) {
+		.datamatch = MATCH_VAL,
+		.addr = PIO_PORT,
+		.len = 4,
+		.fd = fd,
+		.flags = KVM_IOEVENTFD_FLAG_POST_WRITE |
+		         KVM_IOEVENTFD_FLAG_PIO |
+		         KVM_IOEVENTFD_FLAG_DATAMATCH,
+		.post_addr = (u64)&actual,
+	};
+
+	ret = __vm_ioctl(vm, KVM_IOEVENTFD, &ioeventfd);
+	TEST_ASSERT(!ret, "KVM_IOEVENTFD failed: %s", strerror(errno));
+
+	/*
+	 * Guest does outl MATCH_VAL -> ioeventfd fires, then GUEST_SYNC(1).
+	 */
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_SYNC, 1);
+	TEST_ASSERT(read(fd, &count, sizeof(count)) == sizeof(count),
+	            "eventfd read failed: errno=%d", errno);
+	TEST_ASSERT_EQ(actual, MATCH_VAL);
+
+	actual = POISON_VAL;
+
+	/*
+	 * Guest does outl NOMATCH_VAL -> no match -> KVM_EXIT_IO.
+	 */
+	vcpu_run(vcpu);
+	TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
+	TEST_ASSERT(run->io.direction == KVM_EXIT_IO_OUT,
+	            "expected PIO write");
+	TEST_ASSERT(run->io.port == PIO_PORT,
+	            "expected PIO at 0x%x, got 0x%x",
+	            PIO_PORT, run->io.port);
+
+	/* Re-enter: guest continues to GUEST_SYNC(2). */
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_SYNC, 2);
+
+	TEST_ASSERT(poll(&pfd, 1, 0) == 0,
+		    "eventfd should not be signaled after non-matching PIO write");
+	TEST_ASSERT_EQ(actual, (uint32_t)POISON_VAL);
+
+	/* GUEST_DONE */
+	vcpu_run(vcpu);
+	assert_ucall(vcpu, UCALL_DONE, 0);
+
+	close(fd);
+	kvm_vm_free(vm);
+}
+#endif
+
+int main(void)
+{
+	TEST_REQUIRE(kvm_check_cap(KVM_CAP_IOEVENTFD_POST_WRITE));
+
+	test_post_write_negative();
+
+	test_post_write_width(1, guest_code_w1);
+	test_post_write_width(2, guest_code_w2);
+	test_post_write_width(4, guest_code_w4);
+	test_post_write_width(8, guest_code_w8);
+
+	test_post_write_datamatch();
+	test_post_write_multi();
+	test_post_write_multi_nosync();
+	test_post_write_deassign();
+
+#ifdef __x86_64__
+	test_post_write_pio();
+	test_post_write_pio_datamatch();
+#endif
+
+	return 0;
+}
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 0e8b8a2c5b79..22bc49a41503 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -741,6 +741,7 @@ struct _ioeventfd {
 	struct kvm_io_device dev;
 	u8                   bus_idx;
 	bool                 wildcard;
+	void         __user *post_addr;
 };
 
 static inline struct _ioeventfd *
@@ -812,6 +813,9 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
 	if (!ioeventfd_in_range(p, addr, len, val))
 		return -EOPNOTSUPP;
 
+	if (p->post_addr && len > 0 && __copy_to_user(p->post_addr, val, len))
+		return -EFAULT;
+
 	eventfd_signal(p->eventfd);
 	return 0;
 }
@@ -866,6 +870,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
 {
 
 	struct eventfd_ctx *eventfd;
+	void __user *post_addr;
 	struct _ioeventfd *p;
 	int ret;
 
@@ -873,6 +878,16 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
 	if (IS_ERR(eventfd))
 		return PTR_ERR(eventfd);
 
+	post_addr = u64_to_user_ptr(args->post_addr);
+	if ((args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) &&
+	    (!args->len || !post_addr ||
+	     args->post_addr != untagged_addr(args->post_addr) ||
+	     !access_ok(post_addr, args->len))) {
+		/* In KVM's ABI, post_addr must be non-NULL. */
+		ret = -EINVAL;
+		goto fail;
+	}
+
 	p = kzalloc_obj(*p, GFP_KERNEL_ACCOUNT);
 	if (!p) {
 		ret = -ENOMEM;
@@ -891,6 +906,9 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
 	else
 		p->wildcard = true;
 
+	if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE)
+		p->post_addr = post_addr;
+
 	mutex_lock(&kvm->slots_lock);
 
 	/* Verify that there isn't a match already */
@@ -942,6 +960,11 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
 	mutex_lock(&kvm->slots_lock);
 
 	list_for_each_entry(p, &kvm->ioeventfds, list) {
+		/*
+		 * No need to match post_addr: ioeventfd_check_collision()
+		 * prevents duplicate registrations that differ only in
+		 * post_addr.
+		 */
 		if (p->bus_idx != bus_idx ||
 		    p->eventfd != eventfd  ||
 		    p->addr != args->addr  ||
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1bc1da66b4b0..02abca5c49df 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4883,6 +4883,7 @@ static int kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_IRQFD:
 #endif
 	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
+	case KVM_CAP_IOEVENTFD_POST_WRITE:
 	case KVM_CAP_CHECK_EXTENSION_VM:
 	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_HALT_POLL:
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 12+ messages in thread
* Re: [RFC PATCH] KVM: optionally commit write on ioeventfd write
@ 2026-01-13 20:00 Sean Christopherson
  2026-03-02 12:28 ` [PATCH] KVM: optionally post " Thanos Makatos
  0 siblings, 1 reply; 12+ messages in thread
From: Sean Christopherson @ 2026-01-13 20:00 UTC (permalink / raw)
  To: Thanos Makatos
  Cc: kvm@vger.kernel.org, John Levon, mst@redhat.com,
	dinechin@redhat.com, cohuck@redhat.com, jasowang@redhat.com,
	stefanha@redhat.com, jag.raman@oracle.com, eafanasova@gmail.com,
	elena.ufimtseva@oracle.com, Paolo Bonzini

On Tue, Jan 13, 2026, Thanos Makatos wrote:
> > +Paolo (just realized Paolo isn't on the Cc)
> > 
> > On Wed, Dec 03, 2025, Thanos Makatos wrote:
> > > > From: Sean Christopherson <seanjc@google.com>
> > > > Side topic, Paolo had an off-the-cuff idea of adding uAPI to support
> > > > notifications on memslot ranges, as opposed to posting writes via
> > > > ioeventfd.  E.g. add a memslot flag, or maybe a memory attribute, that
> > > > causes KVM to write-protect a region, emulate in response to writes,
> > > > and then notify an eventfd after emulating the write.  It'd be a lot
> > > > like KVM_MEM_READONLY, except that KVM would commit the write to
> > > > memory and notify, as opposed to exiting to userspace.
> > >
> > > Are you thinking for reusing/adapting the mechanism in this patch for that?
> > 
> > Paolo's idea was to forego this patch entirely and instead add a more
> > generic write-notify mechanism.  In practice, the only real difference is
> > that the writes would be fully in-place instead of a redirection, which in
> > turn would allow the guest to read without triggering a VM-Exit, and I
> > suppose might save userspace from some dirty logging operations.
> > 
> > While I really like the mechanics of the idea, after sketching out the
> > basic gist (see below), I'm not convinced the additional complexity is
> > worth the gains.  Unless reading from NVMe submission queues is a common
> > operation, it doesn't seem like eliding VM-Exits on reads buys much.
> > 
> > Every arch would need to be updated to handle the new way of handling
> > emulated writes, with varying degrees of complexity.  E.g. on x86 I think
> > it would just be teaching the MMU about the new "emulate on write"
> > behavior, but for arm64 (and presumably any other architecture without a
> > generic emulator), it would be that plus new code to actually commit the
> > write to guest memory.
> > 
> > The other scary aspect is correctly handling "writable from KVM" and "can't
> > be mapped writable".  Getting that correct in all places is non-trivial,
> > and seems like it could be a pain to maintain, which potentially fatal
> > failure modes, e.g.  if KVM writes guest memory but fails to notify,
> > tracking down the bug would be "fun".
> > 
> > So my vote is to add POST_WRITE functionality to I/O eventfd, and hold off
> > on a generic write-notify mechanism until there's a (really) strong use
> > case.
> > 
> > Paolo, thoughts?
> 
> In the absence of a response, shall we go ahead with POST_WRITE? I have the
> revised patch ready.

Ya, fire away.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2026-04-21 14:45 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-03-06 12:56 [PATCH] KVM: optionally post write on ioeventfd write Thanos Makatos
2026-03-12 15:02 ` David Woodhouse
2026-03-12 16:12   ` Thanos Makatos
2026-04-21 14:44     ` Sean Christopherson
2026-03-23 15:01 ` Thanos Makatos
2026-04-10 19:11   ` Thanos Makatos
2026-04-21 14:45     ` Sean Christopherson
  -- strict thread matches above, loose matches on Subject: below --
2026-01-13 20:00 [RFC PATCH] KVM: optionally commit " Sean Christopherson
2026-03-02 12:28 ` [PATCH] KVM: optionally post " Thanos Makatos
2026-03-05  1:26   ` Sean Christopherson
2026-03-06 11:14     ` Thanos Makatos
2026-03-05  1:49   ` kernel test robot
2026-03-05  9:39   ` kernel test robot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox