From: Sean Christopherson <seanjc@google.com>
To: Thanos Makatos <thanos.makatos@nutanix.com>
Cc: "pbonzini@redhat.com" <pbonzini@redhat.com>,
John Levon <john.levon@nutanix.com>,
"kvm@vger.kernel.org" <kvm@vger.kernel.org>
Subject: Re: [PATCH] KVM: optionally post write on ioeventfd write
Date: Wed, 4 Mar 2026 17:26:46 -0800 [thread overview]
Message-ID: <aajb1r7Al7mxK5Zf@google.com> (raw)
In-Reply-To: <20260302122826.2572-1-thanos.makatos@nutanix.com>
Please don't send patches in-reply to the previous version(s), it tends to mess
up b4.
On Mon, Mar 02, 2026, Thanos Makatos wrote:
> This patch is a slightly different take on the ioregionfd mechanism
> previously described here:
> https://lore.kernel.org/all/88ca79d2e378dcbfb3988b562ad2c16c4f929ac7.camel@gmail.com/
>
> The goal of this new mechanism is to speed up doorbell writes on NVMe
> controllers emulated outside of the VMM. Currently, a doorbell write to
> an NVMe SQ tail doorbell requires returning from ioctl(KVM_RUN) and the
> VMM communicating the event, along with the doorbell value, to the NVMe
> controller emulation task. With the shadow ioeventfd, the NVMe
> emulation task is directly notified of the doorbell write and can find
> the doorbell value in a known location, without the interference of the
> VMM.
Please add a KVM selftest to verify this works, and to verify that KVM rejects
bad configurations.
> Signed-off-by: Thanos Makatos <thanos.makatos@nutanix.com>
> ---
> include/uapi/linux/kvm.h | 11 ++++++++++-
> tools/include/uapi/linux/kvm.h | 2 ++
> virt/kvm/eventfd.c | 32 ++++++++++++++++++++++++++++++--
> 3 files changed, 42 insertions(+), 3 deletions(-)
>
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 65500f5db379..f3ff559de60d 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -639,6 +639,7 @@ enum {
> kvm_ioeventfd_flag_nr_deassign,
> kvm_ioeventfd_flag_nr_virtio_ccw_notify,
> kvm_ioeventfd_flag_nr_fast_mmio,
> + kvm_ioevetnfd_flag_nr_post_write,
> kvm_ioeventfd_flag_nr_max,
> };
>
> @@ -648,6 +649,12 @@ enum {
> #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
> (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
>
> +/*
> + * KVM does not provide any guarantees regarding read-after-write ordering for
> + * such updates.
Please document this (and more) in Documentation/virt/kvm/api.rst, not here.
> + */
> +#define KVM_IOEVENTFD_FLAG_POST_WRITE (1 << kvm_ioevetnfd_flag_nr_post_write)
> +
> #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
>
> struct kvm_ioeventfd {
> @@ -656,8 +663,10 @@ struct kvm_ioeventfd {
> __u32 len; /* 1, 2, 4, or 8 bytes; or 0 to ignore length */
> __s32 fd;
> __u32 flags;
> - __u8 pad[36];
> + void __user *post_addr; /* address to write to if POST_WRITE is set */
> + __u8 pad[24];
> };
> +_Static_assert(sizeof(struct kvm_ioeventfd) == 1 << 6, "bad size");
>
> #define KVM_X86_DISABLE_EXITS_MWAIT (1 << 0)
> #define KVM_X86_DISABLE_EXITS_HLT (1 << 1)
> diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h
> index dddb781b0507..1fb481c90b57 100644
> --- a/tools/include/uapi/linux/kvm.h
> +++ b/tools/include/uapi/linux/kvm.h
Don't bother updating tools, the copy of uapi headers in tools is maintained by
the perf folks (perf-the-tool needs all of the headers, nothing else does).
> @@ -629,6 +629,7 @@ enum {
> kvm_ioeventfd_flag_nr_deassign,
> kvm_ioeventfd_flag_nr_virtio_ccw_notify,
> kvm_ioeventfd_flag_nr_fast_mmio,
> + kvm_ioevetnfd_flag_nr_commit_write,
Then you won't have amusing mistakes like this :-)
> kvm_ioeventfd_flag_nr_max,
> };
>
> @@ -637,6 +638,7 @@ enum {
> #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign)
> #define KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY \
> (1 << kvm_ioeventfd_flag_nr_virtio_ccw_notify)
> +#define KVM_IOEVENTFD_FLAG_COMMIT_WRITE (1 << kvm_ioevetnfd_flag_nr_commit_write)
>
> #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
>
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 0e8b8a2c5b79..019cf3606aef 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -741,6 +741,7 @@ struct _ioeventfd {
> struct kvm_io_device dev;
> u8 bus_idx;
> bool wildcard;
> + void __user *post_addr;
> };
>
> static inline struct _ioeventfd *
> @@ -812,6 +813,9 @@ ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
> if (!ioeventfd_in_range(p, addr, len, val))
> return -EOPNOTSUPP;
>
> + if (p->post_addr && len > 0 && __copy_to_user(p->post_addr, val, len))
> + return -EFAULT;
> +
> eventfd_signal(p->eventfd);
> return 0;
> }
> @@ -879,6 +883,27 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
> goto fail;
> }
>
> + if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) {
> + /*
> + * Although a NULL pointer it technically valid for userspace, it's
> + * unlikely that any use case actually cares.
This is fine for a changelog, but for a code comment, simply state that KVM's ABI
is that NULL is disallowed.
> + */
> + if (!args->len || !args->post_addr ||
> + args->post_addr != untagged_addr(args->post_addr) ||
> + !access_ok((void __user *)(unsigned long)args->post_addr, args->len)) {
Align indentation. And use u64_to_user_ptr().
> + ret = -EINVAL;
> + goto free_fail;
This is rather silly. Put the checks before allocating. Then the post-alloc
code can simply be:
if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE)
p->post_addr = args->post_addr;
I.e. your burning more code to try and save code. E.g.
if ((args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE) &&
(!args->len || !args->post_addr ||
args->post_addr != untagged_addr(args->post_addr) ||
!access_ok(u64_to_user_ptr(args->post_addr), args->len)))
return -EINVAL;
p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
if (!p) {
ret = -ENOMEM;
goto fail;
}
INIT_LIST_HEAD(&p->list);
p->addr = args->addr;
p->bus_idx = bus_idx;
p->length = args->len;
p->eventfd = eventfd;
/* The datamatch feature is optional, otherwise this is a wildcard */
if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
p->datamatch = args->datamatch;
else
p->wildcard = true;
if (args->flags & KVM_IOEVENTFD_FLAG_POST_WRITE)
p->post_addr = args->post_addr;
> + }
> + p->post_addr = args->post_addr;
> + } else if (!args->post_addr) {
This isn't a valid check. KVM didn't/doesn't require args->pad to be zero, so
it would be entirely legal for existing userspace to pass in a non-zero value and
expect success. If this added truly meaningful value, then maybe it would be
worth risking breakage, but in this case trying to help userspace is more likely
to do harm than good.
> + /*
> + * Ensure that post_addr isn't set without POST_WRITE to avoid accidental
Wrap at 80 since the comment carries over to a new line anyways. But as above,
it's moot.
next prev parent reply other threads:[~2026-03-05 1:26 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-10-05 21:15 [RFC PATCH] KVM: optionally commit write on ioeventfd write Thanos Makatos
2025-09-05 14:09 ` Sean Christopherson
2025-12-02 9:15 ` Thanos Makatos
2025-12-02 22:53 ` Sean Christopherson
2025-12-03 14:25 ` Thanos Makatos
2025-12-15 22:42 ` Thanos Makatos
2025-12-19 1:20 ` Sean Christopherson
2025-12-19 10:36 ` John Levon
2026-01-13 19:43 ` Thanos Makatos
2026-01-13 20:00 ` Sean Christopherson
2026-03-02 12:28 ` [PATCH] KVM: optionally post " Thanos Makatos
2026-03-05 1:26 ` Sean Christopherson [this message]
2026-03-06 11:14 ` Thanos Makatos
2026-03-05 1:49 ` kernel test robot
2026-03-05 9:39 ` kernel test robot
-- strict thread matches above, loose matches on Subject: below --
2026-03-06 12:56 Thanos Makatos
2026-03-12 15:02 ` David Woodhouse
2026-03-12 16:12 ` Thanos Makatos
2026-03-23 15:01 ` Thanos Makatos
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=aajb1r7Al7mxK5Zf@google.com \
--to=seanjc@google.com \
--cc=john.levon@nutanix.com \
--cc=kvm@vger.kernel.org \
--cc=pbonzini@redhat.com \
--cc=thanos.makatos@nutanix.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox