* [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
@ 2011-07-03 17:04 Sasha Levin
2011-07-03 17:16 ` Avi Kivity
2011-07-04 10:32 ` Michael S. Tsirkin
0 siblings, 2 replies; 20+ messages in thread
From: Sasha Levin @ 2011-07-03 17:04 UTC (permalink / raw)
To: kvm
Cc: Sasha Levin, Avi Kivity, Ingo Molnar, Marcelo Tosatti,
Michael S. Tsirkin, Pekka Enberg
The new flag allows passing a write side of a pipe instead of an
eventfd to be notified of writes to the specified memory region.
Instead of signaling an event, the value written to the memory region
is written to the pipe.
Using a pipe instead of an eventfd is usefull when any value can be
written to the memory region but we're interested in recieving the
actual value instead of just a notification.
A simple example for practical use is the serial port. we are not
interested in an exit every time a char is written to the port, but
we do need to know what was written so we could handle it on the guest.
Cc: Avi Kivity <avi@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
include/linux/kvm.h | 2 +
virt/kvm/eventfd.c | 65 +++++++++++++++++++++++++++++++++++---------------
2 files changed, 47 insertions(+), 20 deletions(-)
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 55ef181..548f23a 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -387,12 +387,14 @@ enum {
kvm_ioeventfd_flag_nr_datamatch,
kvm_ioeventfd_flag_nr_pio,
kvm_ioeventfd_flag_nr_deassign,
+ kvm_ioeventfd_flag_nr_pipe,
kvm_ioeventfd_flag_nr_max,
};
#define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
#define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio)
#define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign)
+#define KVM_IOEVENTFD_FLAG_PIPE (1 << kvm_ioeventfd_flag_nr_pipe)
#define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 73358d2..434293e 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -413,10 +413,11 @@ module_exit(irqfd_module_exit);
/*
* --------------------------------------------------------------------
- * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
+ * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal or
+ * a pipe write.
*
- * userspace can register a PIO/MMIO address with an eventfd for receiving
- * notification when the memory has been touched.
+ * userspace can register a PIO/MMIO address with an eventfd or a
+ * pipe for receiving notification when the memory has been touched.
* --------------------------------------------------------------------
*/
@@ -424,6 +425,7 @@ struct _ioeventfd {
struct list_head list;
u64 addr;
int length;
+ struct file *pipe;
struct eventfd_ctx *eventfd;
u64 datamatch;
struct kvm_io_device dev;
@@ -439,7 +441,11 @@ to_ioeventfd(struct kvm_io_device *dev)
static void
ioeventfd_release(struct _ioeventfd *p)
{
- eventfd_ctx_put(p->eventfd);
+ if (p->eventfd)
+ eventfd_ctx_put(p->eventfd);
+ else
+ fput(p->pipe);
+
list_del(&p->list);
kfree(p);
}
@@ -481,6 +487,21 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
return _val == p->datamatch ? true : false;
}
+static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
+ loff_t pos)
+{
+ mm_segment_t old_fs;
+ ssize_t res;
+
+ old_fs = get_fs();
+ set_fs(get_ds());
+ /* The cast to a user pointer is valid due to the set_fs() */
+ res = vfs_write(file, (const char __user *)buf, count, &pos);
+ set_fs(old_fs);
+
+ return res;
+}
+
/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
@@ -491,7 +512,11 @@ ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
if (!ioeventfd_in_range(p, addr, len, val))
return -EOPNOTSUPP;
- eventfd_signal(p->eventfd, 1);
+ if (p->pipe)
+ kernel_write(p->pipe, val, len, 0);
+ else
+ eventfd_signal(p->eventfd, 1);
+
return 0;
}
@@ -533,7 +558,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
struct _ioeventfd *p;
- struct eventfd_ctx *eventfd;
+ struct eventfd_ctx *eventfd = NULL;
int ret;
/* must be natural-word sized */
@@ -555,9 +580,11 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
return -EINVAL;
- eventfd = eventfd_ctx_fdget(args->fd);
- if (IS_ERR(eventfd))
- return PTR_ERR(eventfd);
+ if (!(args->flags & KVM_IOEVENTFD_FLAG_PIPE)) {
+ eventfd = eventfd_ctx_fdget(args->fd);
+ if (IS_ERR(eventfd))
+ return PTR_ERR(eventfd);
+ }
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p) {
@@ -568,7 +595,11 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
INIT_LIST_HEAD(&p->list);
p->addr = args->addr;
p->length = args->len;
- p->eventfd = eventfd;
+
+ if (args->flags & KVM_IOEVENTFD_FLAG_PIPE)
+ p->pipe = fget(args->fd);
+ else
+ p->eventfd = eventfd;
/* The datamatch feature is optional, otherwise this is a wildcard */
if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
@@ -601,7 +632,9 @@ unlock_fail:
fail:
kfree(p);
- eventfd_ctx_put(eventfd);
+
+ if (!(args->flags & KVM_IOEVENTFD_FLAG_PIPE))
+ eventfd_ctx_put(eventfd);
return ret;
}
@@ -612,20 +645,14 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
struct _ioeventfd *p, *tmp;
- struct eventfd_ctx *eventfd;
int ret = -ENOENT;
- eventfd = eventfd_ctx_fdget(args->fd);
- if (IS_ERR(eventfd))
- return PTR_ERR(eventfd);
-
mutex_lock(&kvm->slots_lock);
list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
- if (p->eventfd != eventfd ||
- p->addr != args->addr ||
+ if (p->addr != args->addr ||
p->length != args->len ||
p->wildcard != wildcard)
continue;
@@ -641,8 +668,6 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
mutex_unlock(&kvm->slots_lock);
- eventfd_ctx_put(eventfd);
-
return ret;
}
--
1.7.6
^ permalink raw reply related [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-03 17:04 [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE Sasha Levin
@ 2011-07-03 17:16 ` Avi Kivity
2011-07-03 17:44 ` Sasha Levin
2011-07-04 10:32 ` Michael S. Tsirkin
1 sibling, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2011-07-03 17:16 UTC (permalink / raw)
To: Sasha Levin
Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin,
Pekka Enberg
On 07/03/2011 08:04 PM, Sasha Levin wrote:
> The new flag allows passing a write side of a pipe instead of an
> eventfd to be notified of writes to the specified memory region.
>
> Instead of signaling an event, the value written to the memory region
> is written to the pipe.
>
> Using a pipe instead of an eventfd is usefull when any value can be
> written to the memory region but we're interested in recieving the
> actual value instead of just a notification.
>
> A simple example for practical use is the serial port. we are not
> interested in an exit every time a char is written to the port, but
> we do need to know what was written so we could handle it on the guest.
> ---
> include/linux/kvm.h | 2 +
> virt/kvm/eventfd.c | 65 +++++++++++++++++++++++++++++++++++---------------
Documentation/virtua/kvm/api.txt +++++++++++++++++
>
> @@ -424,6 +425,7 @@ struct _ioeventfd {
> struct list_head list;
> u64 addr;
> int length;
> + struct file *pipe;
> struct eventfd_ctx *eventfd;
In a union with eventfd please.
> @@ -481,6 +487,21 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
> return _val == p->datamatch ? true : false;
> }
>
> +static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
> + loff_t pos)
> +{
> + mm_segment_t old_fs;
> + ssize_t res;
> +
> + old_fs = get_fs();
> + set_fs(get_ds());
> + /* The cast to a user pointer is valid due to the set_fs() */
> + res = vfs_write(file, (const char __user *)buf, count,&pos);
> + set_fs(old_fs);
> +
> + return res;
> +}
> +
Is there no generic helper for this? Should there be?
> /* MMIO/PIO writes trigger an event if the addr/val match */
> static int
> ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
> @@ -491,7 +512,11 @@ ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
> if (!ioeventfd_in_range(p, addr, len, val))
> return -EOPNOTSUPP;
>
> - eventfd_signal(p->eventfd, 1);
> + if (p->pipe)
> + kernel_write(p->pipe, val, len, 0);
You're writing potentially variable length data.
We need a protocol containing address, data, length, and supporting read
accesses as well.
Is the write guaranteed atomic? We probably need serialization here.
> + else
> + eventfd_signal(p->eventfd, 1);
> +
> return 0;
> }
>
> @@ -555,9 +580,11 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> if (args->flags& ~KVM_IOEVENTFD_VALID_FLAG_MASK)
> return -EINVAL;
>
> - eventfd = eventfd_ctx_fdget(args->fd);
> - if (IS_ERR(eventfd))
> - return PTR_ERR(eventfd);
> + if (!(args->flags& KVM_IOEVENTFD_FLAG_PIPE)) {
> + eventfd = eventfd_ctx_fdget(args->fd);
> + if (IS_ERR(eventfd))
> + return PTR_ERR(eventfd);
> + }
>
> p = kzalloc(sizeof(*p), GFP_KERNEL);
> if (!p) {
> @@ -568,7 +595,11 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> INIT_LIST_HEAD(&p->list);
> p->addr = args->addr;
> p->length = args->len;
> - p->eventfd = eventfd;
> +
> + if (args->flags& KVM_IOEVENTFD_FLAG_PIPE)
> + p->pipe = fget(args->fd);
> + else
> + p->eventfd = eventfd;
The split logic with the previous hunk isn't nice. Suggest moving the
'else' there, and assigning the whole union here.
> list_for_each_entry_safe(p, tmp,&kvm->ioeventfds, list) {
> bool wildcard = !(args->flags& KVM_IOEVENTFD_FLAG_DATAMATCH);
>
> - if (p->eventfd != eventfd ||
> - p->addr != args->addr ||
> + if (p->addr != args->addr ||
> p->length != args->len ||
> p->wildcard != wildcard)
> continue;
Why?
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-03 17:16 ` Avi Kivity
@ 2011-07-03 17:44 ` Sasha Levin
2011-07-03 17:57 ` Pekka Enberg
2011-07-04 10:27 ` Avi Kivity
0 siblings, 2 replies; 20+ messages in thread
From: Sasha Levin @ 2011-07-03 17:44 UTC (permalink / raw)
To: Avi Kivity
Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin,
Pekka Enberg
On Sun, 2011-07-03 at 20:16 +0300, Avi Kivity wrote:
> On 07/03/2011 08:04 PM, Sasha Levin wrote:
> > The new flag allows passing a write side of a pipe instead of an
> > eventfd to be notified of writes to the specified memory region.
> >
> > Instead of signaling an event, the value written to the memory region
> > is written to the pipe.
> >
> > Using a pipe instead of an eventfd is usefull when any value can be
> > written to the memory region but we're interested in recieving the
> > actual value instead of just a notification.
> >
> > A simple example for practical use is the serial port. we are not
> > interested in an exit every time a char is written to the port, but
> > we do need to know what was written so we could handle it on the guest.
>
> > ---
> > include/linux/kvm.h | 2 +
> > virt/kvm/eventfd.c | 65 +++++++++++++++++++++++++++++++++++---------------
>
> Documentation/virtua/kvm/api.txt +++++++++++++++++
I couldn't find the ioeventfd docs in there and forgot that it's not in
the mainline yet, I'll rebase on kvm git tree :)
> >
> > @@ -424,6 +425,7 @@ struct _ioeventfd {
> > struct list_head list;
> > u64 addr;
> > int length;
> > + struct file *pipe;
> > struct eventfd_ctx *eventfd;
>
> In a union with eventfd please.
>
> > @@ -481,6 +487,21 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
> > return _val == p->datamatch ? true : false;
> > }
> >
> > +static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
> > + loff_t pos)
> > +{
> > + mm_segment_t old_fs;
> > + ssize_t res;
> > +
> > + old_fs = get_fs();
> > + set_fs(get_ds());
> > + /* The cast to a user pointer is valid due to the set_fs() */
> > + res = vfs_write(file, (const char __user *)buf, count,&pos);
> > + set_fs(old_fs);
> > +
> > + return res;
> > +}
> > +
>
> Is there no generic helper for this? Should there be?
>
I couldn't find one, I took the code above from fs/splice.c.
There should probably be a generic version of it as this snippet repeats
itself in several locations throughout the kernel.
> > /* MMIO/PIO writes trigger an event if the addr/val match */
> > static int
> > ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
> > @@ -491,7 +512,11 @@ ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
> > if (!ioeventfd_in_range(p, addr, len, val))
> > return -EOPNOTSUPP;
> >
> > - eventfd_signal(p->eventfd, 1);
> > + if (p->pipe)
> > + kernel_write(p->pipe, val, len, 0);
>
> You're writing potentially variable length data.
>
> We need a protocol containing address, data, length, and supporting read
> accesses as well.
>
This can't be variable length.
The user defines an ioeventfd as an address+length (with length being up
to 8 bytes). The only time an ioeventfd is signaled is when the write to
the guest memory is exactly at the specified address with exactly the
specified length.
ioeventfds can be extended to handle more than 8 bytes, variable address
offset and reads now that pipe support is added, but I'd rather do it in
follow-up patches once basic pipe support is in.
> Is the write guaranteed atomic? We probably need serialization here.
afaik vfs_write is just a wrapper to the write() function of the
underlying fs so it should be atomic, no?
>
> > + else
> > + eventfd_signal(p->eventfd, 1);
> > +
> > return 0;
> > }
> >
> > @@ -555,9 +580,11 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> > if (args->flags& ~KVM_IOEVENTFD_VALID_FLAG_MASK)
> > return -EINVAL;
> >
> > - eventfd = eventfd_ctx_fdget(args->fd);
> > - if (IS_ERR(eventfd))
> > - return PTR_ERR(eventfd);
> > + if (!(args->flags& KVM_IOEVENTFD_FLAG_PIPE)) {
> > + eventfd = eventfd_ctx_fdget(args->fd);
> > + if (IS_ERR(eventfd))
> > + return PTR_ERR(eventfd);
> > + }
> >
> > p = kzalloc(sizeof(*p), GFP_KERNEL);
> > if (!p) {
> > @@ -568,7 +595,11 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> > INIT_LIST_HEAD(&p->list);
> > p->addr = args->addr;
> > p->length = args->len;
> > - p->eventfd = eventfd;
> > +
> > + if (args->flags& KVM_IOEVENTFD_FLAG_PIPE)
> > + p->pipe = fget(args->fd);
> > + else
> > + p->eventfd = eventfd;
>
> The split logic with the previous hunk isn't nice. Suggest moving the
> 'else' there, and assigning the whole union here.
>
> > list_for_each_entry_safe(p, tmp,&kvm->ioeventfds, list) {
> > bool wildcard = !(args->flags& KVM_IOEVENTFD_FLAG_DATAMATCH);
> >
> > - if (p->eventfd != eventfd ||
> > - p->addr != args->addr ||
> > + if (p->addr != args->addr ||
> > p->length != args->len ||
> > p->wildcard != wildcard)
> > continue;
>
> Why?
I didn't think that assigning 2 different events with exactly the same
address, length and data can happen. Why would it?
>
>
--
Sasha.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-03 17:44 ` Sasha Levin
@ 2011-07-03 17:57 ` Pekka Enberg
2011-07-04 10:27 ` Avi Kivity
1 sibling, 0 replies; 20+ messages in thread
From: Pekka Enberg @ 2011-07-03 17:57 UTC (permalink / raw)
To: Sasha Levin
Cc: Avi Kivity, kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin
On Sun, Jul 3, 2011 at 8:44 PM, Sasha Levin <levinsasha928@gmail.com> wrote:
>> > +static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
>> > + loff_t pos)
>> > +{
>> > + mm_segment_t old_fs;
>> > + ssize_t res;
>> > +
>> > + old_fs = get_fs();
>> > + set_fs(get_ds());
>> > + /* The cast to a user pointer is valid due to the set_fs() */
>> > + res = vfs_write(file, (const char __user *)buf, count,&pos);
>> > + set_fs(old_fs);
>> > +
>> > + return res;
>> > +}
>> > +
>>
>> Is there no generic helper for this? Should there be?
>>
>
> I couldn't find one, I took the code above from fs/splice.c.
>
> There should probably be a generic version of it as this snippet repeats
> itself in several locations throughout the kernel.
But not all of them do just vfs_write() after set_fs(). I don't think
a generic kernel_wrapper() is worth it to be honest.
Pekka
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-03 17:44 ` Sasha Levin
2011-07-03 17:57 ` Pekka Enberg
@ 2011-07-04 10:27 ` Avi Kivity
2011-07-04 10:49 ` Michael S. Tsirkin
2011-07-04 14:38 ` Sasha Levin
1 sibling, 2 replies; 20+ messages in thread
From: Avi Kivity @ 2011-07-04 10:27 UTC (permalink / raw)
To: Sasha Levin
Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin,
Pekka Enberg
On 07/03/2011 08:44 PM, Sasha Levin wrote:
> On Sun, 2011-07-03 at 20:16 +0300, Avi Kivity wrote:
> > On 07/03/2011 08:04 PM, Sasha Levin wrote:
> > > The new flag allows passing a write side of a pipe instead of an
> > > eventfd to be notified of writes to the specified memory region.
> > >
> > > Instead of signaling an event, the value written to the memory region
> > > is written to the pipe.
> > >
> > > Using a pipe instead of an eventfd is usefull when any value can be
> > > written to the memory region but we're interested in recieving the
> > > actual value instead of just a notification.
> > >
> > > A simple example for practical use is the serial port. we are not
> > > interested in an exit every time a char is written to the port, but
> > > we do need to know what was written so we could handle it on the guest.
> >
> > > ---
> > > include/linux/kvm.h | 2 +
> > > virt/kvm/eventfd.c | 65 +++++++++++++++++++++++++++++++++++---------------
> >
> > Documentation/virtua/kvm/api.txt +++++++++++++++++
>
> I couldn't find the ioeventfd docs in there and forgot that it's not in
> the mainline yet, I'll rebase on kvm git tree :)
Please do that forall kvm patches in the future.
> >
> > Is there no generic helper for this? Should there be?
> >
>
> I couldn't find one, I took the code above from fs/splice.c.
>
> There should probably be a generic version of it as this snippet repeats
> itself in several locations throughout the kernel.
Yes. Suggest sending a patchset that consolidates this to akpm.
> > >
> > > - eventfd_signal(p->eventfd, 1);
> > > + if (p->pipe)
> > > + kernel_write(p->pipe, val, len, 0);
> >
> > You're writing potentially variable length data.
> >
> > We need a protocol containing address, data, length, and supporting read
> > accesses as well.
> >
>
> This can't be variable length.
>
> The user defines an ioeventfd as an address+length (with length being up
> to 8 bytes). The only time an ioeventfd is signaled is when the write to
> the guest memory is exactly at the specified address with exactly the
> specified length.
>
It can be variable length if multiple ioeventfds reference the same pipe.
> ioeventfds can be extended to handle more than 8 bytes, variable address
> offset and reads now that pipe support is added, but I'd rather do it in
> follow-up patches once basic pipe support is in.
In general incremental development is great, but I don't want to
fragment the ABI. I'd like to be able to forward an entire PCI BAR over
a pipe. That means sending the address/data/length tuple, and both read
and write support.
> > Is the write guaranteed atomic? We probably need serialization here.
>
> afaik vfs_write is just a wrapper to the write() function of the
> underlying fs so it should be atomic, no?
write() isn't atomic in general. It is for pipes under certain
circumstances, but there is no guarantee that the circumstances apply,
or that the fd is in fact a pipe.
> > > list_for_each_entry_safe(p, tmp,&kvm->ioeventfds, list) {
> > > bool wildcard = !(args->flags& KVM_IOEVENTFD_FLAG_DATAMATCH);
> > >
> > > - if (p->eventfd != eventfd ||
> > > - p->addr != args->addr ||
> > > + if (p->addr != args->addr ||
> > > p->length != args->len ||
> > > p->wildcard != wildcard)
> > > continue;
> >
> > Why?
>
> I didn't think that assigning 2 different events with exactly the same
> address, length and data can happen. Why would it?
>
No reason, but this is a user interface. You can't assume anything
about the user except that he is an evil genius intent on breaking the
kernel in various ways.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-03 17:04 [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE Sasha Levin
2011-07-03 17:16 ` Avi Kivity
@ 2011-07-04 10:32 ` Michael S. Tsirkin
2011-07-04 10:45 ` Avi Kivity
1 sibling, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-07-04 10:32 UTC (permalink / raw)
To: Sasha Levin; +Cc: kvm, Avi Kivity, Ingo Molnar, Marcelo Tosatti, Pekka Enberg
On Sun, Jul 03, 2011 at 08:04:49PM +0300, Sasha Levin wrote:
> The new flag allows passing a write side of a pipe instead of an
> eventfd to be notified of writes to the specified memory region.
>
> Instead of signaling an event, the value written to the memory region
> is written to the pipe.
>
> Using a pipe instead of an eventfd is usefull when any value can be
> written to the memory region but we're interested in recieving the
> actual value instead of just a notification.
>
> A simple example for practical use is the serial port. we are not
> interested in an exit every time a char is written to the port, but
> we do need to know what was written so we could handle it on the guest.
Looking at this example, how would you handle a pipe full condition?
We can't buffer unlimited amount of data in the host.
> Cc: Avi Kivity <avi@redhat.com>
> Cc: Ingo Molnar <mingo@elte.hu>
> Cc: Marcelo Tosatti <mtosatti@redhat.com>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> ---
> include/linux/kvm.h | 2 +
> virt/kvm/eventfd.c | 65 +++++++++++++++++++++++++++++++++++---------------
> 2 files changed, 47 insertions(+), 20 deletions(-)
>
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 55ef181..548f23a 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -387,12 +387,14 @@ enum {
> kvm_ioeventfd_flag_nr_datamatch,
> kvm_ioeventfd_flag_nr_pio,
> kvm_ioeventfd_flag_nr_deassign,
> + kvm_ioeventfd_flag_nr_pipe,
> kvm_ioeventfd_flag_nr_max,
> };
>
> #define KVM_IOEVENTFD_FLAG_DATAMATCH (1 << kvm_ioeventfd_flag_nr_datamatch)
> #define KVM_IOEVENTFD_FLAG_PIO (1 << kvm_ioeventfd_flag_nr_pio)
> #define KVM_IOEVENTFD_FLAG_DEASSIGN (1 << kvm_ioeventfd_flag_nr_deassign)
> +#define KVM_IOEVENTFD_FLAG_PIPE (1 << kvm_ioeventfd_flag_nr_pipe)
>
> #define KVM_IOEVENTFD_VALID_FLAG_MASK ((1 << kvm_ioeventfd_flag_nr_max) - 1)
>
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 73358d2..434293e 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -413,10 +413,11 @@ module_exit(irqfd_module_exit);
>
> /*
> * --------------------------------------------------------------------
> - * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
> + * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal or
> + * a pipe write.
> *
> - * userspace can register a PIO/MMIO address with an eventfd for receiving
> - * notification when the memory has been touched.
> + * userspace can register a PIO/MMIO address with an eventfd or a
> + * pipe for receiving notification when the memory has been touched.
> * --------------------------------------------------------------------
> */
>
> @@ -424,6 +425,7 @@ struct _ioeventfd {
> struct list_head list;
> u64 addr;
> int length;
> + struct file *pipe;
> struct eventfd_ctx *eventfd;
> u64 datamatch;
> struct kvm_io_device dev;
> @@ -439,7 +441,11 @@ to_ioeventfd(struct kvm_io_device *dev)
> static void
> ioeventfd_release(struct _ioeventfd *p)
> {
> - eventfd_ctx_put(p->eventfd);
> + if (p->eventfd)
> + eventfd_ctx_put(p->eventfd);
> + else
> + fput(p->pipe);
> +
> list_del(&p->list);
> kfree(p);
> }
> @@ -481,6 +487,21 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
> return _val == p->datamatch ? true : false;
> }
>
> +static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
> + loff_t pos)
> +{
> + mm_segment_t old_fs;
> + ssize_t res;
> +
> + old_fs = get_fs();
> + set_fs(get_ds());
> + /* The cast to a user pointer is valid due to the set_fs() */
Interesting. Is buf really always a user pointer?
Why don't we tag it __user then?
> + res = vfs_write(file, (const char __user *)buf, count, &pos);
If pipe is non-blocking, or if we get a signal,
this might fail or return a value < len.
Data will be lost then, won't it?
> + set_fs(old_fs);
> +
> + return res;
> +}
> +
> /* MMIO/PIO writes trigger an event if the addr/val match */
> static int
> ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
> @@ -491,7 +512,11 @@ ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
> if (!ioeventfd_in_range(p, addr, len, val))
> return -EOPNOTSUPP;
>
> - eventfd_signal(p->eventfd, 1);
> + if (p->pipe)
> + kernel_write(p->pipe, val, len, 0);
> + else
> + eventfd_signal(p->eventfd, 1);
> +
> return 0;
> }
>
> @@ -533,7 +558,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
> enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
> struct _ioeventfd *p;
> - struct eventfd_ctx *eventfd;
> + struct eventfd_ctx *eventfd = NULL;
> int ret;
>
> /* must be natural-word sized */
> @@ -555,9 +580,11 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
> return -EINVAL;
>
> - eventfd = eventfd_ctx_fdget(args->fd);
> - if (IS_ERR(eventfd))
> - return PTR_ERR(eventfd);
> + if (!(args->flags & KVM_IOEVENTFD_FLAG_PIPE)) {
> + eventfd = eventfd_ctx_fdget(args->fd);
> + if (IS_ERR(eventfd))
> + return PTR_ERR(eventfd);
> + }
>
> p = kzalloc(sizeof(*p), GFP_KERNEL);
> if (!p) {
> @@ -568,7 +595,11 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> INIT_LIST_HEAD(&p->list);
> p->addr = args->addr;
> p->length = args->len;
> - p->eventfd = eventfd;
> +
> + if (args->flags & KVM_IOEVENTFD_FLAG_PIPE)
> + p->pipe = fget(args->fd);
This really needs to check that the fd is a pipe.
Otherwise you can do weird things like pass in
the kvm device fd itself.
> + else
> + p->eventfd = eventfd;
>
> /* The datamatch feature is optional, otherwise this is a wildcard */
> if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
> @@ -601,7 +632,9 @@ unlock_fail:
>
> fail:
> kfree(p);
> - eventfd_ctx_put(eventfd);
> +
> + if (!(args->flags & KVM_IOEVENTFD_FLAG_PIPE))
> + eventfd_ctx_put(eventfd);
>
> return ret;
> }
> @@ -612,20 +645,14 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
> int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
> enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
> struct _ioeventfd *p, *tmp;
> - struct eventfd_ctx *eventfd;
> int ret = -ENOENT;
>
> - eventfd = eventfd_ctx_fdget(args->fd);
> - if (IS_ERR(eventfd))
> - return PTR_ERR(eventfd);
> -
> mutex_lock(&kvm->slots_lock);
>
> list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
> bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);
>
> - if (p->eventfd != eventfd ||
> - p->addr != args->addr ||
> + if (p->addr != args->addr ||
> p->length != args->len ||
> p->wildcard != wildcard)
> continue;
> @@ -641,8 +668,6 @@ kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
>
> mutex_unlock(&kvm->slots_lock);
>
> - eventfd_ctx_put(eventfd);
> -
Don't we need to put on deassign?
> return ret;
> }
>
> --
> 1.7.6
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 10:32 ` Michael S. Tsirkin
@ 2011-07-04 10:45 ` Avi Kivity
2011-07-04 11:07 ` Michael S. Tsirkin
0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2011-07-04 10:45 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Sasha Levin, kvm, Ingo Molnar, Marcelo Tosatti, Pekka Enberg
On 07/04/2011 01:32 PM, Michael S. Tsirkin wrote:
> On Sun, Jul 03, 2011 at 08:04:49PM +0300, Sasha Levin wrote:
> > The new flag allows passing a write side of a pipe instead of an
> > eventfd to be notified of writes to the specified memory region.
> >
> > Instead of signaling an event, the value written to the memory region
> > is written to the pipe.
> >
> > Using a pipe instead of an eventfd is usefull when any value can be
> > written to the memory region but we're interested in recieving the
> > actual value instead of just a notification.
> >
> > A simple example for practical use is the serial port. we are not
> > interested in an exit every time a char is written to the port, but
> > we do need to know what was written so we could handle it on the guest.
>
> Looking at this example, how would you handle a pipe full condition?
> We can't buffer unlimited amount of data in the host.
Stall.
> > +static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
> > + loff_t pos)
> > +{
> > + mm_segment_t old_fs;
> > + ssize_t res;
> > +
> > + old_fs = get_fs();
> > + set_fs(get_ds());
> > + /* The cast to a user pointer is valid due to the set_fs() */
>
> Interesting. Is buf really always a user pointer?
> Why don't we tag it __user then?
It's not a user pointer, this is just to get the tools happy. set_fs()
makes it safe.
> > + res = vfs_write(file, (const char __user *)buf, count,&pos);
>
> If pipe is non-blocking, or if we get a signal,
> this might fail or return a value< len.
> Data will be lost then, won't it?
Yes. Need a loop-until-buffer-exhausted-or-error.
Error reporting is an interesting question. Typically we have KVM_RUN
return the error, but if we use this facility to run something in a
separate process, this can cause the device process crash to cause the
guest to crash.
> > - p->eventfd = eventfd;
> > +
> > + if (args->flags& KVM_IOEVENTFD_FLAG_PIPE)
> > + p->pipe = fget(args->fd);
>
> This really needs to check that the fd is a pipe.
> Otherwise you can do weird things like pass in
> the kvm device fd itself.
Eww, reference loop. Good catch.
We should allow unix domain sockets as well. In fact, for read/write
support, we need this to be a unix domain socket.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 10:27 ` Avi Kivity
@ 2011-07-04 10:49 ` Michael S. Tsirkin
2011-07-04 10:57 ` Avi Kivity
2011-07-04 14:38 ` Sasha Levin
1 sibling, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-07-04 10:49 UTC (permalink / raw)
To: Avi Kivity; +Cc: Sasha Levin, kvm, Ingo Molnar, Marcelo Tosatti, Pekka Enberg
On Mon, Jul 04, 2011 at 01:27:45PM +0300, Avi Kivity wrote:
> I'd like to be able to forward an entire PCI BAR
> over a pipe. That means sending the address/data/length tuple, and
> both read and write support.
>
> >> Is the write guaranteed atomic? We probably need serialization here.
> >
> >afaik vfs_write is just a wrapper to the write() function of the
> >underlying fs so it should be atomic, no?
>
> write() isn't atomic in general. It is for pipes under certain
> circumstances, but there is no guarantee that the circumstances
> apply, or that the fd is in fact a pipe.
So the above makes it tricky to pass structured data
which is > 1 byte in size over a pipe. We could build an in-kernel API
(not sure how useful it would be for userspace) to do
an atomic write or fail (instead of a partial write),
but we'll still need to handle failures.
--
MST
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 10:49 ` Michael S. Tsirkin
@ 2011-07-04 10:57 ` Avi Kivity
0 siblings, 0 replies; 20+ messages in thread
From: Avi Kivity @ 2011-07-04 10:57 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Sasha Levin, kvm, Ingo Molnar, Marcelo Tosatti, Pekka Enberg
On 07/04/2011 01:49 PM, Michael S. Tsirkin wrote:
> On Mon, Jul 04, 2011 at 01:27:45PM +0300, Avi Kivity wrote:
> > I'd like to be able to forward an entire PCI BAR
> > over a pipe. That means sending the address/data/length tuple, and
> > both read and write support.
> >
> > >> Is the write guaranteed atomic? We probably need serialization here.
> > >
> > >afaik vfs_write is just a wrapper to the write() function of the
> > >underlying fs so it should be atomic, no?
> >
> > write() isn't atomic in general. It is for pipes under certain
> > circumstances, but there is no guarantee that the circumstances
> > apply, or that the fd is in fact a pipe.
>
> So the above makes it tricky to pass structured data
> which is> 1 byte in size over a pipe. We could build an in-kernel API
> (not sure how useful it would be for userspace) to do
> an atomic write or fail (instead of a partial write),
> but we'll still need to handle failures.
We don't need atomicity after a failure, just under normal operation,
and that can be provided by a mutex.
It's not trivial since a single fd might be associated with several region.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 10:45 ` Avi Kivity
@ 2011-07-04 11:07 ` Michael S. Tsirkin
2011-07-04 11:19 ` Avi Kivity
0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-07-04 11:07 UTC (permalink / raw)
To: Avi Kivity; +Cc: Sasha Levin, kvm, Ingo Molnar, Marcelo Tosatti, Pekka Enberg
On Mon, Jul 04, 2011 at 01:45:07PM +0300, Avi Kivity wrote:
> On 07/04/2011 01:32 PM, Michael S. Tsirkin wrote:
> >On Sun, Jul 03, 2011 at 08:04:49PM +0300, Sasha Levin wrote:
> >> The new flag allows passing a write side of a pipe instead of an
> >> eventfd to be notified of writes to the specified memory region.
> >>
> >> Instead of signaling an event, the value written to the memory region
> >> is written to the pipe.
> >>
> >> Using a pipe instead of an eventfd is usefull when any value can be
> >> written to the memory region but we're interested in recieving the
> >> actual value instead of just a notification.
> >>
> >> A simple example for practical use is the serial port. we are not
> >> interested in an exit every time a char is written to the port, but
> >> we do need to know what was written so we could handle it on the guest.
> >
> >Looking at this example, how would you handle a pipe full condition?
> >We can't buffer unlimited amount of data in the host.
>
> Stall.
Right, but the guest gets no indication that the pipe is full.
Something like virtio would let the guest do something useful
instead of stalling the vcpu.
Also noting that the fd can be set not to block, or that
a signal can interrupt the write. Both cases are not errors.
> >> +static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
> >> + loff_t pos)
> >> +{
> >> + mm_segment_t old_fs;
> >> + ssize_t res;
> >> +
> >> + old_fs = get_fs();
> >> + set_fs(get_ds());
> >> + /* The cast to a user pointer is valid due to the set_fs() */
> >
> >Interesting. Is buf really always a user pointer?
> >Why don't we tag it __user then?
>
> It's not a user pointer, this is just to get the tools happy.
> set_fs() makes it safe.
>
> >> + res = vfs_write(file, (const char __user *)buf, count,&pos);
> >
> >If pipe is non-blocking, or if we get a signal,
> >this might fail or return a value< len.
> >Data will be lost then, won't it?
>
> Yes. Need a loop-until-buffer-exhausted-or-error.
Signal handling becomes a problem. You don't want a
full pipe to prevent qemu from getting killed or
getting a timer alert.
> Error reporting is an interesting question. Typically we have
> KVM_RUN return the error, but if we use this facility to run
> something in a separate process, this can cause the device process
> crash to cause the guest to crash.
>
> >> - p->eventfd = eventfd;
> >> +
> >> + if (args->flags& KVM_IOEVENTFD_FLAG_PIPE)
> >> + p->pipe = fget(args->fd);
> >
> >This really needs to check that the fd is a pipe.
> >Otherwise you can do weird things like pass in
> >the kvm device fd itself.
>
> Eww, reference loop. Good catch.
>
> We should allow unix domain sockets as well. In fact, for
> read/write support, we need this to be a unix domain socket.
Sockets are actually better at this than pipes
as you can at least make the writes
non-blocking by passing in a message flag.
If we support sockets, do we really need to support
pipes at all?
--
MST
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 11:07 ` Michael S. Tsirkin
@ 2011-07-04 11:19 ` Avi Kivity
2011-07-04 11:45 ` Michael S. Tsirkin
0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2011-07-04 11:19 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Sasha Levin, kvm, Ingo Molnar, Marcelo Tosatti, Pekka Enberg
On 07/04/2011 02:07 PM, Michael S. Tsirkin wrote:
> On Mon, Jul 04, 2011 at 01:45:07PM +0300, Avi Kivity wrote:
> > On 07/04/2011 01:32 PM, Michael S. Tsirkin wrote:
> > >On Sun, Jul 03, 2011 at 08:04:49PM +0300, Sasha Levin wrote:
> > >> The new flag allows passing a write side of a pipe instead of an
> > >> eventfd to be notified of writes to the specified memory region.
> > >>
> > >> Instead of signaling an event, the value written to the memory region
> > >> is written to the pipe.
> > >>
> > >> Using a pipe instead of an eventfd is usefull when any value can be
> > >> written to the memory region but we're interested in recieving the
> > >> actual value instead of just a notification.
> > >>
> > >> A simple example for practical use is the serial port. we are not
> > >> interested in an exit every time a char is written to the port, but
> > >> we do need to know what was written so we could handle it on the guest.
> > >
> > >Looking at this example, how would you handle a pipe full condition?
> > >We can't buffer unlimited amount of data in the host.
> >
> > Stall.
>
> Right, but the guest gets no indication that the pipe is full.
> Something like virtio would let the guest do something useful
> instead of stalling the vcpu.
That's not a problem. The vcpu blocks, which lets the other process get
the cpu and run with it. If there are not enough cpu resources, we'll
indeed stall the vcpu, but that happens whenever you're overcommitted
anyway.
> Also noting that the fd can be set not to block, or that
> a signal can interrupt the write. Both cases are not errors.
One thing we can do is return via the normal KVM_EXIT_MMIO method and
hope userspace knows how to handle this. Otherwise I don't see what we
can do.
> > >
> > >If pipe is non-blocking, or if we get a signal,
> > >this might fail or return a value< len.
> > >Data will be lost then, won't it?
> >
> > Yes. Need a loop-until-buffer-exhausted-or-error.
>
> Signal handling becomes a problem. You don't want a
> full pipe to prevent qemu from getting killed or
> getting a timer alert.
Maybe we should require AF_UNIX SOCK_SEQPACKET connection. That gives
us atomicity, and drops the need for a mutex.
> >
> > We should allow unix domain sockets as well. In fact, for
> > read/write support, we need this to be a unix domain socket.
>
> Sockets are actually better at this than pipes
> as you can at least make the writes
> non-blocking by passing in a message flag.
I'm not sure we want that. How do we handle it?
If the socket buffers get filled up, it's time for the vcpu to wait for
the mmio server process. Let the scheduler sort things out.
btw, like vhost-net and other thread offloads, this sort of trick is
dangerous. When you have excess cpu resources throughput improves, but
once the system is loaded, the workload is needlessly spread across more
cores than strictly necessary and communication is done by context
switches instead of user/system transitions.
> If we support sockets, do we really need to support
> pipes at all
I think not.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 11:19 ` Avi Kivity
@ 2011-07-04 11:45 ` Michael S. Tsirkin
2011-07-04 11:49 ` Avi Kivity
0 siblings, 1 reply; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-07-04 11:45 UTC (permalink / raw)
To: Avi Kivity; +Cc: Sasha Levin, kvm, Ingo Molnar, Marcelo Tosatti, Pekka Enberg
On Mon, Jul 04, 2011 at 02:19:39PM +0300, Avi Kivity wrote:
> >Also noting that the fd can be set not to block, or that
> >a signal can interrupt the write. Both cases are not errors.
>
> One thing we can do is return via the normal KVM_EXIT_MMIO method
> and hope userspace knows how to handle this.
Agree. We'll need some way to tell userspace that
write has partially succeeded, presumably, so that
userspace can write out the missing part.
--
MST
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 11:45 ` Michael S. Tsirkin
@ 2011-07-04 11:49 ` Avi Kivity
2011-07-04 12:12 ` Michael S. Tsirkin
0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2011-07-04 11:49 UTC (permalink / raw)
To: Michael S. Tsirkin
Cc: Sasha Levin, kvm, Ingo Molnar, Marcelo Tosatti, Pekka Enberg
On 07/04/2011 02:45 PM, Michael S. Tsirkin wrote:
> On Mon, Jul 04, 2011 at 02:19:39PM +0300, Avi Kivity wrote:
> > >Also noting that the fd can be set not to block, or that
> > >a signal can interrupt the write. Both cases are not errors.
> >
> > One thing we can do is return via the normal KVM_EXIT_MMIO method
> > and hope userspace knows how to handle this.
>
> Agree. We'll need some way to tell userspace that
> write has partially succeeded, presumably, so that
> userspace can write out the missing part.
With SOCK_SEQPACKET, there will be no partial writes.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 11:49 ` Avi Kivity
@ 2011-07-04 12:12 ` Michael S. Tsirkin
0 siblings, 0 replies; 20+ messages in thread
From: Michael S. Tsirkin @ 2011-07-04 12:12 UTC (permalink / raw)
To: Avi Kivity; +Cc: Sasha Levin, kvm, Ingo Molnar, Marcelo Tosatti, Pekka Enberg
On Mon, Jul 04, 2011 at 02:49:23PM +0300, Avi Kivity wrote:
> On 07/04/2011 02:45 PM, Michael S. Tsirkin wrote:
> >On Mon, Jul 04, 2011 at 02:19:39PM +0300, Avi Kivity wrote:
> >> >Also noting that the fd can be set not to block, or that
> >> >a signal can interrupt the write. Both cases are not errors.
> >>
> >> One thing we can do is return via the normal KVM_EXIT_MMIO method
> >> and hope userspace knows how to handle this.
> >
> >Agree. We'll need some way to tell userspace that
> >write has partially succeeded, presumably, so that
> >userspace can write out the missing part.
>
> With SOCK_SEQPACKET, there will be no partial writes.
Good idea. I think the same applies to netlink sockets.
> --
> error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 10:27 ` Avi Kivity
2011-07-04 10:49 ` Michael S. Tsirkin
@ 2011-07-04 14:38 ` Sasha Levin
2011-07-04 14:45 ` Avi Kivity
1 sibling, 1 reply; 20+ messages in thread
From: Sasha Levin @ 2011-07-04 14:38 UTC (permalink / raw)
To: Avi Kivity
Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin,
Pekka Enberg
On Mon, 2011-07-04 at 13:27 +0300, Avi Kivity wrote:
> On 07/03/2011 08:44 PM, Sasha Levin wrote:
> > On Sun, 2011-07-03 at 20:16 +0300, Avi Kivity wrote:
> > > On 07/03/2011 08:04 PM, Sasha Levin wrote:
> > > >
> > > > - eventfd_signal(p->eventfd, 1);
> > > > + if (p->pipe)
> > > > + kernel_write(p->pipe, val, len, 0);
> > >
> > > You're writing potentially variable length data.
> > >
> > > We need a protocol containing address, data, length, and supporting read
> > > accesses as well.
> > >
> >
> > This can't be variable length.
> >
> > The user defines an ioeventfd as an address+length (with length being up
> > to 8 bytes). The only time an ioeventfd is signaled is when the write to
> > the guest memory is exactly at the specified address with exactly the
> > specified length.
> >
>
> It can be variable length if multiple ioeventfds reference the same pipe.
>
> > ioeventfds can be extended to handle more than 8 bytes, variable address
> > offset and reads now that pipe support is added, but I'd rather do it in
> > follow-up patches once basic pipe support is in.
>
> In general incremental development is great, but I don't want to
> fragment the ABI. I'd like to be able to forward an entire PCI BAR over
> a pipe. That means sending the address/data/length tuple, and both read
> and write support.
Would this mean that for sockets we want to remove the 8 byte limit?
What about eventfds? We can remove the limit there and assume that if
the user asked for more than 8 bytes he knows what he's doing?
--
Sasha.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 14:38 ` Sasha Levin
@ 2011-07-04 14:45 ` Avi Kivity
2011-07-04 14:52 ` Sasha Levin
0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2011-07-04 14:45 UTC (permalink / raw)
To: Sasha Levin
Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin,
Pekka Enberg
On 07/04/2011 05:38 PM, Sasha Levin wrote:
> >
> > In general incremental development is great, but I don't want to
> > fragment the ABI. I'd like to be able to forward an entire PCI BAR over
> > a pipe. That means sending the address/data/length tuple, and both read
> > and write support.
>
> Would this mean that for sockets we want to remove the 8 byte limit?
Yes. Register a range and support all sizes.
Perhaps it merits a separate ioctl.
> What about eventfds? We can remove the limit there and assume that if
> the user asked for more than 8 bytes he knows what he's doing?
I can't really see that as useful. eventfds destroy information;
without datamatch, you have no idea what value was written. Even with
datamatch, you have no idea how many times it was written. With a
range, you also have no idea which address was written. It's pretty
meaningless.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 14:45 ` Avi Kivity
@ 2011-07-04 14:52 ` Sasha Levin
2011-07-04 14:59 ` Avi Kivity
0 siblings, 1 reply; 20+ messages in thread
From: Sasha Levin @ 2011-07-04 14:52 UTC (permalink / raw)
To: Avi Kivity
Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin,
Pekka Enberg
On Mon, 2011-07-04 at 17:45 +0300, Avi Kivity wrote:
> On 07/04/2011 05:38 PM, Sasha Levin wrote:
> > >
> > > In general incremental development is great, but I don't want to
> > > fragment the ABI. I'd like to be able to forward an entire PCI BAR over
> > > a pipe. That means sending the address/data/length tuple, and both read
> > > and write support.
> >
> > Would this mean that for sockets we want to remove the 8 byte limit?
>
> Yes. Register a range and support all sizes.
>
> Perhaps it merits a separate ioctl.
>
> > What about eventfds? We can remove the limit there and assume that if
> > the user asked for more than 8 bytes he knows what he's doing?
>
> I can't really see that as useful. eventfds destroy information;
> without datamatch, you have no idea what value was written. Even with
> datamatch, you have no idea how many times it was written. With a
> range, you also have no idea which address was written. It's pretty
> meaningless.
>
It is pretty useless, but I didn't want the ioctl to behave differently
when passing a socket or an eventfd.
If we do go for a new ioctl as you suggested then yes, problem solved.
--
Sasha.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 14:52 ` Sasha Levin
@ 2011-07-04 14:59 ` Avi Kivity
2011-07-06 4:37 ` Sasha Levin
0 siblings, 1 reply; 20+ messages in thread
From: Avi Kivity @ 2011-07-04 14:59 UTC (permalink / raw)
To: Sasha Levin
Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin,
Pekka Enberg
On 07/04/2011 05:52 PM, Sasha Levin wrote:
> >
> > I can't really see that as useful. eventfds destroy information;
> > without datamatch, you have no idea what value was written. Even with
> > datamatch, you have no idea how many times it was written. With a
> > range, you also have no idea which address was written. It's pretty
> > meaningless.
> >
>
> It is pretty useless, but I didn't want the ioctl to behave differently
> when passing a socket or an eventfd.
>
> If we do go for a new ioctl as you suggested then yes, problem solved.
>
Yes. I guess it depends on the numbers of if () s introduced into the
code. If it starts to feel dirty, split it into a separate ioctl (they
can both call common helpers).
It's fine to allow size > 8 for eventfds. Yes it's meaningless, but
it's not harmful and I can't see it breaking anything. Note that we may
need to change the way we do matches - currently 'size' means the access
size, with an exact match on the address, but the new meaning is 'any
address from start to start+size-1'.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-04 14:59 ` Avi Kivity
@ 2011-07-06 4:37 ` Sasha Levin
2011-07-06 11:30 ` Avi Kivity
0 siblings, 1 reply; 20+ messages in thread
From: Sasha Levin @ 2011-07-06 4:37 UTC (permalink / raw)
To: Avi Kivity
Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin,
Pekka Enberg
On Mon, 2011-07-04 at 17:59 +0300, Avi Kivity wrote:
> On 07/04/2011 05:52 PM, Sasha Levin wrote:
> > >
> > > I can't really see that as useful. eventfds destroy information;
> > > without datamatch, you have no idea what value was written. Even with
> > > datamatch, you have no idea how many times it was written. With a
> > > range, you also have no idea which address was written. It's pretty
> > > meaningless.
> > >
> >
> > It is pretty useless, but I didn't want the ioctl to behave differently
> > when passing a socket or an eventfd.
> >
> > If we do go for a new ioctl as you suggested then yes, problem solved.
> >
>
> Yes. I guess it depends on the numbers of if () s introduced into the
> code. If it starts to feel dirty, split it into a separate ioctl (they
> can both call common helpers).
>
> It's fine to allow size > 8 for eventfds. Yes it's meaningless, but
> it's not harmful and I can't see it breaking anything. Note that we may
> need to change the way we do matches - currently 'size' means the access
> size, with an exact match on the address, but the new meaning is 'any
> address from start to start+size-1'.
>
Looks like there was no need for a new ioctl. I've tried going further
with adding 'meaningless' features to the existing implementation so
that the sockets could be added without needing a new mechanism.
I've added a user configurable 'read' flag which is supposed to trigger
an event when the memory is read and a 'nowrite' flag which won't
trigger an event on a write. This makes it possible to add sockets while
keeping the code pretty much the same.
Instead of making these flags user configurable, they could be
implicitly defined according to the choice of eventfd or socket, I
couldn't decide which method was better so just went with the first one.
--
Sasha.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE
2011-07-06 4:37 ` Sasha Levin
@ 2011-07-06 11:30 ` Avi Kivity
0 siblings, 0 replies; 20+ messages in thread
From: Avi Kivity @ 2011-07-06 11:30 UTC (permalink / raw)
To: Sasha Levin
Cc: kvm, Ingo Molnar, Marcelo Tosatti, Michael S. Tsirkin,
Pekka Enberg
On 07/06/2011 07:37 AM, Sasha Levin wrote:
> Instead of making these flags user configurable, they could be
> implicitly defined according to the choice of eventfd or socket, I
> couldn't decide which method was better so just went with the first one.
>
Explicit is better than implicit, so I agree with your choice here.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 20+ messages in thread
end of thread, other threads:[~2011-07-06 11:31 UTC | newest]
Thread overview: 20+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-07-03 17:04 [PATCH] ioeventfd: Introduce KVM_IOEVENTFD_FLAG_PIPE Sasha Levin
2011-07-03 17:16 ` Avi Kivity
2011-07-03 17:44 ` Sasha Levin
2011-07-03 17:57 ` Pekka Enberg
2011-07-04 10:27 ` Avi Kivity
2011-07-04 10:49 ` Michael S. Tsirkin
2011-07-04 10:57 ` Avi Kivity
2011-07-04 14:38 ` Sasha Levin
2011-07-04 14:45 ` Avi Kivity
2011-07-04 14:52 ` Sasha Levin
2011-07-04 14:59 ` Avi Kivity
2011-07-06 4:37 ` Sasha Levin
2011-07-06 11:30 ` Avi Kivity
2011-07-04 10:32 ` Michael S. Tsirkin
2011-07-04 10:45 ` Avi Kivity
2011-07-04 11:07 ` Michael S. Tsirkin
2011-07-04 11:19 ` Avi Kivity
2011-07-04 11:45 ` Michael S. Tsirkin
2011-07-04 11:49 ` Avi Kivity
2011-07-04 12:12 ` Michael S. Tsirkin
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox