* [PATCH] MMIO: Make coalesced mmio use a device per zone
@ 2011-07-19 8:10 Sasha Levin
2011-07-19 8:48 ` Avi Kivity
0 siblings, 1 reply; 18+ messages in thread
From: Sasha Levin @ 2011-07-19 8:10 UTC (permalink / raw)
To: kvm; +Cc: Sasha Levin, Avi Kivity, Marcelo Tosatti
This patch changes coalesced mmio to create one mmio device per
zone instead of handling all zones in one device.
Doing so enables us to take advantage of existing locking and prevents
a race condition between coalesced mmio registration/unregistration
and lookups.
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Suggested-by: Avi Kivity <avi@redhat.com>
Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
The patch depends on 'KVM: MMIO: Lock coalesced device when checking
for available entry'.
include/linux/kvm_host.h | 1 -
virt/kvm/coalesced_mmio.c | 116 +++++++++++++++++----------------------------
virt/kvm/coalesced_mmio.h | 7 ++-
3 files changed, 48 insertions(+), 76 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index eabb21a..1616096 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -256,7 +256,6 @@ struct kvm {
struct kvm_arch arch;
atomic_t users_count;
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
- struct kvm_coalesced_mmio_dev *coalesced_mmio_dev;
struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
#endif
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index ae075dc..d25da84 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -16,6 +16,9 @@
#include "coalesced_mmio.h"
+static spinlock_t lock;
+static LIST_HEAD(head);
+
static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
{
return container_of(dev, struct kvm_coalesced_mmio_dev, dev);
@@ -24,23 +27,13 @@ static inline struct kvm_coalesced_mmio_dev *to_mmio(struct kvm_io_device *dev)
static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
gpa_t addr, int len)
{
- struct kvm_coalesced_mmio_zone *zone;
- int i;
-
- /* is it in a batchable area ? */
-
- for (i = 0; i < dev->nb_zones; i++) {
- zone = &dev->zone[i];
-
- /* (addr,len) is fully included in
- * (zone->addr, zone->size)
- */
+ /* is it in a batchable area ?
+ * (addr,len) is fully included in
+ * (zone->addr, zone->size)
+ */
- if (zone->addr <= addr &&
- addr + len <= zone->addr + zone->size)
- return 1;
- }
- return 0;
+ return (dev->zone.addr <= addr &&
+ addr + len <= dev->zone.addr + dev->zone.size);
}
static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
@@ -73,10 +66,10 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
if (!coalesced_mmio_in_range(dev, addr, len))
return -EOPNOTSUPP;
- spin_lock(&dev->lock);
+ spin_lock(&lock);
if (!coalesced_mmio_has_room(dev)) {
- spin_unlock(&dev->lock);
+ spin_unlock(&lock);
return -EOPNOTSUPP;
}
@@ -87,7 +80,7 @@ static int coalesced_mmio_write(struct kvm_io_device *this,
memcpy(ring->coalesced_mmio[ring->last].data, val, len);
smp_wmb();
ring->last = (ring->last + 1) % KVM_COALESCED_MMIO_MAX;
- spin_unlock(&dev->lock);
+ spin_unlock(&lock);
return 0;
}
@@ -95,6 +88,8 @@ static void coalesced_mmio_destructor(struct kvm_io_device *this)
{
struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
+ list_del(&dev->list);
+
kfree(dev);
}
@@ -105,7 +100,6 @@ static const struct kvm_io_device_ops coalesced_mmio_ops = {
int kvm_coalesced_mmio_init(struct kvm *kvm)
{
- struct kvm_coalesced_mmio_dev *dev;
struct page *page;
int ret;
@@ -113,31 +107,12 @@ int kvm_coalesced_mmio_init(struct kvm *kvm)
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page)
goto out_err;
- kvm->coalesced_mmio_ring = page_address(page);
-
- ret = -ENOMEM;
- dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
- if (!dev)
- goto out_free_page;
- spin_lock_init(&dev->lock);
- kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
- dev->kvm = kvm;
- kvm->coalesced_mmio_dev = dev;
- mutex_lock(&kvm->slots_lock);
- ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
- mutex_unlock(&kvm->slots_lock);
- if (ret < 0)
- goto out_free_dev;
+ ret = 0;
+ kvm->coalesced_mmio_ring = page_address(page);
- return ret;
+ spin_lock_init(&lock);
-out_free_dev:
- kvm->coalesced_mmio_dev = NULL;
- kfree(dev);
-out_free_page:
- kvm->coalesced_mmio_ring = NULL;
- __free_page(page);
out_err:
return ret;
}
@@ -151,51 +126,48 @@ void kvm_coalesced_mmio_free(struct kvm *kvm)
int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_zone *zone)
{
- struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
+ int ret;
+ struct kvm_coalesced_mmio_dev *dev;
- if (dev == NULL)
- return -ENXIO;
+ dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+
+ kvm_iodevice_init(&dev->dev, &coalesced_mmio_ops);
+ dev->kvm = kvm;
+ dev->zone = *zone;
mutex_lock(&kvm->slots_lock);
- if (dev->nb_zones >= KVM_COALESCED_MMIO_ZONE_MAX) {
- mutex_unlock(&kvm->slots_lock);
- return -ENOBUFS;
- }
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0)
+ goto out_free_dev;
- dev->zone[dev->nb_zones] = *zone;
- dev->nb_zones++;
+ list_add_tail(&dev->list, &head);
+
+ return ret;
+
+out_free_dev:
+ kfree(dev);
+
+ if (dev == NULL)
+ return -ENXIO;
- mutex_unlock(&kvm->slots_lock);
return 0;
}
int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm,
struct kvm_coalesced_mmio_zone *zone)
{
- int i;
- struct kvm_coalesced_mmio_dev *dev = kvm->coalesced_mmio_dev;
- struct kvm_coalesced_mmio_zone *z;
-
- if (dev == NULL)
- return -ENXIO;
+ struct kvm_coalesced_mmio_dev *dev;
mutex_lock(&kvm->slots_lock);
- i = dev->nb_zones;
- while (i) {
- z = &dev->zone[i - 1];
-
- /* unregister all zones
- * included in (zone->addr, zone->size)
- */
-
- if (zone->addr <= z->addr &&
- z->addr + z->size <= zone->addr + zone->size) {
- dev->nb_zones--;
- *z = dev->zone[dev->nb_zones];
+ list_for_each_entry(dev, &head, list)
+ if (coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dev->dev);
+ kvm_iodevice_destructor(&dev->dev);
}
- i--;
- }
mutex_unlock(&kvm->slots_lock);
diff --git a/virt/kvm/coalesced_mmio.h b/virt/kvm/coalesced_mmio.h
index 8a5959e..1011346 100644
--- a/virt/kvm/coalesced_mmio.h
+++ b/virt/kvm/coalesced_mmio.h
@@ -12,14 +12,15 @@
#ifdef CONFIG_KVM_MMIO
+#include <linux/list.h>
+
#define KVM_COALESCED_MMIO_ZONE_MAX 100
struct kvm_coalesced_mmio_dev {
+ struct list_head list;
struct kvm_io_device dev;
struct kvm *kvm;
- spinlock_t lock;
- int nb_zones;
- struct kvm_coalesced_mmio_zone zone[KVM_COALESCED_MMIO_ZONE_MAX];
+ struct kvm_coalesced_mmio_zone zone;
};
int kvm_coalesced_mmio_init(struct kvm *kvm);
--
1.7.6
^ permalink raw reply related [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 8:10 [PATCH] MMIO: Make coalesced mmio use a device per zone Sasha Levin
@ 2011-07-19 8:48 ` Avi Kivity
2011-07-19 9:53 ` Sasha Levin
0 siblings, 1 reply; 18+ messages in thread
From: Avi Kivity @ 2011-07-19 8:48 UTC (permalink / raw)
To: Sasha Levin; +Cc: kvm, Marcelo Tosatti
On 07/19/2011 11:10 AM, Sasha Levin wrote:
> This patch changes coalesced mmio to create one mmio device per
> zone instead of handling all zones in one device.
>
> Doing so enables us to take advantage of existing locking and prevents
> a race condition between coalesced mmio registration/unregistration
> and lookups.
>
>
>
> #include "coalesced_mmio.h"
>
> +static spinlock_t lock;
> +static LIST_HEAD(head);
Make these per-guest instead of global. The lock may be contended, and
the list shouldn't hold items from different guests (why is it needed,
anyway?)
The coalesced mmio devices will now contend with other io devices for
NR_IOBUS_DEVS, so need to increase that (by KVM_COALESCED_MMIO_ZONE_MAX).
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 8:48 ` Avi Kivity
@ 2011-07-19 9:53 ` Sasha Levin
2011-07-19 9:59 ` Avi Kivity
0 siblings, 1 reply; 18+ messages in thread
From: Sasha Levin @ 2011-07-19 9:53 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, Marcelo Tosatti
On Tue, 2011-07-19 at 11:48 +0300, Avi Kivity wrote:
> On 07/19/2011 11:10 AM, Sasha Levin wrote:
> > This patch changes coalesced mmio to create one mmio device per
> > zone instead of handling all zones in one device.
> >
> > Doing so enables us to take advantage of existing locking and prevents
> > a race condition between coalesced mmio registration/unregistration
> > and lookups.
> >
> >
> >
> > #include "coalesced_mmio.h"
> >
> > +static spinlock_t lock;
> > +static LIST_HEAD(head);
>
> Make these per-guest instead of global. The lock may be contended, and
> the list shouldn't hold items from different guests (why is it needed,
> anyway?)
>
We only need the list for removal, since we only have the range we want
to remove, and we want to find all devices which contain this range.
> The coalesced mmio devices will now contend with other io devices for
> NR_IOBUS_DEVS, so need to increase that (by KVM_COALESCED_MMIO_ZONE_MAX).
>
--
Sasha.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 9:53 ` Sasha Levin
@ 2011-07-19 9:59 ` Avi Kivity
2011-07-19 10:17 ` Sasha Levin
0 siblings, 1 reply; 18+ messages in thread
From: Avi Kivity @ 2011-07-19 9:59 UTC (permalink / raw)
To: Sasha Levin; +Cc: kvm, Marcelo Tosatti
On 07/19/2011 12:53 PM, Sasha Levin wrote:
> > Make these per-guest instead of global. The lock may be contended, and
> > the list shouldn't hold items from different guests (why is it needed,
> > anyway?)
> >
>
> We only need the list for removal, since we only have the range we want
> to remove, and we want to find all devices which contain this range.
>
All devices in the same guest which contain this range. Your patch
removes devices from all guests.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 9:59 ` Avi Kivity
@ 2011-07-19 10:17 ` Sasha Levin
2011-07-19 10:32 ` Avi Kivity
0 siblings, 1 reply; 18+ messages in thread
From: Sasha Levin @ 2011-07-19 10:17 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, Marcelo Tosatti
On Tue, 2011-07-19 at 12:59 +0300, Avi Kivity wrote:
> On 07/19/2011 12:53 PM, Sasha Levin wrote:
> > > Make these per-guest instead of global. The lock may be contended, and
> > > the list shouldn't hold items from different guests (why is it needed,
> > > anyway?)
> > >
> >
> > We only need the list for removal, since we only have the range we want
> > to remove, and we want to find all devices which contain this range.
> >
>
> All devices in the same guest which contain this range. Your patch
> removes devices from all guests.
>
Yup. I've messed up guest-locality. Will fix.
Also, I found this comment when increasing NR_IOBUS_DEVS:
/*
* It would be nice to use something smarter than a linear search, TBD...
* Thankfully we dont expect many devices to register (famous last words :),
* so until then it will suffice. At least its abstracted so we can change
* in one place.
*/
Since so far we've registered 5-6 devices, and now it may increase
significantly (since we may want to do the same change to ioeventfds,
which work the same way) - how would you feel if we make devices
register range(s) and do a rbtree lookup instead of a linear search?
--
Sasha.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 10:17 ` Sasha Levin
@ 2011-07-19 10:32 ` Avi Kivity
2011-07-19 11:22 ` Sasha Levin
2011-07-19 17:14 ` Jan Kiszka
0 siblings, 2 replies; 18+ messages in thread
From: Avi Kivity @ 2011-07-19 10:32 UTC (permalink / raw)
To: Sasha Levin; +Cc: kvm, Marcelo Tosatti
On 07/19/2011 01:17 PM, Sasha Levin wrote:
> On Tue, 2011-07-19 at 12:59 +0300, Avi Kivity wrote:
> > On 07/19/2011 12:53 PM, Sasha Levin wrote:
> > > > Make these per-guest instead of global. The lock may be contended, and
> > > > the list shouldn't hold items from different guests (why is it needed,
> > > > anyway?)
> > > >
> > >
> > > We only need the list for removal, since we only have the range we want
> > > to remove, and we want to find all devices which contain this range.
> > >
> >
> > All devices in the same guest which contain this range. Your patch
> > removes devices from all guests.
> >
>
> Yup. I've messed up guest-locality. Will fix.
>
> Also, I found this comment when increasing NR_IOBUS_DEVS:
>
> /*
> * It would be nice to use something smarter than a linear search, TBD...
> * Thankfully we dont expect many devices to register (famous last words :),
> * so until then it will suffice. At least its abstracted so we can change
> * in one place.
> */
>
> Since so far we've registered 5-6 devices, and now it may increase
> significantly (since we may want to do the same change to ioeventfds,
> which work the same way) - how would you feel if we make devices
> register range(s) and do a rbtree lookup instead of a linear search?
>
It makes sense. In fact your change is a good first step - so far it
was impossible to do a clever search since the searching code was not
aware of the ranges (and could not be, since the single coalesced mmio
device had multiple ranges).
Rather than an rbtree, I suggest putting all ranges in an array and
sorting it, then using binary search.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 10:32 ` Avi Kivity
@ 2011-07-19 11:22 ` Sasha Levin
2011-07-19 11:45 ` Sasha Levin
2011-07-19 17:14 ` Jan Kiszka
1 sibling, 1 reply; 18+ messages in thread
From: Sasha Levin @ 2011-07-19 11:22 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, Marcelo Tosatti
On Tue, 2011-07-19 at 13:32 +0300, Avi Kivity wrote:
> On 07/19/2011 01:17 PM, Sasha Levin wrote:
> > On Tue, 2011-07-19 at 12:59 +0300, Avi Kivity wrote:
> > > On 07/19/2011 12:53 PM, Sasha Levin wrote:
> > > > > Make these per-guest instead of global. The lock may be contended, and
> > > > > the list shouldn't hold items from different guests (why is it needed,
> > > > > anyway?)
> > > > >
> > > >
> > > > We only need the list for removal, since we only have the range we want
> > > > to remove, and we want to find all devices which contain this range.
> > > >
> > >
> > > All devices in the same guest which contain this range. Your patch
> > > removes devices from all guests.
> > >
> >
> > Yup. I've messed up guest-locality. Will fix.
> >
> > Also, I found this comment when increasing NR_IOBUS_DEVS:
> >
> > /*
> > * It would be nice to use something smarter than a linear search, TBD...
> > * Thankfully we dont expect many devices to register (famous last words :),
> > * so until then it will suffice. At least its abstracted so we can change
> > * in one place.
> > */
> >
> > Since so far we've registered 5-6 devices, and now it may increase
> > significantly (since we may want to do the same change to ioeventfds,
> > which work the same way) - how would you feel if we make devices
> > register range(s) and do a rbtree lookup instead of a linear search?
> >
>
> It makes sense. In fact your change is a good first step - so far it
> was impossible to to a clever search since the seaching code was not
> aware of the ranges (and could not be, since the single coalesced mmio
> device had multiple ranges).
>
> Rather than an rbtree, I suggest putting all ranges in an array and
> sorting it, then using binary search.
>
Why array over rbtree? An array would be heavier on register/unregister,
and using rbtree would let us easily eliminate any max device limit we
had so far.
We've had good experience using interval rbtree in /tools/kvm where we
did PIO and MMIO device mapping using an augmented rbtree which made
range lookups very simple and fast.
--
Sasha.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 11:22 ` Sasha Levin
@ 2011-07-19 11:45 ` Sasha Levin
0 siblings, 0 replies; 18+ messages in thread
From: Sasha Levin @ 2011-07-19 11:45 UTC (permalink / raw)
To: Avi Kivity; +Cc: kvm, Marcelo Tosatti
On Tue, 2011-07-19 at 14:22 +0300, Sasha Levin wrote:
> On Tue, 2011-07-19 at 13:32 +0300, Avi Kivity wrote:
> > On 07/19/2011 01:17 PM, Sasha Levin wrote:
> > > On Tue, 2011-07-19 at 12:59 +0300, Avi Kivity wrote:
> > > > On 07/19/2011 12:53 PM, Sasha Levin wrote:
> > > > > > Make these per-guest instead of global. The lock may be contended, and
> > > > > > the list shouldn't hold items from different guests (why is it needed,
> > > > > > anyway?)
> > > > > >
> > > > >
> > > > > We only need the list for removal, since we only have the range we want
> > > > > to remove, and we want to find all devices which contain this range.
> > > > >
> > > >
> > > > All devices in the same guest which contain this range. Your patch
> > > > removes devices from all guests.
> > > >
> > >
> > > Yup. I've messed up guest-locality. Will fix.
> > >
> > > Also, I found this comment when increasing NR_IOBUS_DEVS:
> > >
> > > /*
> > > * It would be nice to use something smarter than a linear search, TBD...
> > > * Thankfully we dont expect many devices to register (famous last words :),
> > > * so until then it will suffice. At least its abstracted so we can change
> > > * in one place.
> > > */
> > >
> > > Since so far we've registered 5-6 devices, and now it may increase
> > > significantly (since we may want to do the same change to ioeventfds,
> > > which work the same way) - how would you feel if we make devices
> > > register range(s) and do a rbtree lookup instead of a linear search?
> > >
> >
> > It makes sense. In fact your change is a good first step - so far it
> > was impossible to to a clever search since the seaching code was not
> > aware of the ranges (and could not be, since the single coalesced mmio
> > device had multiple ranges).
> >
> > Rather than an rbtree, I suggest putting all ranges in an array and
> > sorting it, then using binary search.
> >
>
> Why array over rbtree? An array would be heavier on register/unregister,
> and using rbtree would let us easily eliminate any max device limit we
> had so far.
>
> We've had good experience using interval rbtree in /tools/kvm where we
> did PIO and MMIO device mapping using an augmented rbtree which made
> range lookups very simple and fast.
>
Ah, rcu...
Array it is :)
--
Sasha.
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 10:32 ` Avi Kivity
2011-07-19 11:22 ` Sasha Levin
@ 2011-07-19 17:14 ` Jan Kiszka
2011-07-19 17:17 ` Avi Kivity
1 sibling, 1 reply; 18+ messages in thread
From: Jan Kiszka @ 2011-07-19 17:14 UTC (permalink / raw)
To: Avi Kivity; +Cc: Sasha Levin, kvm, Marcelo Tosatti
On 2011-07-19 12:32, Avi Kivity wrote:
> On 07/19/2011 01:17 PM, Sasha Levin wrote:
>> On Tue, 2011-07-19 at 12:59 +0300, Avi Kivity wrote:
>> > On 07/19/2011 12:53 PM, Sasha Levin wrote:
>> > > > Make these per-guest instead of global. The lock may be
>> contended, and
>> > > > the list shouldn't hold items from different guests (why is
>> it needed,
>> > > > anyway?)
>> > > >
>> > >
>> > > We only need the list for removal, since we only have the range
>> we want
>> > > to remove, and we want to find all devices which contain this
>> range.
>> > >
>> >
>> > All devices in the same guest which contain this range. Your patch
>> > removes devices from all guests.
>> >
>>
>> Yup. I've messed up guest-locality. Will fix.
>>
>> Also, I found this comment when increasing NR_IOBUS_DEVS:
>>
>> /*
>> * It would be nice to use something smarter than a linear search,
>> TBD...
>> * Thankfully we dont expect many devices to register (famous last
>> words :),
>> * so until then it will suffice. At least its abstracted so we can
>> change
>> * in one place.
>> */
>>
>> Since so far we've registered 5-6 devices, and now it may increase
>> significantly (since we may want to do the same change to ioeventfds,
>> which work the same way) - how would you feel if we make devices
>> register range(s) and do a rbtree lookup instead of a linear search?
>>
>
> It makes sense. In fact your change is a good first step - so far it
> was impossible to to a clever search since the seaching code was not
> aware of the ranges (and could not be, since the single coalesced mmio
> device had multiple ranges).
>
> Rather than an rbtree, I suggest putting all ranges in an array and
> sorting it, then using binary search.
Another improvement - unfortunately less transparent for user space -
would be to overcome the single ring buffer that forces us to hold a
central lock in user space while processing the entries. We rather need
per-device rings. While waiting for coalesced VGA MMIO being processed,
way too many kittens are killed.
I have this on our agenda, but I wouldn't be disappointed as well if
someone else is faster.
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 17:14 ` Jan Kiszka
@ 2011-07-19 17:17 ` Avi Kivity
2011-07-19 17:23 ` Jan Kiszka
0 siblings, 1 reply; 18+ messages in thread
From: Avi Kivity @ 2011-07-19 17:17 UTC (permalink / raw)
To: Jan Kiszka; +Cc: Sasha Levin, kvm, Marcelo Tosatti
On 07/19/2011 08:14 PM, Jan Kiszka wrote:
>
> Another improvement - unfortunately less transparent for user space -
> would be to overcome the single ring buffer that forces us to hold a
> central lock in user space while processing the entries. We rather need
> per-device rings. While waiting for coalesced VGA MMIO being processed,
> way too many kittens are killed.
>
> I have this on our agenda, but I wouldn't be disappointed as well if
> someone else is faster.
The socket mmio would have accomplished this as well. One thing to
beware of is to preserve correctness:
1) write to 0xa0000 (queued)
2) write to 0xa0002 (queued)
3) remap 0xa0000 region (executed)
4) write to 0xa0004 (queued)
5) drain queue
writes 1 and 2 go to the wrong place.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 17:17 ` Avi Kivity
@ 2011-07-19 17:23 ` Jan Kiszka
2011-07-19 19:28 ` Sasha Levin
2011-07-20 8:24 ` Avi Kivity
0 siblings, 2 replies; 18+ messages in thread
From: Jan Kiszka @ 2011-07-19 17:23 UTC (permalink / raw)
To: Avi Kivity; +Cc: Sasha Levin, kvm@vger.kernel.org, Marcelo Tosatti
On 2011-07-19 19:17, Avi Kivity wrote:
> On 07/19/2011 08:14 PM, Jan Kiszka wrote:
>>
>> Another improvement - unfortunately less transparent for user space -
>> would be to overcome the single ring buffer that forces us to hold a
>> central lock in user space while processing the entries. We rather need
>> per-device rings. While waiting for coalesced VGA MMIO being processed,
>> way too many kittens are killed.
>>
>> I have this on our agenda, but I wouldn't be disappointed as well if
>> someone else is faster.
>
> The socket mmio would have accomplished this as well.
I haven't followed the outcome in all details - is that approach dead
due to its complexity?
> One thing to
> beware of is to preserve correctness:
>
> 1) write to 0xa0000 (queued)
> 2) write to 0xa0002 (queued)
> 3) remap 0xa0000 region (executed)
Obviously, there must be 3a) here: drain all affected queues.
> 4) write to 0xa000 (queued)
> 5) drain queue
>
> writes 1 and 2 go to the wrong place.
>
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 17:23 ` Jan Kiszka
@ 2011-07-19 19:28 ` Sasha Levin
2011-07-19 20:00 ` Jan Kiszka
2011-07-20 8:24 ` Avi Kivity
1 sibling, 1 reply; 18+ messages in thread
From: Sasha Levin @ 2011-07-19 19:28 UTC (permalink / raw)
To: Jan Kiszka; +Cc: Avi Kivity, kvm@vger.kernel.org, Marcelo Tosatti
On Tue, Jul 19, 2011 at 8:23 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
> On 2011-07-19 19:17, Avi Kivity wrote:
>> On 07/19/2011 08:14 PM, Jan Kiszka wrote:
>>>
>>> Another improvement - unfortunately less transparent for user space -
>>> would be to overcome the single ring buffer that forces us to hold a
>>> central lock in user space while processing the entries. We rather need
>>> per-device rings. While waiting for coalesced VGA MMIO being processed,
>>> way too many kittens are killed.
>>>
>>> I have this on our agenda, but I wouldn't be disappointed as well if
>>> someone else is faster.
>>
>> The socket mmio would have accomplished this as well.
It's possible to process the coalesced mmio ring without waiting for
an exit, no? Is the performance that bad?
I would have thought it's reasonable after seeing how well
virtio-blk/virtio-net behave.
>
> I haven't followed the outcome in all details - is that approach dead
> due to its complexity?
The ability to allow reverse direction in the socket ('read' support
instead of just write) is making it a bit complex. Hopefully it'll be
ready soon.
>
>> One thing to
>> beware of is to preserve correctness:
>>
>> 1) write to 0xa0000 (queued)
>> 2) write to 0xa0002 (queued)
>> 3) remap 0xa0000 region (executed)
>
> Obviously, there must be 3a) here: drain all affected queues.
>
>> 4) write to 0xa000 (queued)
>> 5) drain queue
>>
>> writes 1 and 2 go to the wrong place.
>>
>
> Jan
>
> --
> Siemens AG, Corporate Technology, CT T DE IT 1
> Corporate Competence Center Embedded Linux
>
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 19:28 ` Sasha Levin
@ 2011-07-19 20:00 ` Jan Kiszka
0 siblings, 0 replies; 18+ messages in thread
From: Jan Kiszka @ 2011-07-19 20:00 UTC (permalink / raw)
To: Sasha Levin; +Cc: Avi Kivity, kvm@vger.kernel.org, Marcelo Tosatti
[-- Attachment #1: Type: text/plain, Size: 1777 bytes --]
On 2011-07-19 21:28, Sasha Levin wrote:
> On Tue, Jul 19, 2011 at 8:23 PM, Jan Kiszka <jan.kiszka@siemens.com> wrote:
>> On 2011-07-19 19:17, Avi Kivity wrote:
>>> On 07/19/2011 08:14 PM, Jan Kiszka wrote:
>>>>
>>>> Another improvement - unfortunately less transparent for user space -
>>>> would be to overcome the single ring buffer that forces us to hold a
>>>> central lock in user space while processing the entries. We rather need
>>>> per-device rings. While waiting for coalesced VGA MMIO being processed,
>>>> way too many kittens are killed.
>>>>
>>>> I have this on our agenda, but I wouldn't be disappointed as well if
>>>> someone else is faster.
>>>
>>> The socket mmio would have accomplished this as well.
>
> It's possible to process the coalesced mmio ring without waiting for
> an exit, no? Is the performance that bad?
You do not necessarily have to wait for an exit, but you obviously have
to synchronize all readers, including vcpus issuing some serializing
access. By the time you have >1 and VGA or other heavy users are among
them, the rest suffers from increased latencies (you easily get beyond
hundreds of ms if graphic is involved). That might be ok for ordinary
workload, but it's unacceptable for real-time guests.
>
> I would have thought it's reasonable after seeing how well
> virtio-blk/virtio-net behave.
>
>>
>> I haven't followed the outcome in all details - is that approach dead
>> due to its complexity?
>
> The ability to allow reverse direction in the socket ('read' support
> instead of just write) is making it a bit complex. Hopefully it'll be
> ready soon.
Great, will have an eye on it and take care of the QEMU integration -
unless you would like to do that as well... :)
Jan
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 262 bytes --]
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-19 17:23 ` Jan Kiszka
2011-07-19 19:28 ` Sasha Levin
@ 2011-07-20 8:24 ` Avi Kivity
2011-07-20 8:43 ` Jan Kiszka
1 sibling, 1 reply; 18+ messages in thread
From: Avi Kivity @ 2011-07-20 8:24 UTC (permalink / raw)
To: Jan Kiszka; +Cc: Sasha Levin, kvm@vger.kernel.org, Marcelo Tosatti
On 07/19/2011 08:23 PM, Jan Kiszka wrote:
> On 2011-07-19 19:17, Avi Kivity wrote:
> > On 07/19/2011 08:14 PM, Jan Kiszka wrote:
> >>
> >> Another improvement - unfortunately less transparent for user space -
> >> would be to overcome the single ring buffer that forces us to hold a
> >> central lock in user space while processing the entries. We rather need
> >> per-device rings. While waiting for coalesced VGA MMIO being processed,
> >> way too many kittens are killed.
> >>
> >> I have this on our agenda, but I wouldn't be disappointed as well if
> >> someone else is faster.
> >
> > The socket mmio would have accomplished this as well.
>
> I haven't followed the outcome in all details - is that approach dead
> due to its complexity?
>
> > One thing to
> > beware of is to preserve correctness:
> >
> > 1) write to 0xa0000 (queued)
> > 2) write to 0xa0002 (queued)
> > 3) remap 0xa0000 region (executed)
>
> Obviously, there must be 3a) here: drain all affected queues.
How do you implement this 3a, if your consumers are outside the main
process? I guess you could have an additional synchronize API (for
in-kernel consumers) or RPC (for external process consumers), but then
this is no longer a simple API.
> > 4) write to 0xa000 (queued)
> > 5) drain queue
> >
> > writes 1 and 2 go to the wrong place.
> >
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-20 8:24 ` Avi Kivity
@ 2011-07-20 8:43 ` Jan Kiszka
2011-07-20 8:52 ` Avi Kivity
0 siblings, 1 reply; 18+ messages in thread
From: Jan Kiszka @ 2011-07-20 8:43 UTC (permalink / raw)
To: Avi Kivity; +Cc: Sasha Levin, kvm@vger.kernel.org, Marcelo Tosatti
On 2011-07-20 10:24, Avi Kivity wrote:
> On 07/19/2011 08:23 PM, Jan Kiszka wrote:
>> On 2011-07-19 19:17, Avi Kivity wrote:
>>> On 07/19/2011 08:14 PM, Jan Kiszka wrote:
>>>>
>>>> Another improvement - unfortunately less transparent for user space -
>>>> would be to overcome the single ring buffer that forces us to hold a
>>>> central lock in user space while processing the entries. We rather need
>>>> per-device rings. While waiting for coalesced VGA MMIO being processed,
>>>> way too many kittens are killed.
>>>>
>>>> I have this on our agenda, but I wouldn't be disappointed as well if
>>>> someone else is faster.
>>>
>>> The socket mmio would have accomplished this as well.
>>
>> I haven't followed the outcome in all details - is that approach dead
>> due to its complexity?
>>
>>> One thing to
>>> beware of is to preserve correctness:
>>>
>>> 1) write to 0xa0000 (queued)
>>> 2) write to 0xa0002 (queued)
>>> 3) remap 0xa0000 region (executed)
>>
>> Obviously, there must be 3a) here: drain all affected queues.
>
> How do you implement this 3a, if your consumers are outside the main
> process? I guess you could have an additional synchonize API (for
> in-kernel consumers) or RPC (for external process consumers), but then
> this is no longer a simple API.
I'm not planning to leave the hypervisor process for now, not to speak
of in-kernel models. Already for many other reasons, a synchronization
API between a hypothetical decoupled device model and the core will be
quite complex. The first step is to get it scalable using a single process.
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-20 8:43 ` Jan Kiszka
@ 2011-07-20 8:52 ` Avi Kivity
2011-07-20 8:55 ` Jan Kiszka
0 siblings, 1 reply; 18+ messages in thread
From: Avi Kivity @ 2011-07-20 8:52 UTC (permalink / raw)
To: Jan Kiszka; +Cc: Sasha Levin, kvm@vger.kernel.org, Marcelo Tosatti
On 07/20/2011 11:43 AM, Jan Kiszka wrote:
> >
> > How do you implement this 3a, if your consumers are outside the main
> > process? I guess you could have an additional synchronize API (for
> > in-kernel consumers) or RPC (for external process consumers), but then
> > this is no longer a simple API.
>
> I'm not planning to leave the hypervisor process for now, not to speak
> of in-kernel models. Already for many other reasons, a synchronization
> API between a hypothetical decoupled device model and the core will be
> quite complex. The first step is to get it scalable using a single process.
If we design a new kvm API, we must look a little more into the future
than satisfying immediate needs.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-20 8:52 ` Avi Kivity
@ 2011-07-20 8:55 ` Jan Kiszka
2011-07-20 8:58 ` Avi Kivity
0 siblings, 1 reply; 18+ messages in thread
From: Jan Kiszka @ 2011-07-20 8:55 UTC (permalink / raw)
To: Avi Kivity; +Cc: Sasha Levin, kvm@vger.kernel.org, Marcelo Tosatti
On 2011-07-20 10:52, Avi Kivity wrote:
> On 07/20/2011 11:43 AM, Jan Kiszka wrote:
>>>
>>> How do you implement this 3a, if your consumers are outside the main
>>> process? I guess you could have an additional synchronize API (for
>>> in-kernel consumers) or RPC (for external process consumers), but then
>>> this is no longer a simple API.
>>
>> I'm not planning to leave the hypervisor process for now, not to speak
>> of in-kernel models. Already for many other reasons, a synchronization
>> API between a hypothetical decoupled device model and the core will be
>> quite complex. The first step is to get it scalable using a single process.
>
> If we design a new kvm API, we must look a little more into the future
> than satisfying immediate needs.
That's true, but synchronization requirements are widely unrelated to
this API and will have to be satisfied generically anyway.
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 18+ messages in thread
* Re: [PATCH] MMIO: Make coalesced mmio use a device per zone
2011-07-20 8:55 ` Jan Kiszka
@ 2011-07-20 8:58 ` Avi Kivity
0 siblings, 0 replies; 18+ messages in thread
From: Avi Kivity @ 2011-07-20 8:58 UTC (permalink / raw)
To: Jan Kiszka; +Cc: Sasha Levin, kvm@vger.kernel.org, Marcelo Tosatti
On 07/20/2011 11:55 AM, Jan Kiszka wrote:
> On 2011-07-20 10:52, Avi Kivity wrote:
> > On 07/20/2011 11:43 AM, Jan Kiszka wrote:
> >>>
> >>> How do you implement this 3a, if your consumers are outside the main
> >>> process? I guess you could have an additional synchronize API (for
> >>> in-kernel consumers) or RPC (for external process consumers), but then
> >>> this is no longer a simple API.
> >>
> >> I'm not planning to leave the hypervisor process for now, not to speak
> >> of in-kernel models. Already for many other reasons, a synchronization
> >> API between a hypothetical decoupled device model and the core will be
> >> quite complex. The first step is to get it scalable using a single process.
> >
> > If we design a new kvm API, we must look a little more into the future
> > than satisfying immediate needs.
>
> That's true, but synchronization requirements are widely unrelated to
> this API and will have to be satisfied generically anyway.
I'm trying to see if they can be satisfied at all, for the more esoteric
consumers, and if so, whether it can be done in a palatable and performant way.
--
error compiling committee.c: too many arguments to function
^ permalink raw reply [flat|nested] 18+ messages in thread
end of thread, other threads:[~2011-07-20 8:58 UTC | newest]
Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-07-19 8:10 [PATCH] MMIO: Make coalesced mmio use a device per zone Sasha Levin
2011-07-19 8:48 ` Avi Kivity
2011-07-19 9:53 ` Sasha Levin
2011-07-19 9:59 ` Avi Kivity
2011-07-19 10:17 ` Sasha Levin
2011-07-19 10:32 ` Avi Kivity
2011-07-19 11:22 ` Sasha Levin
2011-07-19 11:45 ` Sasha Levin
2011-07-19 17:14 ` Jan Kiszka
2011-07-19 17:17 ` Avi Kivity
2011-07-19 17:23 ` Jan Kiszka
2011-07-19 19:28 ` Sasha Levin
2011-07-19 20:00 ` Jan Kiszka
2011-07-20 8:24 ` Avi Kivity
2011-07-20 8:43 ` Jan Kiszka
2011-07-20 8:52 ` Avi Kivity
2011-07-20 8:55 ` Jan Kiszka
2011-07-20 8:58 ` Avi Kivity
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox