Linux virtualization list
 help / color / mirror / Atom feed
* [PULL] virtio pci fix
From: Rusty Russell @ 2011-11-14  0:48 UTC (permalink / raw)
  To: Linus Torvalds, lkml - Kernel Mailing List
  Cc: Amit Shah, Michael S. Tsirkin, virtualization

 * [new tag]         rusty@rustcorp.com.au-v3.2-rc1-1-g72103bd -> rusty@rustcorp.com.au-v3.2-rc1-1-g72103bd
The following changes since commit 1ea6b8f48918282bdca0b32a34095504ee65bab5:

  Linux 3.2-rc1 (2011-11-07 16:16:02 -0800)

are available in the git repository at:
  git://github.com/rustyrussell/linux.git master

Michael S. Tsirkin (1):
      virtio-pci: fix use after free

 drivers/virtio/virtio_pci.c |   11 ++++++-----
 1 files changed, 6 insertions(+), 5 deletions(-)

^ permalink raw reply

* Re: [RFC] kvm tools: Implement multiple VQ for virtio-net
From: Asias He @ 2011-11-14  2:04 UTC (permalink / raw)
  To: Sasha Levin
  Cc: Krishna Kumar, gorcunov, kvm, Michael S. Tsirkin, netdev,
	virtualization, penberg, mingo
In-Reply-To: <1321196430.2425.2.camel@sasha>

Hi, Shsha

On 11/13/2011 11:00 PM, Sasha Levin wrote:
> On Sun, 2011-11-13 at 12:24 +0200, Michael S. Tsirkin wrote:
>> On Sat, Nov 12, 2011 at 12:12:01AM +0200, Sasha Levin wrote:
>>> This is a patch based on Krishna Kumar's patch series which implements
>>> multiple VQ support for virtio-net.
>>>
>>> The patch was tested with ver3 of the patch.
>>>
>>> Cc: Krishna Kumar<krkumar2@in.ibm.com>
>>> Cc: Michael S. Tsirkin<mst@redhat.com>
>>> Cc: Rusty Russell<rusty@rustcorp.com.au>
>>> Cc: virtualization@lists.linux-foundation.org
>>> Cc: netdev@vger.kernel.org
>>> Signed-off-by: Sasha Levin<levinsasha928@gmail.com>
>>
>> Any performance numbers?
>
> I tried finding a box with more than two cores so I could test it on
> something like that as well.
>
>> From what I see this patch causes a performance regression on my 2 core
> box.
>
> I'll send an updated KVM tools patch in a bit as well.
>
> Before:
>
> # netperf -H 192.168.33.4,ipv4 -t TCP_RR
> MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
> to 192.168.33.4 (192.168.33.4) port 0 AF_INET : first burst 0
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       10.00    11160.63
> 16384  87380
>
> # netperf -H 192.168.33.4,ipv4 -t UDP_RR
> MIGRATED UDP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
> to 192.168.33.4 (192.168.33.4) port 0 AF_INET : first burst 0
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 122880 122880 1        1       10.00    12072.64
> 229376 229376
>
> # netperf -H 192.168.33.4,ipv4 -t TCP_STREAM
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> 192.168.33.4 (192.168.33.4) port 0 AF_INET
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
>
>   87380  16384  16384    10.00    4654.50
>
> netperf -H 192.168.33.4,ipv4 -t TCP_STREAM -- -m 128
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> 192.168.33.4 (192.168.33.4) port 0 AF_INET
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
>
>   87380  16384    128    10.00     635.45
>
> # netperf -H 192.168.33.4,ipv4 -t UDP_STREAM
> MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> 192.168.33.4 (192.168.33.4) port 0 AF_INET
> Socket  Message  Elapsed      Messages
> Size    Size     Time         Okay Errors   Throughput
> bytes   bytes    secs            #      #   10^6bits/sec
>
> 122880   65507   10.00      113894      0    5968.54
> 229376           10.00       89373           4683.54
>
> # netperf -H 192.168.33.4,ipv4 -t UDP_STREAM -- -m 128
> MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> 192.168.33.4 (192.168.33.4) port 0 AF_INET
> Socket  Message  Elapsed      Messages
> Size    Size     Time         Okay Errors   Throughput
> bytes   bytes    secs            #      #   10^6bits/sec
>
> 122880     128   10.00      550634      0      56.38
> 229376           10.00      398786             40.84
>
>
> After:
>
> # netperf -H 192.168.33.4,ipv4 -t TCP_RR
> MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
> to 192.168.33.4 (192.168.33.4) port 0 AF_INET : first burst 0
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 16384  87380  1        1       10.00    8952.47
> 16384  87380
>
> # netperf -H 192.168.33.4,ipv4 -t UDP_RR
> MIGRATED UDP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
> to 192.168.33.4 (192.168.33.4) port 0 AF_INET : first burst 0
> Local /Remote
> Socket Size   Request  Resp.   Elapsed  Trans.
> Send   Recv   Size     Size    Time     Rate
> bytes  Bytes  bytes    bytes   secs.    per sec
>
> 122880 122880 1        1       10.00    9534.52
> 229376 229376
>
> # netperf -H 192.168.33.4,ipv4 -t TCP_STREAM
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> 192.168.33.4 (192.168.33.4) port 0 AF_INET
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
>
>   87380  16384  16384    10.13    2278.23
>
> # netperf -H 192.168.33.4,ipv4 -t TCP_STREAM -- -m 128
> MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> 192.168.33.4 (192.168.33.4) port 0 AF_INET
> Recv   Send    Send
> Socket Socket  Message  Elapsed
> Size   Size    Size     Time     Throughput
> bytes  bytes   bytes    secs.    10^6bits/sec
>
>   87380  16384    128    10.00     623.27
>
> # netperf -H 192.168.33.4,ipv4 -t UDP_STREAM
> MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> 192.168.33.4 (192.168.33.4) port 0 AF_INET
> Socket  Message  Elapsed      Messages
> Size    Size     Time         Okay Errors   Throughput
> bytes   bytes    secs            #      #   10^6bits/sec
>
> 122880   65507   10.00      136930      0    7175.72
> 229376           10.00       16726            876.51
>
> # netperf -H 192.168.33.4,ipv4 -t UDP_STREAM -- -m 128
> MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> 192.168.33.4 (192.168.33.4) port 0 AF_INET
> Socket  Message  Elapsed      Messages
> Size    Size     Time         Okay Errors   Throughput
> bytes   bytes    secs            #      #   10^6bits/sec
>
> 122880     128   10.00      982492      0     100.61
> 229376           10.00      249597             25.56
>

Why both the bandwidth and latency performance are dropping so 
dramatically with multiple VQ?

-- 
Asias He

^ permalink raw reply

* RE: [PATCH 1/1] Staging: hv: Move the mouse driver out of staging
From: KY Srinivasan @ 2011-11-14  2:40 UTC (permalink / raw)
  To: Jiri Kosina, Dmitry Torokhov
  Cc: gregkh@suse.de, linux-kernel@vger.kernel.org,
	devel@linuxdriverproject.org, virtualization@lists.osdl.org,
	ohering@suse.com, joe@perches.com
In-Reply-To: <alpine.LNX.2.00.1111132100000.15187@pobox.suse.cz>



> -----Original Message-----
> From: Jiri Kosina [mailto:jkosina@suse.cz]
> Sent: Sunday, November 13, 2011 3:01 PM
> To: Dmitry Torokhov
> Cc: KY Srinivasan; gregkh@suse.de; linux-kernel@vger.kernel.org;
> devel@linuxdriverproject.org; virtualization@lists.osdl.org; ohering@suse.com;
> joe@perches.com
> Subject: Re: [PATCH 1/1] Staging: hv: Move the mouse driver out of staging
> 
> On Sun, 6 Nov 2011, Dmitry Torokhov wrote:
> 
> > > I am not a hid expert; but all  hid low level drivers appear to do this.
> > > Initially, I was directly invoking hid_connect() directly and based on your
> > > Input, I chose to use hid_hw_start() which all other drivers are using.
> >
> > Note that the users of hid_hw_start() actually are not low level
> > drivers, such as usbhid or bluetooth hidp, but higher-level drivers,
> > such as hid-wacom, hid-a4tech, etc. Since your driver is a low-level
> > driver (a provider so to speak) it should not call hid_hw_start() on its
> > own but rather wait for the hid code to do it.
> >
> > Still, I am not a HID expert either so I'll defer to Jiri here.
> 
> Hi,
> 
> my understanding is that hv driver is actually a bit special in this
> respect -- it's actually all-in-one both low-level and high-level driver.
> I take it that there is not ever going to be a different high-level driver
> using low-level hv transport (is that correct, KY?), so this might indeed
> be an acceptable layout of the driver.

You are right Jiri. While this is a low level transport driver, there will never be a another
higher level driver using this transport.

Regards,

K. Y

^ permalink raw reply

* RE: [PATCH 1/1] Staging: hv: Move the mouse driver out of staging
From: KY Srinivasan @ 2011-11-14  2:42 UTC (permalink / raw)
  To: Jiri Kosina
  Cc: Dmitry Torokhov, gregkh@suse.de, linux-kernel@vger.kernel.org,
	devel@linuxdriverproject.org, virtualization@lists.osdl.org,
	ohering@suse.com, joe@perches.com
In-Reply-To: <alpine.LNX.2.00.1111132102070.15187@pobox.suse.cz>



> -----Original Message-----
> From: Jiri Kosina [mailto:jkosina@suse.cz]
> Sent: Sunday, November 13, 2011 3:03 PM
> To: KY Srinivasan
> Cc: Dmitry Torokhov; gregkh@suse.de; linux-kernel@vger.kernel.org;
> devel@linuxdriverproject.org; virtualization@lists.osdl.org; ohering@suse.com;
> joe@perches.com
> Subject: RE: [PATCH 1/1] Staging: hv: Move the mouse driver out of staging
> 
> On Wed, 9 Nov 2011, KY Srinivasan wrote:
> 
> > If it is ok with you, I will send the patch out with all the other
> > cleanup that you had suggested.
> 
> Yes please, Dmitry has done an excellent review here (as usual). So I will
> not be merging the latest patch you have sent just now and will wait for
> respin with Dmitry's comments incorporated.

I did send this patch out (with Dmitry's comments incorporated.
If you have not received them, I could re-send the patch. Let me know.

Regards,

K. Y

^ permalink raw reply

* RE: [PATCH 1/1] Staging: hv: Move the mouse driver out of staging
From: KY Srinivasan @ 2011-11-14  2:45 UTC (permalink / raw)
  To: Dmitry Torokhov, Jiri Kosina
  Cc: gregkh@suse.de, linux-kernel@vger.kernel.org,
	devel@linuxdriverproject.org, virtualization@lists.osdl.org,
	ohering@suse.com, joe@perches.com
In-Reply-To: <20111114004745.GA3666@core.coreip.homeip.net>



> -----Original Message-----
> From: Dmitry Torokhov [mailto:dmitry.torokhov@gmail.com]
> Sent: Sunday, November 13, 2011 7:48 PM
> To: Jiri Kosina
> Cc: KY Srinivasan; gregkh@suse.de; linux-kernel@vger.kernel.org;
> devel@linuxdriverproject.org; virtualization@lists.osdl.org; ohering@suse.com;
> joe@perches.com
> Subject: Re: [PATCH 1/1] Staging: hv: Move the mouse driver out of staging
> 
> On Sun, Nov 13, 2011 at 09:01:28PM +0100, Jiri Kosina wrote:
> > On Sun, 6 Nov 2011, Dmitry Torokhov wrote:
> >
> > > > I am not a hid expert; but all  hid low level drivers appear to do this.
> > > > Initially, I was directly invoking hid_connect() directly and based on your
> > > > Input, I chose to use hid_hw_start() which all other drivers are using.
> > >
> > > Note that the users of hid_hw_start() actually are not low level
> > > drivers, such as usbhid or bluetooth hidp, but higher-level drivers,
> > > such as hid-wacom, hid-a4tech, etc. Since your driver is a low-level
> > > driver (a provider so to speak) it should not call hid_hw_start() on its
> > > own but rather wait for the hid code to do it.
> > >
> > > Still, I am not a HID expert either so I'll defer to Jiri here.
> >
> > Hi,
> >
> > my understanding is that hv driver is actually a bit special in this
> > respect -- it's actually all-in-one both low-level and high-level driver.
> > I take it that there is not ever going to be a different high-level driver
> > using low-level hv transport (is that correct, KY?), so this might indeed
> > be an acceptable layout of the driver.
> >
> 
> I actually do not see anything of a high-level driver in hv-mouse. It is
> a pure transport driver that channels everything through hid-input...

The split architecture (in my opinion) makes sense if there are multiple consumers
of that transport. In this case though, there is not going to be any other drivers
using this transport. As you have noted, there really is not any code in this driver that
would qualify as being the high level driver. 

Regards,

K. Y

^ permalink raw reply

* Re: [PATCH 5 of 5] virtio: expose added descriptors immediately
From: Michael S. Tsirkin @ 2011-11-14  6:56 UTC (permalink / raw)
  To: Rusty Russell; +Cc: Christoph Hellwig, linux-kernel, kvm, virtualization
In-Reply-To: <20111113210256.GA31621@redhat.com>

On Sun, Nov 13, 2011 at 11:03:13PM +0200, Michael S. Tsirkin wrote:
> On Thu, Nov 03, 2011 at 06:12:53PM +1030, Rusty Russell wrote:
> > A virtio driver does virtqueue_add_buf() multiple times before finally
> > calling virtqueue_kick(); previously we only exposed the added buffers
> > in the virtqueue_kick() call.  This means we don't need a memory
> > barrier in virtqueue_add_buf(), but it reduces concurrency as the
> > device (ie. host) can't see the buffers until the kick.
> > 
> > Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> 
> In the past I played with a patch like this, but I didn't see a
> performance gain either way. Do you see any gain?
> 
> I'm a bit concerned that with this patch, a buggy driver that
> adds more than 2^16 descriptors without a kick
> would seem to work sometimes. Let's add WARN_ON(vq->num_added > (1 << 16))?

Thinking about it more - it might be tricky for drivers
to ensure this. add used to fail when vq is full, but now
driver might do get between add and notify:
	lock
	add_buf * N
	prep
	unlock
	lock
	get_buf * N
	unlock
	lock
	add_buf
	prep
	unlock
	notify

and since add was followed by get, this doesn't fail.

So the right thing to do I think is to either ignore indexes and assume
a kick is needed, something like:
if vq->num_added >= (1 << 15))
	needs_kick = true
(note: maybe it's 1<<16, and maybe >, but 1<<15 is plenty anyway)

Or alternatively, fail add when num_added is too large.

> > ---
> >  drivers/virtio/virtio_ring.c |   37 ++++++++++++++++++++++---------------
> >  1 file changed, 22 insertions(+), 15 deletions(-)
> > 
> > diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
> > --- a/drivers/virtio/virtio_ring.c
> > +++ b/drivers/virtio/virtio_ring.c
> > @@ -227,9 +227,15 @@ add_head:
> >  
> >  	/* Put entry in available array (but don't update avail->idx until they
> >  	 * do sync). */
> > -	avail = ((vq->vring.avail->idx + vq->num_added++) & (vq->vring.num-1));
> > +	avail = (vq->vring.avail->idx & (vq->vring.num-1));
> >  	vq->vring.avail->ring[avail] = head;
> >  
> > +	/* Descriptors and available array need to be set before we expose the
> > +	 * new available array entries. */
> > +	virtio_wmb();
> > +	vq->vring.avail->idx++;
> > +	vq->num_added++;
> > +
> >  	pr_debug("Added buffer head %i to %p\n", head, vq);
> >  	END_USE(vq);
> >  
> > @@ -248,13 +254,10 @@ bool virtqueue_kick_prepare(struct virtq
> >  	 * new available array entries. */
> >  	virtio_wmb();
> >  
> > -	old = vq->vring.avail->idx;
> > -	new = vq->vring.avail->idx = old + vq->num_added;
> > +	old = vq->vring.avail->idx - vq->num_added;
> > +	new = vq->vring.avail->idx;
> >  	vq->num_added = 0;
> >  
> > -	/* Need to update avail index before checking if we should notify */
> > -	virtio_mb();
> > -
> >  	if (vq->event) {
> >  		needs_kick = vring_need_event(vring_avail_event(&vq->vring),
> >  					      new, old);
> > 
> > 
> > _______________________________________________
> > Virtualization mailing list
> > Virtualization@lists.linux-foundation.org
> > https://lists.linuxfoundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: [PATCHv2 RFC] virtio-spec: flexible configuration layout
From: Michael S. Tsirkin @ 2011-11-14  6:59 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Krishna Kumar, kvm, Pawel Moll, Wang Sheng-Hui,
	Alexey Kardashevskiy, lkml - Kernel Mailing List, virtualization,
	penberg, Christian Borntraeger, Sasha Levin, Amit Shah, avi
In-Reply-To: <20111113151427.GA16749@redhat.com>

On Sun, Nov 13, 2011 at 05:14:27PM +0200, Michael S. Tsirkin wrote:
> On Fri, Nov 11, 2011 at 02:54:31PM +1030, Rusty Russell wrote:
> > On Wed, 09 Nov 2011 22:57:28 +0200, Sasha Levin <levinsasha928@gmail.com> wrote:
> > > On Wed, 2011-11-09 at 22:52 +0200, Michael S. Tsirkin wrote:
> > > > On Wed, Nov 09, 2011 at 10:24:47PM +0200, Sasha Levin wrote:
> > > > > It'll be a bit harder deprecating it in the future.
> > > > 
> > > > Harder than ... what ?
> > > 
> > > Harder than allowing devices not to present it at all if new layout
> > > config is used. Right now the simple implementation is to use MMIO for
> > > config and device specific, and let it fallback to legacy for ISR and
> > > notifications (and therefore, this is probably how everybody will
> > > implement it), which means that when you do want to deprecate legacy,
> > > there will be extra work to be done then, instead of doing it now.
> > 
> > Indeed, I'd like to see two changes to your proposal:
> > 
> > (1) It should be all or nothing.  If a driver can find the virtio header
> >     capability, it should only use the capabilties.  Otherwise, it
> >     should fall back to legacy.
> 
> Okay, but going forward, if we add more capabilities, we probably won't
> want to require them and fail to load if not there.  That's really why I
> wanted to make the failover ignore any capability separately - to make
> this future proof.  I'm not terribly fixated on this, it just seemed a
> bit more symmetrical to treat all capabilities in the same way. Hmm?
> 
> >     Your draft suggests a mix is possible;
> >     I prefer a clean failure (ie. one day don't present a BAR 0 *at
> >     all*, so ancient drivers just fail to load.).
> 
> Just to clarify, as written in draft this is possible with the current
> spec proposal.  So I'm guessing there's some other motivation that you
> had in mind?
> 
> > (2) There's no huge win in keeping the same layout.  Let's make some
> >     cleanups.
> 
> About this last point - what cleanups do you have in mind?  Just move
> some registers around?  I guess we could put feature bits near each
> other, and move device status towards the end to avoid wasting 3 bytes.
> The win seems minimal, but the change does make legacy hypervisor
> support in guests more cumbersome, as we need to spread coditional code
> around instead of localizing it in the initialization path.
> 
> >    There are more users ahead of us then behind us (I
> >     hope!).
> 
> In that case isn't it safe to assume we'll find some uses
> for the reserved registers?
> 
> > But I think this is the right direction!
> > 
> > Thanks,
> > Rusty.

Note: waiting on response to the above before I tweak the
spec again.

-- 
MST

^ permalink raw reply

* Re: [RFC] kvm tools: Implement multiple VQ for virtio-net
From: Sasha Levin @ 2011-11-14 10:15 UTC (permalink / raw)
  To: Asias He
  Cc: Krishna Kumar, gorcunov, kvm, Michael S. Tsirkin, netdev,
	virtualization, penberg, mingo
In-Reply-To: <4EC07729.3050303@gmail.com>

On Mon, 2011-11-14 at 10:04 +0800, Asias He wrote:
> Hi, Shsha
> 
> On 11/13/2011 11:00 PM, Sasha Levin wrote:
> > On Sun, 2011-11-13 at 12:24 +0200, Michael S. Tsirkin wrote:
> >> On Sat, Nov 12, 2011 at 12:12:01AM +0200, Sasha Levin wrote:
> >>> This is a patch based on Krishna Kumar's patch series which implements
> >>> multiple VQ support for virtio-net.
> >>>
> >>> The patch was tested with ver3 of the patch.
> >>>
> >>> Cc: Krishna Kumar<krkumar2@in.ibm.com>
> >>> Cc: Michael S. Tsirkin<mst@redhat.com>
> >>> Cc: Rusty Russell<rusty@rustcorp.com.au>
> >>> Cc: virtualization@lists.linux-foundation.org
> >>> Cc: netdev@vger.kernel.org
> >>> Signed-off-by: Sasha Levin<levinsasha928@gmail.com>
> >>
> >> Any performance numbers?
> >
> > I tried finding a box with more than two cores so I could test it on
> > something like that as well.
> >
> >> From what I see this patch causes a performance regression on my 2 core
> > box.
> >
> > I'll send an updated KVM tools patch in a bit as well.
> >
> > Before:
> >
> > # netperf -H 192.168.33.4,ipv4 -t TCP_RR
> > MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
> > to 192.168.33.4 (192.168.33.4) port 0 AF_INET : first burst 0
> > Local /Remote
> > Socket Size   Request  Resp.   Elapsed  Trans.
> > Send   Recv   Size     Size    Time     Rate
> > bytes  Bytes  bytes    bytes   secs.    per sec
> >
> > 16384  87380  1        1       10.00    11160.63
> > 16384  87380
> >
> > # netperf -H 192.168.33.4,ipv4 -t UDP_RR
> > MIGRATED UDP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
> > to 192.168.33.4 (192.168.33.4) port 0 AF_INET : first burst 0
> > Local /Remote
> > Socket Size   Request  Resp.   Elapsed  Trans.
> > Send   Recv   Size     Size    Time     Rate
> > bytes  Bytes  bytes    bytes   secs.    per sec
> >
> > 122880 122880 1        1       10.00    12072.64
> > 229376 229376
> >
> > # netperf -H 192.168.33.4,ipv4 -t TCP_STREAM
> > MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> > 192.168.33.4 (192.168.33.4) port 0 AF_INET
> > Recv   Send    Send
> > Socket Socket  Message  Elapsed
> > Size   Size    Size     Time     Throughput
> > bytes  bytes   bytes    secs.    10^6bits/sec
> >
> >   87380  16384  16384    10.00    4654.50
> >
> > netperf -H 192.168.33.4,ipv4 -t TCP_STREAM -- -m 128
> > MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> > 192.168.33.4 (192.168.33.4) port 0 AF_INET
> > Recv   Send    Send
> > Socket Socket  Message  Elapsed
> > Size   Size    Size     Time     Throughput
> > bytes  bytes   bytes    secs.    10^6bits/sec
> >
> >   87380  16384    128    10.00     635.45
> >
> > # netperf -H 192.168.33.4,ipv4 -t UDP_STREAM
> > MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> > 192.168.33.4 (192.168.33.4) port 0 AF_INET
> > Socket  Message  Elapsed      Messages
> > Size    Size     Time         Okay Errors   Throughput
> > bytes   bytes    secs            #      #   10^6bits/sec
> >
> > 122880   65507   10.00      113894      0    5968.54
> > 229376           10.00       89373           4683.54
> >
> > # netperf -H 192.168.33.4,ipv4 -t UDP_STREAM -- -m 128
> > MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> > 192.168.33.4 (192.168.33.4) port 0 AF_INET
> > Socket  Message  Elapsed      Messages
> > Size    Size     Time         Okay Errors   Throughput
> > bytes   bytes    secs            #      #   10^6bits/sec
> >
> > 122880     128   10.00      550634      0      56.38
> > 229376           10.00      398786             40.84
> >
> >
> > After:
> >
> > # netperf -H 192.168.33.4,ipv4 -t TCP_RR
> > MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
> > to 192.168.33.4 (192.168.33.4) port 0 AF_INET : first burst 0
> > Local /Remote
> > Socket Size   Request  Resp.   Elapsed  Trans.
> > Send   Recv   Size     Size    Time     Rate
> > bytes  Bytes  bytes    bytes   secs.    per sec
> >
> > 16384  87380  1        1       10.00    8952.47
> > 16384  87380
> >
> > # netperf -H 192.168.33.4,ipv4 -t UDP_RR
> > MIGRATED UDP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
> > to 192.168.33.4 (192.168.33.4) port 0 AF_INET : first burst 0
> > Local /Remote
> > Socket Size   Request  Resp.   Elapsed  Trans.
> > Send   Recv   Size     Size    Time     Rate
> > bytes  Bytes  bytes    bytes   secs.    per sec
> >
> > 122880 122880 1        1       10.00    9534.52
> > 229376 229376
> >
> > # netperf -H 192.168.33.4,ipv4 -t TCP_STREAM
> > MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> > 192.168.33.4 (192.168.33.4) port 0 AF_INET
> > Recv   Send    Send
> > Socket Socket  Message  Elapsed
> > Size   Size    Size     Time     Throughput
> > bytes  bytes   bytes    secs.    10^6bits/sec
> >
> >   87380  16384  16384    10.13    2278.23
> >
> > # netperf -H 192.168.33.4,ipv4 -t TCP_STREAM -- -m 128
> > MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> > 192.168.33.4 (192.168.33.4) port 0 AF_INET
> > Recv   Send    Send
> > Socket Socket  Message  Elapsed
> > Size   Size    Size     Time     Throughput
> > bytes  bytes   bytes    secs.    10^6bits/sec
> >
> >   87380  16384    128    10.00     623.27
> >
> > # netperf -H 192.168.33.4,ipv4 -t UDP_STREAM
> > MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> > 192.168.33.4 (192.168.33.4) port 0 AF_INET
> > Socket  Message  Elapsed      Messages
> > Size    Size     Time         Okay Errors   Throughput
> > bytes   bytes    secs            #      #   10^6bits/sec
> >
> > 122880   65507   10.00      136930      0    7175.72
> > 229376           10.00       16726            876.51
> >
> > # netperf -H 192.168.33.4,ipv4 -t UDP_STREAM -- -m 128
> > MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to
> > 192.168.33.4 (192.168.33.4) port 0 AF_INET
> > Socket  Message  Elapsed      Messages
> > Size    Size     Time         Okay Errors   Throughput
> > bytes   bytes    secs            #      #   10^6bits/sec
> >
> > 122880     128   10.00      982492      0     100.61
> > 229376           10.00      249597             25.56
> >
> 
> Why both the bandwidth and latency performance are dropping so 
> dramatically with multiple VQ?

It looks like theres no hash sync between host and guest, which makes
the RX VQ change for every packet. This is my guess.

-- 

Sasha.

^ permalink raw reply

* Re: [RFC] kvm tools: Implement multiple VQ for virtio-net
From: Pekka Enberg @ 2011-11-14 12:25 UTC (permalink / raw)
  To: Asias He
  Cc: Krishna Kumar, kvm, Michael S. Tsirkin, netdev, virtualization,
	gorcunov, Sasha Levin, mingo
In-Reply-To: <4EC07729.3050303@gmail.com>

On Mon, Nov 14, 2011 at 4:04 AM, Asias He <asias.hejun@gmail.com> wrote:
> Why both the bandwidth and latency performance are dropping so dramatically
> with multiple VQ?

What's the expected benefit from multiple VQs i.e. why are doing the
patches Sasha?

^ permalink raw reply

* Re: [RFC] kvm tools: Implement multiple VQ for virtio-net
From: Michael S. Tsirkin @ 2011-11-14 13:05 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Krishna Kumar, kvm, Asias He, virtualization, gorcunov,
	Sasha Levin, netdev, mingo
In-Reply-To: <CAOJsxLE4yRO+5XncrDEzVh5-RGXG+=HGTAmQ=bX=KMsDqf=feA@mail.gmail.com>

On Mon, Nov 14, 2011 at 02:25:17PM +0200, Pekka Enberg wrote:
> On Mon, Nov 14, 2011 at 4:04 AM, Asias He <asias.hejun@gmail.com> wrote:
> > Why both the bandwidth and latency performance are dropping so dramatically
> > with multiple VQ?
> 
> What's the expected benefit from multiple VQs

Heh, the original patchset didn't mention this :) It really should.
They are supposed to speed up networking for high smp guests.

> i.e. why are doing the patches Sasha?

-- 
MST

^ permalink raw reply

* Re: [PATCH RFC] ndo: ndo_queue_xmit/ndo_flush_xmit (was Re: [RFC] [ver3 PATCH 0/6] Implement multiqueue virtio-net)
From: Michael S. Tsirkin @ 2011-11-14 16:21 UTC (permalink / raw)
  To: Krishna Kumar; +Cc: netdev, davem, kvm, virtualization
In-Reply-To: <20111113174827.GA23310@redhat.com>

On Sun, Nov 13, 2011 at 07:48:28PM +0200, Michael S. Tsirkin wrote:
> @@ -863,6 +872,9 @@ struct net_device_ops {
>  	int			(*ndo_stop)(struct net_device *dev);
>  	netdev_tx_t		(*ndo_start_xmit) (struct sk_buff *skb,
>  						   struct net_device *dev);
> +	netdev_tx_t		(*ndo_queue_xmit)(struct sk_buff *skb,
> +						  struct net_device *dev);
> +	void			(*ndo_flush_xmit)(struct net_device *dev);
>  	u16			(*ndo_select_queue)(struct net_device *dev,
>  						    struct sk_buff *skb);
>  	void			(*ndo_change_rx_flags)(struct net_device *dev,
> @@ -2099,6 +2111,10 @@ extern int		dev_set_mac_address(struct net_device *,

An alternative I considered was to add a boolean flag to ndo_start_xmit
'bool queue' or something like this, plus ndo_flush_xmit.
This will lead to cleaner code I think but will require all drivers
to be changed, so for a proof of concept I decided to go for one
that is less work.

Let me know what looks more palatable ... 

-- 
MST

^ permalink raw reply

* [PATCHv2 RFC] virtio-pci: flexible configuration layout
From: Michael S. Tsirkin @ 2011-11-14 18:18 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Krishna Kumar, kvm, Pawel Moll, Wang Sheng-Hui,
	Alexey Kardashevskiy, lkml - Kernel Mailing List, Jesse Barnes,
	virtualization, Christian Borntraeger, Sasha Levin, Amit Shah,
	Linus Torvalds

Add a flexible mechanism to specify virtio configuration layout, using
pci vendor-specific capability.  A separate capability is used for each
of common, device specific and data-path accesses.

Warning: compiled only.
This patch also needs to be split up, pci_iomap changes
also need arch updates for non-x86.
There might also be more spec changes.

Posting here for early feedback, and to allow Sasha to
proceed with his "kvm tool" work.

Changes from v1:
Updated to match v3 of the spec, see:
	Subject: [PATCHv3 RFC] virtio-spec: flexible configuration layout
	Message-ID: <20111110122436.GA13144@redhat.com>
	In-Reply-To: <20111109195901.GA28155@redhat.com>

In particular:
- split ISR out
- reorg capability
- add offset alignment

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_pci.c |  184 +++++++++++++++++++++++++++++++++++++++---
 include/asm-generic/io.h    |    4 +
 include/asm-generic/iomap.h |   11 +++
 include/linux/pci_regs.h    |    4 +
 include/linux/virtio_pci.h  |   41 ++++++++++
 lib/iomap.c                 |   40 ++++++++-
 6 files changed, 265 insertions(+), 19 deletions(-)

diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index ecb9254..bdd3876 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -37,8 +37,12 @@ struct virtio_pci_device
 	struct virtio_device vdev;
 	struct pci_dev *pci_dev;
 
-	/* the IO mapping for the PCI config space */
+	/* the IO address for the common PCI config space */
 	void __iomem *ioaddr;
+	/* the IO address for device specific config */
+	void __iomem *ioaddr_device;
+	/* the IO address to use for notifications operations */
+	void __iomem *ioaddr_notify;
 
 	/* a list of queues so we can dispatch IRQs */
 	spinlock_t lock;
@@ -57,8 +61,158 @@ struct virtio_pci_device
 	unsigned msix_used_vectors;
 	/* Whether we have vector per vq */
 	bool per_vq_vectors;
+
+	/* Various IO mappings: used for resource tracking only. */
+
+	/* Legacy BAR0: typically PIO. */
+	void __iomem *legacy_map;
+
+	/* Mappings specified by device capabilities: typically in MMIO */
+	void __iomem *isr_map;
+	void __iomem *notify_map;
+	void __iomem *common_map;
+	void __iomem *device_map;
 };
 
+/*
+ * With PIO, device-specific config moves as MSI-X is enabled/disabled.
+ * Use this accessor to keep pointer to that config in sync.
+ */
+static void virtio_pci_set_msix_enabled(struct virtio_pci_device *vp_dev, int enabled)
+{
+	vp_dev->msix_enabled = enabled;
+	if (vp_dev->device_map)
+		vp_dev->ioaddr_device = vp_dev->device_map;
+	else
+		vp_dev->ioaddr_device = vp_dev->legacy_map +
+			VIRTIO_PCI_CONFIG(vp_dev);
+}
+
+static void __iomem *virtio_pci_map_cfg(struct virtio_pci_device *vp_dev, u8 cap_id,
+					u32 align)
+{
+        u32 size;
+        u32 offset;
+        u8 bir;
+        u8 cap_len;
+	int pos;
+	struct pci_dev *dev = vp_dev->pci_dev;
+	void __iomem *p;
+
+	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+	     pos > 0;
+	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
+		u8 id;
+		pci_read_config_byte(dev, pos + PCI_VNDR_CAP_LEN, &cap_len);
+		if (cap_len < VIRTIO_PCI_CAP_ID + 1)
+			continue;
+		pci_read_config_byte(dev, pos + VIRTIO_PCI_CAP_ID, &id);
+		id >>= VIRTIO_PCI_CAP_ID_SHIFT;
+		id &= VIRTIO_PCI_CAP_ID_MASK;
+		if (id == cap_id)
+			break;
+	}
+
+	if (pos <= 0)
+		return NULL;
+
+	if (cap_len < VIRTIO_PCI_CAP_CFG_BIR + 1)
+		goto err;
+        pci_read_config_byte(dev, pos + VIRTIO_PCI_CAP_CFG_BIR, &bir);
+	if (cap_len < VIRTIO_PCI_CAP_CFG_OFF + 4)
+		goto err;
+        pci_read_config_dword(dev, pos + VIRTIO_PCI_CAP_CFG_OFF, &offset);
+	if (cap_len < VIRTIO_PCI_CAP_CFG_SIZE + 4)
+		goto err;
+        pci_read_config_dword(dev, pos + VIRTIO_PCI_CAP_CFG_SIZE, &size);
+        bir >>= VIRTIO_PCI_CAP_CFG_BIR_SHIFT;
+        bir &= VIRTIO_PCI_CAP_CFG_BIR_MASK;
+        size >>= VIRTIO_PCI_CAP_CFG_SIZE_SHIFT;
+        size &= VIRTIO_PCI_CAP_CFG_SIZE_MASK;
+        off >>= VIRTIO_PCI_CAP_CFG_OFF_SHIFT;
+        off &= VIRTIO_PCI_CAP_CFG_OFF_MASK;
+	/* Align offset appropriately */
+	off &= ~(align - 1);
+
+	/* It's possible for a device to supply a huge config space,
+	 * but we'll never need to map more than a page ATM. */
+	p = pci_iomap_range(dev, bir, offset, size, PAGE_SIZE);
+	if (!p)
+		dev_err(&vp_dev->vdev.dev, "Unable to map virtio pci memory");
+	return p;
+err:
+	dev_err(&vp_dev->vdev.dev, "Unable to parse virtio pci capability");
+	return NULL;
+}
+
+static void virtio_pci_iounmap(struct virtio_pci_device *vp_dev)
+{
+	if (vp_dev->legacy_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->legacy_map);
+	if (vp_dev->isr_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->isr_map);
+	if (vp_dev->notify_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->notify_map);
+	if (vp_dev->common_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->common_map);
+	if (vp_dev->device_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->device_map);
+}
+
+static int virtio_pci_iomap(struct virtio_pci_device *vp_dev)
+{
+	vp_dev->isr_map = virtio_pci_map_cfg(vp_dev,
+					     VIRTIO_PCI_CAP_ISR_CFG,
+					     sizeof(u8));
+	vp_dev->notify_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_NOTIFY_CFG,
+						sizeof(u16));
+	vp_dev->common_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_COMMON_CFG,
+						sizeof(u32));
+	vp_dev->device_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_DEVICE_CFG,
+						sizeof(u8));
+
+	if (!vp_dev->notify_map || !vp_dev->common_map ||
+	    !vp_dev->device_map) {
+		/*
+		 * If not all capabilities present, map legacy PIO.
+		 * Legacy access is at BAR 0. We never need to map
+		 * more than 256 bytes there, since legacy config space
+		 * used PIO which has this size limit.
+		 * */
+		vp_dev->legacy_map = pci_iomap(vp_dev->pci_dev, 0, 256);
+		if (!vp_dev->legacy_map) {
+			dev_err(&vp_dev->vdev.dev, "Unable to map legacy PIO");
+			goto err;
+		}
+	}
+
+	/* Prefer MMIO if available. If not, fallback to legacy PIO. */
+	if (vp_dev->common_map)
+		vp_dev->ioaddr = vp_dev->common_map;
+	else
+		vp_dev->ioaddr = vp_dev->legacy_map;
+
+	if (vp_dev->device_map)
+		vp_dev->ioaddr_device = vp_dev->device_map;
+	else
+		vp_dev->ioaddr_device = vp_dev->legacy_map +
+			VIRTIO_PCI_CONFIG(vp_dev);
+
+	if (vp_dev->notify_map)
+		vp_dev->ioaddr_notify = vp_dev->notify_map;
+	else
+		vp_dev->ioaddr_notify = vp_dev->legacy_map +
+			VIRTIO_PCI_QUEUE_NOTIFY;
+
+	return 0;
+err:
+	virtio_pci_iounmap(vp_dev);
+	return -EINVAL;
+}
+
 /* Constants for MSI-X */
 /* Use first vector for configuration changes, second and the rest for
  * virtqueues Thus, we need at least 2 vectors for MSI. */
@@ -130,8 +284,7 @@ static void vp_get(struct virtio_device *vdev, unsigned offset,
 		   void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
-				VIRTIO_PCI_CONFIG(vp_dev) + offset;
+	void __iomem *ioaddr = vp_dev->ioaddr_device + offset;
 	u8 *ptr = buf;
 	int i;
 
@@ -145,8 +298,7 @@ static void vp_set(struct virtio_device *vdev, unsigned offset,
 		   const void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
-				VIRTIO_PCI_CONFIG(vp_dev) + offset;
+	void __iomem *ioaddr = vp_dev->ioaddr_device + offset;
 	const u8 *ptr = buf;
 	int i;
 
@@ -184,7 +336,7 @@ static void vp_notify(struct virtqueue *vq)
 
 	/* we write the queue's selector into the notification register to
 	 * signal the other end */
-	iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+	iowrite16(info->queue_index, vp_dev->ioaddr_notify);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -231,7 +383,8 @@ static irqreturn_t vp_interrupt(int irq, void *opaque)
 
 	/* reading the ISR has the effect of also clearing it so it's very
 	 * important to save off the value. */
-	isr = ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+	isr = ioread8(vp_dev->ioaddr_notify +
+		      VIRTIO_PCI_ISR - VIRTIO_PCI_QUEUE_NOTIFY);
 
 	/* It's definitely not us if the ISR was not high */
 	if (!isr)
@@ -265,7 +418,7 @@ static void vp_free_vectors(struct virtio_device *vdev)
 		ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
 
 		pci_disable_msix(vp_dev->pci_dev);
-		vp_dev->msix_enabled = 0;
+                virtio_pci_set_msix_enabled(vp_dev, 0);
 		vp_dev->msix_vectors = 0;
 	}
 
@@ -303,7 +456,7 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 	if (err)
 		goto error;
 	vp_dev->msix_vectors = nvectors;
-	vp_dev->msix_enabled = 1;
+        virtio_pci_set_msix_enabled(vp_dev, 1);
 
 	/* Set the vector used for configuration */
 	v = vp_dev->msix_used_vectors;
@@ -447,7 +600,10 @@ static void vp_del_vq(struct virtqueue *vq)
 		iowrite16(VIRTIO_MSI_NO_VECTOR,
 			  vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
 		/* Flush the write out to device */
-		ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+		ioread8(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+		/* And clear ISR: TODO: really needed? */
+		ioread8(vp_dev->ioaddr_notify +
+		      VIRTIO_PCI_ISR - VIRTIO_PCI_QUEUE_NOTIFY);
 	}
 
 	vring_del_virtqueue(vq);
@@ -638,8 +794,8 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
 	if (err)
 		goto out_enable_device;
 
-	vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
-	if (vp_dev->ioaddr == NULL)
+	err = virtio_pci_iomap(vp_dev);
+	if (err)
 		goto out_req_regions;
 
 	pci_set_drvdata(pci_dev, vp_dev);
@@ -661,7 +817,7 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
 
 out_set_drvdata:
 	pci_set_drvdata(pci_dev, NULL);
-	pci_iounmap(pci_dev, vp_dev->ioaddr);
+	virtio_pci_iounmap(vp_dev);
 out_req_regions:
 	pci_release_regions(pci_dev);
 out_enable_device:
@@ -679,7 +835,7 @@ static void __devexit virtio_pci_remove(struct pci_dev *pci_dev)
 
 	vp_del_vqs(&vp_dev->vdev);
 	pci_set_drvdata(pci_dev, NULL);
-	pci_iounmap(pci_dev, vp_dev->ioaddr);
+	virtio_pci_iounmap(vp_dev);
 	pci_release_regions(pci_dev);
 	pci_disable_device(pci_dev);
 	kfree(vp_dev);
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 9120887..3cf1787 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -286,6 +286,10 @@ static inline void writesb(const void __iomem *addr, const void *buf, int len)
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                     unsigned offset,
+                                     unsigned long minlen,
+                                     unsigned long maxlen);
 static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
 {
 }
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 98dcd76..6f192d4 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -70,8 +70,19 @@ extern void ioport_unmap(void __iomem *);
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                     unsigned offset,
+                                     unsigned long minlen,
+                                     unsigned long maxlen);
 extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
 #else
+static inline void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                            unsigned offset,
+                                            unsigned long minlen,
+                                            unsigned long maxlen)
+{
+	return NULL;
+}
 struct pci_dev;
 static inline void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
 {
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index e884096..2ec9f81 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -375,6 +375,10 @@
 #define  PCI_X_STATUS_266MHZ	0x40000000	/* 266 MHz capable */
 #define  PCI_X_STATUS_533MHZ	0x80000000	/* 533 MHz capable */
 
+/* Vendor specific capability */
+#define PCI_VNDR_CAP_LEN	2	/* Capability length (8 bits), including
+					   bytes: ID, NEXT and LEN itself. */
+
 /* PCI Bridge Subsystem ID registers */
 
 #define PCI_SSVID_VENDOR_ID     4	/* PCI-Bridge subsystem vendor id register */
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
index ea66f3f..d6568e7 100644
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -92,4 +92,45 @@
 /* The alignment to use between consumer and producer parts of vring.
  * x86 pagesize again. */
 #define VIRTIO_PCI_VRING_ALIGN		4096
+
+/*
+ * Layout for Virtio PCI vendor specific capability (little-endian):
+ * 5 bit virtio capability id.
+ * 3 bit BAR index register, specifying which BAR to use.
+ * 4 byte cfg offset within the BAR.
+ * 4 byte cfg size.
+ */
+
+/* A single virtio device has multiple vendor specific capabilities, we use the
+ * 5 bit ID field to distinguish between these. */
+#define VIRTIO_PCI_CAP_ID		3
+#define VIRTIO_PCI_CAP_ID_MASK		0xff
+#define VIRTIO_PCI_CAP_ID_SHIFT		0
+
+/* IDs for different capabilities. If a specific configuration
+ * is missing, legacy PIO path is used. */
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR access */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific confiuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+
+/* Index of the BAR including this configuration */
+#define VIRTIO_PCI_CAP_CFG_BIR		4
+#define VIRTIO_PCI_CAP_CFG_BIR_MASK	(0x7)
+#define VIRTIO_PCI_CAP_CFG_BIR_SHIFT	0
+
+/* Size of the configuration space */
+#define VIRTIO_PCI_CAP_CFG_SIZE		4
+#define VIRTIO_PCI_CAP_CFG_SIZE_MASK	0xffffff
+#define VIRTIO_PCI_CAP_CFG_SIZE_SHIFT	8
+
+/* Offset within the BAR */
+#define VIRTIO_PCI_CAP_CFG_OFF		8
+#define VIRTIO_PCI_CAP_CFG_OFF_MASK	0xffffffff
+#define VIRTIO_PCI_CAP_CFG_OFF_SHIFT	0
+
 #endif
diff --git a/lib/iomap.c b/lib/iomap.c
index 5dbcb4b..f28720e 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -243,26 +243,36 @@ EXPORT_SYMBOL(ioport_unmap);
 
 #ifdef CONFIG_PCI
 /**
- * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * pci_iomap_range - create a virtual mapping cookie for a PCI BAR
  * @dev: PCI device that owns the BAR
  * @bar: BAR number
- * @maxlen: length of the memory to map
+ * @offset: map memory at the given offset in BAR
+ * @minlen: min length of the memory to map
+ * @maxlen: max length of the memory to map
  *
  * Using this function you will get a __iomem address to your device BAR.
  * You can access it using ioread*() and iowrite*(). These functions hide
  * the details if this is a MMIO or PIO address space and will just do what
  * you expect from them in the correct way.
  *
+ * @minlen specifies the minimum length to map. We check that BAR is
+ * large enough.
  * @maxlen specifies the maximum length to map. If you want to get access to
- * the complete BAR without checking for its length first, pass %0 here.
+ * the complete BAR from offset to the end, pass %0 here.
  * */
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                              unsigned offset,
+                              unsigned long minlen,
+                              unsigned long maxlen)
 {
 	resource_size_t start = pci_resource_start(dev, bar);
 	resource_size_t len = pci_resource_len(dev, bar);
 	unsigned long flags = pci_resource_flags(dev, bar);
 
-	if (!len || !start)
+	if (len <= offset || !start)
+		return NULL;
+        len -= offset;
+        if (len < minlen)
 		return NULL;
 	if (maxlen && len > maxlen)
 		len = maxlen;
@@ -277,10 +287,30 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
 	return NULL;
 }
 
+/**
+ * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * @dev: PCI device that owns the BAR
+ * @bar: BAR number
+ * @maxlen: length of the memory to map
+ *
+ * Using this function you will get a __iomem address to your device BAR.
+ * You can access it using ioread*() and iowrite*(). These functions hide
+ * the details if this is a MMIO or PIO address space and will just do what
+ * you expect from them in the correct way.
+ *
+ * @maxlen specifies the maximum length to map. If you want to get access to
+ * the complete BAR without checking for its length first, pass %0 here.
+ * */
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+    return pci_iomap_range(dev, bar, 0, 0, maxlen);
+}
+
 void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
 {
 	IO_COND(addr, /* nothing */, iounmap(addr));
 }
 EXPORT_SYMBOL(pci_iomap);
+EXPORT_SYMBOL(pci_iomap_range);
 EXPORT_SYMBOL(pci_iounmap);
 #endif /* CONFIG_PCI */
-- 
1.7.5.53.gc233e

^ permalink raw reply related

* [RFC PATCH net-next] enable virtio_net to return bus_info in ethtool -i consistent with emulated NICs
From: Rick Jones @ 2011-11-14 21:52 UTC (permalink / raw)
  To: netdev, Rusty Russell, Michael Tsirkin, virtualization

From: Rick Jones <rick.jones2@hp.com>

Add a new .bus_name to virtio_config_ops then modify virtio_net to 
call through to it in an ethtool .get_drvinfo routine to report
bus_info in ethtool -i output which is consistent with other
emulated NICs and the output of lspci.  

Signed-off-by: Rick Jones <rick.jones2@hp.com>

---

The changes to drivers/lguest/lguest_device.c, drivers/s390/kvm/kvm_virtio.c,
and drivers/virtio/virtio_mmio.c code inspected only, not compiled.

raj@raj-ubuntu-guest:~$ ethtool -i eth0
driver: virtio_net
version: 1.0.0
firmware-version: 
bus-info: 0000:00:03.0
raj@raj-ubuntu-guest:~$ lspci | grep Ether
00:03.0 Ethernet controller: Red Hat, Inc Virtio network device

 drivers/lguest/lguest_device.c |    6 ++++++
 drivers/net/virtio_net.c       |   15 +++++++++++++++
 drivers/s390/kvm/kvm_virtio.c  |    6 ++++++
 drivers/virtio/virtio_mmio.c   |    6 ++++++
 drivers/virtio/virtio_pci.c    |    8 ++++++++
 include/linux/virtio_config.h  |   14 ++++++++++++++
 6 files changed, 55 insertions(+), 0 deletions(-)


diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 0dc30ff..3724d45 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -381,6 +381,11 @@ error:
 	return PTR_ERR(vqs[i]);
 }
 
+static const char *lg_bus_name(struct virtio_device *vdev)
+{
+	return "Not Implemented";
+}
+
 /* The ops structure which hooks everything together. */
 static struct virtio_config_ops lguest_config_ops = {
 	.get_features = lg_get_features,
@@ -392,6 +397,7 @@ static struct virtio_config_ops lguest_config_ops = {
 	.reset = lg_reset,
 	.find_vqs = lg_find_vqs,
 	.del_vqs = lg_del_vqs,
+	.bus_name = lg_bus_name,
 };
 
 /*
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6ee8410..4dc9d84 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -39,6 +39,7 @@ module_param(gso, bool, 0444);
 #define GOOD_COPY_LEN	128
 
 #define VIRTNET_SEND_COMMAND_SG_MAX    2
+#define VIRTNET_DRIVER_VERSION "1.0.0"
 
 struct virtnet_stats {
 	struct u64_stats_sync syncp;
@@ -889,7 +890,21 @@ static void virtnet_get_ringparam(struct net_device *dev,
 
 }
 
+
+static void virtnet_get_drvinfo(struct net_device *dev,
+				struct ethtool_drvinfo *info)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct virtio_device *vdev = vi->vdev;
+
+	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
+	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
+	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
+
+}
+
 static const struct ethtool_ops virtnet_ethtool_ops = {
+	.get_drvinfo = virtnet_get_drvinfo,
 	.get_link = ethtool_op_get_link,
 	.get_ringparam = virtnet_get_ringparam,
 };
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index 94f49ff..725d90e 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -263,6 +263,11 @@ error:
 	return PTR_ERR(vqs[i]);
 }
 
+static const char *kvm_bus_name(struct virtio_device *vdev)
+{
+	return "Not Implemented";
+}
+
 /*
  * The config ops structure as defined by virtio config
  */
@@ -276,6 +281,7 @@ static struct virtio_config_ops kvm_vq_configspace_ops = {
 	.reset = kvm_reset,
 	.find_vqs = kvm_find_vqs,
 	.del_vqs = kvm_del_vqs,
+	.bus_name = kvm_bus_name,
 };
 
 /*
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index acc5e43..2f57380 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -361,7 +361,12 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 	return 0;
 }
 
+static const char *vm_bus_name(struct virtio_device *vdev)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
 
+	return vm_dev->pdev->name;
+}
 
 static struct virtio_config_ops virtio_mmio_config_ops = {
 	.get		= vm_get,
@@ -373,6 +378,7 @@ static struct virtio_config_ops virtio_mmio_config_ops = {
 	.del_vqs	= vm_del_vqs,
 	.get_features	= vm_get_features,
 	.finalize_features = vm_finalize_features,
+	.bus_name	= vm_bus_name,
 };
 
 
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 79a31e5..764ec05 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -580,6 +580,13 @@ static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 				  false, false);
 }
 
+static const char *vp_bus_name(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+	return pci_name(vp_dev->pci_dev);
+}
+
 static struct virtio_config_ops virtio_pci_config_ops = {
 	.get		= vp_get,
 	.set		= vp_set,
@@ -590,6 +597,7 @@ static struct virtio_config_ops virtio_pci_config_ops = {
 	.del_vqs	= vp_del_vqs,
 	.get_features	= vp_get_features,
 	.finalize_features = vp_finalize_features,
+	.bus_name	= vp_bus_name,
 };
 
 static void virtio_pci_release_dev(struct device *_d)
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index add4790..63f98d0 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -100,6 +100,10 @@
  *	vdev: the virtio_device
  *	This gives the final feature bits for the device: it can change
  *	the dev->feature bits if it wants.
+ * @bus_name: return the bus name associated with the device
+ *	vdev: the virtio_device
+ *      This returns a pointer to the bus name a la pci_name from which
+ *      the caller can then copy.
  */
 typedef void vq_callback_t(struct virtqueue *);
 struct virtio_config_ops {
@@ -117,6 +121,7 @@ struct virtio_config_ops {
 	void (*del_vqs)(struct virtio_device *);
 	u32 (*get_features)(struct virtio_device *vdev);
 	void (*finalize_features)(struct virtio_device *vdev);
+	const char *(*bus_name)(struct virtio_device *vdev);
 };
 
 /* If driver didn't advertise the feature, it will never appear. */
@@ -182,5 +187,14 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
 		return ERR_PTR(err);
 	return vq;
 }
+
+static inline
+const char *virtio_bus_name(struct virtio_device *vdev)
+{
+	if (!vdev->config->bus_name)
+		return "virtio";
+	return vdev->config->bus_name(vdev);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_VIRTIO_CONFIG_H */

^ permalink raw reply related

* Re: [RFC PATCH net-next] enable virtio_net to return bus_info in ethtool -i consistent with emulated NICs
From: Ben Hutchings @ 2011-11-14 22:30 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev, virtualization, Michael Tsirkin
In-Reply-To: <20111114215241.5B8BF2900307@tardy>

On Mon, 2011-11-14 at 13:52 -0800, Rick Jones wrote:
> From: Rick Jones <rick.jones2@hp.com>
> 
> Add a new .bus_name to virtio_config_ops then modify virtio_net to 
> call through to it in an ethtool .get_drvinfo routine to report
> bus_info in ethtool -i output which is consistent with other
> emulated NICs and the output of lspci.  
[...]
> diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
> index 0dc30ff..3724d45 100644
> --- a/drivers/lguest/lguest_device.c
> +++ b/drivers/lguest/lguest_device.c
> @@ -381,6 +381,11 @@ error:
>  	return PTR_ERR(vqs[i]);
>  }
>  
> +static const char *lg_bus_name(struct virtio_device *vdev)
> +{
> +	return "Not Implemented";
> +}
[...]
> +static const char *kvm_bus_name(struct virtio_device *vdev)
> +{
> +	return "Not Implemented";
> +}
[...]

Please use the existing 'not implemented' value, which is the empty
string.   If you think ethtool should print some helpful message instead
of an empty string, please submit a patch for ethtool.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [RFC PATCH net-next] enable virtio_net to return bus_info in ethtool -i consistent with emulated NICs
From: Rick Jones @ 2011-11-14 22:40 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, Rick Jones, virtualization, Michael Tsirkin
In-Reply-To: <1321309800.2827.22.camel@bwh-desktop>

On 11/14/2011 02:30 PM, Ben Hutchings wrote:
> On Mon, 2011-11-14 at 13:52 -0800, Rick Jones wrote:
>> From: Rick Jones<rick.jones2@hp.com>
>>
>> Add a new .bus_name to virtio_config_ops then modify virtio_net to
>> call through to it in an ethtool .get_drvinfo routine to report
>> bus_info in ethtool -i output which is consistent with other
>> emulated NICs and the output of lspci.
> [...]
>> diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
>> index 0dc30ff..3724d45 100644
>> --- a/drivers/lguest/lguest_device.c
>> +++ b/drivers/lguest/lguest_device.c
>> @@ -381,6 +381,11 @@ error:
>>   	return PTR_ERR(vqs[i]);
>>   }
>>
>> +static const char *lg_bus_name(struct virtio_device *vdev)
>> +{
>> +	return "Not Implemented";
>> +}
> [...]
>> +static const char *kvm_bus_name(struct virtio_device *vdev)
>> +{
>> +	return "Not Implemented";
>> +}
> [...]
>
> Please use the existing 'not implemented' value, which is the empty
> string.

Will do.

thanks,

rick

^ permalink raw reply

* [RFC 0/5] virtio-pci, kvm tools: Support new virtio-pci spec in kvm tools
From: Sasha Levin @ 2011-11-14 23:43 UTC (permalink / raw)
  To: linux-kernel; +Cc: kvm, mst, virtualization, penberg, Sasha Levin, mingo
In-Reply-To: <20111114181854.GA24953@redhat.com>

This patch series contains two fixes for the RFC patch send by Michael. These
patches are pretty basic and should probably be merged into the next version
of that patch.

Other two patches add support to the new virtio-pci spec in KVM tools, allowing
for easy testing of any future virtio-pci kernel side patches. The usermode code
isn't complete, but it's working properly and should be enough for functionality
testing.

Michael S. Tsirkin (1):
  virtio-pci: flexible configuration layout

Sasha Levin (4):
  virtio-pci: Fix compilation issue
  iomap: Don't ignore offset
  kvm tools: Free up the MSI-X PBA BAR
  kvm tools: Support new virtio-pci configuration layout

 drivers/virtio/virtio_pci.c        |  184 ++++++++++++++++++++++--
 include/asm-generic/io.h           |    4 +
 include/asm-generic/iomap.h        |   11 ++
 include/linux/pci_regs.h           |    4 +
 include/linux/virtio_pci.h         |   41 ++++++
 lib/iomap.c                        |   41 +++++-
 tools/kvm/include/kvm/pci.h        |   13 ++-
 tools/kvm/include/kvm/virtio-pci.h |    5 +-
 tools/kvm/virtio/pci.c             |  275 ++++++++++++++++++++++++++++++++----
 9 files changed, 530 insertions(+), 48 deletions(-)

-- 
1.7.8.rc1

^ permalink raw reply

* [RFC 1/5] virtio-pci: flexible configuration layout
From: Sasha Levin @ 2011-11-14 23:43 UTC (permalink / raw)
  To: linux-kernel; +Cc: kvm, mst, virtualization, penberg, mingo
In-Reply-To: <1321314197-5265-1-git-send-email-levinsasha928@gmail.com>

From: "Michael S. Tsirkin" <mst@redhat.com>

Add a flexible mechanism to specify virtio configuration layout, using
pci vendor-specific capability.  A separate capability is used for each
of common, device specific and data-path accesses.

Warning: compiled only.
This patch also needs to be split up, pci_iomap changes
also need arch updates for non-x86.
There might also be more spec changes.

Posting here for early feedback, and to allow Sasha to
proceed with his "kvm tool" work.

Changes from v1:
Updated to match v3 of the spec, see:
	Subject: [PATCHv3 RFC] virtio-spec: flexible configuration layout
	Message-ID: <20111110122436.GA13144@redhat.com>
	In-Reply-To: <20111109195901.GA28155@redhat.com>

In particular:
- split ISR out
- reorg capability
- add offset alignment

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
---
 drivers/virtio/virtio_pci.c |  184 +++++++++++++++++++++++++++++++++++++++---
 include/asm-generic/io.h    |    4 +
 include/asm-generic/iomap.h |   11 +++
 include/linux/pci_regs.h    |    4 +
 include/linux/virtio_pci.h  |   41 ++++++++++
 lib/iomap.c                 |   40 ++++++++-
 6 files changed, 265 insertions(+), 19 deletions(-)

diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 3d1bf41..7625434 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -37,8 +37,12 @@ struct virtio_pci_device
 	struct virtio_device vdev;
 	struct pci_dev *pci_dev;
 
-	/* the IO mapping for the PCI config space */
+	/* the IO address for the common PCI config space */
 	void __iomem *ioaddr;
+	/* the IO address for device specific config */
+	void __iomem *ioaddr_device;
+	/* the IO address to use for notifications operations */
+	void __iomem *ioaddr_notify;
 
 	/* a list of queues so we can dispatch IRQs */
 	spinlock_t lock;
@@ -57,8 +61,158 @@ struct virtio_pci_device
 	unsigned msix_used_vectors;
 	/* Whether we have vector per vq */
 	bool per_vq_vectors;
+
+	/* Various IO mappings: used for resource tracking only. */
+
+	/* Legacy BAR0: typically PIO. */
+	void __iomem *legacy_map;
+
+	/* Mappings specified by device capabilities: typically in MMIO */
+	void __iomem *isr_map;
+	void __iomem *notify_map;
+	void __iomem *common_map;
+	void __iomem *device_map;
 };
 
+/*
+ * With PIO, device-specific config moves as MSI-X is enabled/disabled.
+ * Use this accessor to keep pointer to that config in sync.
+ */
+static void virtio_pci_set_msix_enabled(struct virtio_pci_device *vp_dev, int enabled)
+{
+	vp_dev->msix_enabled = enabled;
+	if (vp_dev->device_map)
+		vp_dev->ioaddr_device = vp_dev->device_map;
+	else
+		vp_dev->ioaddr_device = vp_dev->legacy_map +
+			VIRTIO_PCI_CONFIG(vp_dev);
+}
+
+static void __iomem *virtio_pci_map_cfg(struct virtio_pci_device *vp_dev, u8 cap_id,
+					u32 align)
+{
+        u32 size;
+        u32 offset;
+        u8 bir;
+        u8 cap_len;
+	int pos;
+	struct pci_dev *dev = vp_dev->pci_dev;
+	void __iomem *p;
+
+	for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR);
+	     pos > 0;
+	     pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) {
+		u8 id;
+		pci_read_config_byte(dev, pos + PCI_VNDR_CAP_LEN, &cap_len);
+		if (cap_len < VIRTIO_PCI_CAP_ID + 1)
+			continue;
+		pci_read_config_byte(dev, pos + VIRTIO_PCI_CAP_ID, &id);
+		id >>= VIRTIO_PCI_CAP_ID_SHIFT;
+		id &= VIRTIO_PCI_CAP_ID_MASK;
+		if (id == cap_id)
+			break;
+	}
+
+	if (pos <= 0)
+		return NULL;
+
+	if (cap_len < VIRTIO_PCI_CAP_CFG_BIR + 1)
+		goto err;
+        pci_read_config_byte(dev, pos + VIRTIO_PCI_CAP_CFG_BIR, &bir);
+	if (cap_len < VIRTIO_PCI_CAP_CFG_OFF + 4)
+		goto err;
+        pci_read_config_dword(dev, pos + VIRTIO_PCI_CAP_CFG_OFF, &offset);
+	if (cap_len < VIRTIO_PCI_CAP_CFG_SIZE + 4)
+		goto err;
+        pci_read_config_dword(dev, pos + VIRTIO_PCI_CAP_CFG_SIZE, &size);
+        bir >>= VIRTIO_PCI_CAP_CFG_BIR_SHIFT;
+        bir &= VIRTIO_PCI_CAP_CFG_BIR_MASK;
+        size >>= VIRTIO_PCI_CAP_CFG_SIZE_SHIFT;
+        size &= VIRTIO_PCI_CAP_CFG_SIZE_MASK;
+        off >>= VIRTIO_PCI_CAP_CFG_OFF_SHIFT;
+        off &= VIRTIO_PCI_CAP_CFG_OFF_MASK;
+	/* Align offset appropriately */
+	off &= ~(align - 1);
+
+	/* It's possible for a device to supply a huge config space,
+	 * but we'll never need to map more than a page ATM. */
+	p = pci_iomap_range(dev, bir, offset, size, PAGE_SIZE);
+	if (!p)
+		dev_err(&vp_dev->vdev.dev, "Unable to map virtio pci memory");
+	return p;
+err:
+	dev_err(&vp_dev->vdev.dev, "Unable to parse virtio pci capability");
+	return NULL;
+}
+
+static void virtio_pci_iounmap(struct virtio_pci_device *vp_dev)
+{
+	if (vp_dev->legacy_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->legacy_map);
+	if (vp_dev->isr_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->isr_map);
+	if (vp_dev->notify_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->notify_map);
+	if (vp_dev->common_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->common_map);
+	if (vp_dev->device_map)
+		pci_iounmap(vp_dev->pci_dev, vp_dev->device_map);
+}
+
+static int virtio_pci_iomap(struct virtio_pci_device *vp_dev)
+{
+	vp_dev->isr_map = virtio_pci_map_cfg(vp_dev,
+					     VIRTIO_PCI_CAP_ISR_CFG,
+					     sizeof(u8));
+	vp_dev->notify_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_NOTIFY_CFG,
+						sizeof(u16));
+	vp_dev->common_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_COMMON_CFG,
+						sizeof(u32));
+	vp_dev->device_map = virtio_pci_map_cfg(vp_dev,
+						VIRTIO_PCI_CAP_DEVICE_CFG,
+						sizeof(u8));
+
+	if (!vp_dev->notify_map || !vp_dev->common_map ||
+	    !vp_dev->device_map) {
+		/*
+		 * If not all capabilities present, map legacy PIO.
+		 * Legacy access is at BAR 0. We never need to map
+		 * more than 256 bytes there, since legacy config space
+		 * used PIO which has this size limit.
+		 * */
+		vp_dev->legacy_map = pci_iomap(vp_dev->pci_dev, 0, 256);
+		if (!vp_dev->legacy_map) {
+			dev_err(&vp_dev->vdev.dev, "Unable to map legacy PIO");
+			goto err;
+		}
+	}
+
+	/* Prefer MMIO if available. If not, fallback to legacy PIO. */
+	if (vp_dev->common_map)
+		vp_dev->ioaddr = vp_dev->common_map;
+	else
+		vp_dev->ioaddr = vp_dev->legacy_map;
+
+	if (vp_dev->device_map)
+		vp_dev->ioaddr_device = vp_dev->device_map;
+	else
+		vp_dev->ioaddr_device = vp_dev->legacy_map +
+			VIRTIO_PCI_CONFIG(vp_dev);
+
+	if (vp_dev->notify_map)
+		vp_dev->ioaddr_notify = vp_dev->notify_map;
+	else
+		vp_dev->ioaddr_notify = vp_dev->legacy_map +
+			VIRTIO_PCI_QUEUE_NOTIFY;
+
+	return 0;
+err:
+	virtio_pci_iounmap(vp_dev);
+	return -EINVAL;
+}
+
 /* Constants for MSI-X */
 /* Use first vector for configuration changes, second and the rest for
  * virtqueues Thus, we need at least 2 vectors for MSI. */
@@ -130,8 +284,7 @@ static void vp_get(struct virtio_device *vdev, unsigned offset,
 		   void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
-				VIRTIO_PCI_CONFIG(vp_dev) + offset;
+	void __iomem *ioaddr = vp_dev->ioaddr_device + offset;
 	u8 *ptr = buf;
 	int i;
 
@@ -145,8 +298,7 @@ static void vp_set(struct virtio_device *vdev, unsigned offset,
 		   const void *buf, unsigned len)
 {
 	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
-	void __iomem *ioaddr = vp_dev->ioaddr +
-				VIRTIO_PCI_CONFIG(vp_dev) + offset;
+	void __iomem *ioaddr = vp_dev->ioaddr_device + offset;
 	const u8 *ptr = buf;
 	int i;
 
@@ -184,7 +336,7 @@ static void vp_notify(struct virtqueue *vq)
 
 	/* we write the queue's selector into the notification register to
 	 * signal the other end */
-	iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
+	iowrite16(info->queue_index, vp_dev->ioaddr_notify);
 }
 
 /* Handle a configuration change: Tell driver if it wants to know. */
@@ -231,7 +383,8 @@ static irqreturn_t vp_interrupt(int irq, void *opaque)
 
 	/* reading the ISR has the effect of also clearing it so it's very
 	 * important to save off the value. */
-	isr = ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+	isr = ioread8(vp_dev->ioaddr_notify +
+		      VIRTIO_PCI_ISR - VIRTIO_PCI_QUEUE_NOTIFY);
 
 	/* It's definitely not us if the ISR was not high */
 	if (!isr)
@@ -265,7 +418,7 @@ static void vp_free_vectors(struct virtio_device *vdev)
 		ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
 
 		pci_disable_msix(vp_dev->pci_dev);
-		vp_dev->msix_enabled = 0;
+                virtio_pci_set_msix_enabled(vp_dev, 0);
 		vp_dev->msix_vectors = 0;
 	}
 
@@ -303,7 +456,7 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors,
 	if (err)
 		goto error;
 	vp_dev->msix_vectors = nvectors;
-	vp_dev->msix_enabled = 1;
+        virtio_pci_set_msix_enabled(vp_dev, 1);
 
 	/* Set the vector used for configuration */
 	v = vp_dev->msix_used_vectors;
@@ -451,7 +604,10 @@ static void vp_del_vq(struct virtqueue *vq)
 		iowrite16(VIRTIO_MSI_NO_VECTOR,
 			  vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
 		/* Flush the write out to device */
-		ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR);
+		ioread8(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
+		/* And clear ISR: TODO: really needed? */
+		ioread8(vp_dev->ioaddr_notify +
+		      VIRTIO_PCI_ISR - VIRTIO_PCI_QUEUE_NOTIFY);
 	}
 
 	vring_del_virtqueue(vq);
@@ -642,8 +798,8 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
 	if (err)
 		goto out_enable_device;
 
-	vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0);
-	if (vp_dev->ioaddr == NULL)
+	err = virtio_pci_iomap(vp_dev);
+	if (err)
 		goto out_req_regions;
 
 	pci_set_drvdata(pci_dev, vp_dev);
@@ -665,7 +821,7 @@ static int __devinit virtio_pci_probe(struct pci_dev *pci_dev,
 
 out_set_drvdata:
 	pci_set_drvdata(pci_dev, NULL);
-	pci_iounmap(pci_dev, vp_dev->ioaddr);
+	virtio_pci_iounmap(vp_dev);
 out_req_regions:
 	pci_release_regions(pci_dev);
 out_enable_device:
@@ -683,7 +839,7 @@ static void __devexit virtio_pci_remove(struct pci_dev *pci_dev)
 
 	vp_del_vqs(&vp_dev->vdev);
 	pci_set_drvdata(pci_dev, NULL);
-	pci_iounmap(pci_dev, vp_dev->ioaddr);
+	virtio_pci_iounmap(vp_dev);
 	pci_release_regions(pci_dev);
 	pci_disable_device(pci_dev);
 	kfree(vp_dev);
diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h
index 9120887..3cf1787 100644
--- a/include/asm-generic/io.h
+++ b/include/asm-generic/io.h
@@ -286,6 +286,10 @@ static inline void writesb(const void __iomem *addr, const void *buf, int len)
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                     unsigned offset,
+                                     unsigned long minlen,
+                                     unsigned long maxlen);
 static inline void pci_iounmap(struct pci_dev *dev, void __iomem *p)
 {
 }
diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h
index 98dcd76..6f192d4 100644
--- a/include/asm-generic/iomap.h
+++ b/include/asm-generic/iomap.h
@@ -70,8 +70,19 @@ extern void ioport_unmap(void __iomem *);
 /* Create a virtual mapping cookie for a PCI BAR (memory or IO) */
 struct pci_dev;
 extern void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max);
+extern void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                     unsigned offset,
+                                     unsigned long minlen,
+                                     unsigned long maxlen);
 extern void pci_iounmap(struct pci_dev *dev, void __iomem *);
 #else
+static inline void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                                            unsigned offset,
+                                            unsigned long minlen,
+                                            unsigned long maxlen)
+{
+	return NULL;
+}
 struct pci_dev;
 static inline void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long max)
 {
diff --git a/include/linux/pci_regs.h b/include/linux/pci_regs.h
index b5d9657..3e7905c 100644
--- a/include/linux/pci_regs.h
+++ b/include/linux/pci_regs.h
@@ -375,6 +375,10 @@
 #define  PCI_X_STATUS_266MHZ	0x40000000	/* 266 MHz capable */
 #define  PCI_X_STATUS_533MHZ	0x80000000	/* 533 MHz capable */
 
+/* Vendor specific capability */
+#define PCI_VNDR_CAP_LEN	2	/* Capability length (8 bits), including
+					   bytes: ID, NEXT and LEN itself. */
+
 /* PCI Bridge Subsystem ID registers */
 
 #define PCI_SSVID_VENDOR_ID     4	/* PCI-Bridge subsystem vendor id register */
diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h
index ea66f3f..d6568e7 100644
--- a/include/linux/virtio_pci.h
+++ b/include/linux/virtio_pci.h
@@ -92,4 +92,45 @@
 /* The alignment to use between consumer and producer parts of vring.
  * x86 pagesize again. */
 #define VIRTIO_PCI_VRING_ALIGN		4096
+
+/*
+ * Layout for Virtio PCI vendor specific capability (little-endian):
+ * 5 bit virtio capability id.
+ * 3 bit BAR index register, specifying which BAR to use.
+ * 4 byte cfg offset within the BAR.
+ * 4 byte cfg size.
+ */
+
+/* A single virtio device has multiple vendor specific capabilities, we use the
+ * 5 bit ID field to distinguish between these. */
+#define VIRTIO_PCI_CAP_ID		3
+#define VIRTIO_PCI_CAP_ID_MASK		0xff
+#define VIRTIO_PCI_CAP_ID_SHIFT		0
+
+/* IDs for different capabilities. If a specific configuration
+ * is missing, legacy PIO path is used. */
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR access */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific confiuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+
+/* Index of the BAR including this configuration */
+#define VIRTIO_PCI_CAP_CFG_BIR		4
+#define VIRTIO_PCI_CAP_CFG_BIR_MASK	(0x7)
+#define VIRTIO_PCI_CAP_CFG_BIR_SHIFT	0
+
+/* Size of the configuration space */
+#define VIRTIO_PCI_CAP_CFG_SIZE		4
+#define VIRTIO_PCI_CAP_CFG_SIZE_MASK	0xffffff
+#define VIRTIO_PCI_CAP_CFG_SIZE_SHIFT	8
+
+/* Offset within the BAR */
+#define VIRTIO_PCI_CAP_CFG_OFF		8
+#define VIRTIO_PCI_CAP_CFG_OFF_MASK	0xffffffff
+#define VIRTIO_PCI_CAP_CFG_OFF_SHIFT	0
+
 #endif
diff --git a/lib/iomap.c b/lib/iomap.c
index 5dbcb4b..f28720e 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -243,26 +243,36 @@ EXPORT_SYMBOL(ioport_unmap);
 
 #ifdef CONFIG_PCI
 /**
- * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * pci_iomap_range - create a virtual mapping cookie for a PCI BAR
  * @dev: PCI device that owns the BAR
  * @bar: BAR number
- * @maxlen: length of the memory to map
+ * @offset: map memory at the given offset in BAR
+ * @minlen: min length of the memory to map
+ * @maxlen: max length of the memory to map
  *
  * Using this function you will get a __iomem address to your device BAR.
  * You can access it using ioread*() and iowrite*(). These functions hide
  * the details if this is a MMIO or PIO address space and will just do what
  * you expect from them in the correct way.
  *
+ * @minlen specifies the minimum length to map. We check that BAR is
+ * large enough.
  * @maxlen specifies the maximum length to map. If you want to get access to
- * the complete BAR without checking for its length first, pass %0 here.
+ * the complete BAR from offset to the end, pass %0 here.
  * */
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
+                              unsigned offset,
+                              unsigned long minlen,
+                              unsigned long maxlen)
 {
 	resource_size_t start = pci_resource_start(dev, bar);
 	resource_size_t len = pci_resource_len(dev, bar);
 	unsigned long flags = pci_resource_flags(dev, bar);
 
-	if (!len || !start)
+	if (len <= offset || !start)
+		return NULL;
+        len -= offset;
+        if (len < minlen)
 		return NULL;
 	if (maxlen && len > maxlen)
 		len = maxlen;
@@ -277,10 +287,30 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
 	return NULL;
 }
 
+/**
+ * pci_iomap - create a virtual mapping cookie for a PCI BAR
+ * @dev: PCI device that owns the BAR
+ * @bar: BAR number
+ * @maxlen: length of the memory to map
+ *
+ * Using this function you will get a __iomem address to your device BAR.
+ * You can access it using ioread*() and iowrite*(). These functions hide
+ * the details if this is a MMIO or PIO address space and will just do what
+ * you expect from them in the correct way.
+ *
+ * @maxlen specifies the maximum length to map. If you want to get access to
+ * the complete BAR without checking for its length first, pass %0 here.
+ * */
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+    return pci_iomap_range(dev, bar, 0, 0, maxlen);
+}
+
 void pci_iounmap(struct pci_dev *dev, void __iomem * addr)
 {
 	IO_COND(addr, /* nothing */, iounmap(addr));
 }
 EXPORT_SYMBOL(pci_iomap);
+EXPORT_SYMBOL(pci_iomap_range);
 EXPORT_SYMBOL(pci_iounmap);
 #endif /* CONFIG_PCI */
-- 
1.7.8.rc1

^ permalink raw reply related

* [RFC 2/5] virtio-pci: Fix compilation issue
From: Sasha Levin @ 2011-11-14 23:43 UTC (permalink / raw)
  To: linux-kernel; +Cc: kvm, mst, virtualization, penberg, Sasha Levin, mingo
In-Reply-To: <1321314197-5265-1-git-send-email-levinsasha928@gmail.com>

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 drivers/virtio/virtio_pci.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 7625434..d242fcc 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -129,10 +129,10 @@ static void __iomem *virtio_pci_map_cfg(struct virtio_pci_device *vp_dev, u8 cap
         bir &= VIRTIO_PCI_CAP_CFG_BIR_MASK;
         size >>= VIRTIO_PCI_CAP_CFG_SIZE_SHIFT;
         size &= VIRTIO_PCI_CAP_CFG_SIZE_MASK;
-        off >>= VIRTIO_PCI_CAP_CFG_OFF_SHIFT;
-        off &= VIRTIO_PCI_CAP_CFG_OFF_MASK;
+        offset >>= VIRTIO_PCI_CAP_CFG_OFF_SHIFT;
+        offset &= VIRTIO_PCI_CAP_CFG_OFF_MASK;
 	/* Align offset appropriately */
-	off &= ~(align - 1);
+	offset &= ~(align - 1);
 
 	/* It's possible for a device to supply a huge config space,
 	 * but we'll never need to map more than a page ATM. */
-- 
1.7.8.rc1

^ permalink raw reply related

* [RFC 3/5] iomap: Don't ignore offset
From: Sasha Levin @ 2011-11-14 23:43 UTC (permalink / raw)
  To: linux-kernel; +Cc: kvm, mst, virtualization, penberg, Sasha Levin, mingo
In-Reply-To: <1321314197-5265-1-git-send-email-levinsasha928@gmail.com>

Offset was ignored for start calulcations, making all mappings start at
offset 0 always.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 lib/iomap.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/lib/iomap.c b/lib/iomap.c
index f28720e..93ae915 100644
--- a/lib/iomap.c
+++ b/lib/iomap.c
@@ -272,6 +272,7 @@ void __iomem *pci_iomap_range(struct pci_dev *dev, int bar,
 	if (len <= offset || !start)
 		return NULL;
         len -= offset;
+	start += offset;
         if (len < minlen)
 		return NULL;
 	if (maxlen && len > maxlen)
-- 
1.7.8.rc1

^ permalink raw reply related

* [RFC 4/5] kvm tools: Free up the MSI-X PBA BAR
From: Sasha Levin @ 2011-11-14 23:43 UTC (permalink / raw)
  To: linux-kernel; +Cc: kvm, mst, virtualization, penberg, Sasha Levin, mingo
In-Reply-To: <1321314197-5265-1-git-send-email-levinsasha928@gmail.com>

Free up the BAR to make space for the new virtio BARs. It isn't required
to have the PBA and the table in the separate BARs, and uniting them will
just give us extra BARs to play with.

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 tools/kvm/include/kvm/virtio-pci.h |    1 -
 tools/kvm/virtio/pci.c             |   35 ++++++++++++++---------------------
 2 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/tools/kvm/include/kvm/virtio-pci.h b/tools/kvm/include/kvm/virtio-pci.h
index 2bbb271..73f7486 100644
--- a/tools/kvm/include/kvm/virtio-pci.h
+++ b/tools/kvm/include/kvm/virtio-pci.h
@@ -30,7 +30,6 @@ struct virtio_pci {
 	u32			vq_vector[VIRTIO_PCI_MAX_VQ];
 	u32			gsis[VIRTIO_PCI_MAX_VQ];
 	u32			msix_io_block;
-	u32			msix_pba_block;
 	u64			msix_pba;
 	struct msix_table	msix_table[VIRTIO_PCI_MAX_VQ + VIRTIO_PCI_MAX_CONFIG];
 
diff --git a/tools/kvm/virtio/pci.c b/tools/kvm/virtio/pci.c
index 1660f06..da38ba5 100644
--- a/tools/kvm/virtio/pci.c
+++ b/tools/kvm/virtio/pci.c
@@ -214,23 +214,21 @@ static struct ioport_operations virtio_pci__io_ops = {
 static void callback_mmio_table(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
 {
 	struct virtio_pci *vpci = ptr;
-	void *table = &vpci->msix_table;
+	void *table;
+	u32 offset;
 
-	if (is_write)
-		memcpy(table + addr - vpci->msix_io_block, data, len);
-	else
-		memcpy(data, table + addr - vpci->msix_io_block, len);
-}
-
-static void callback_mmio_pba(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
-{
-	struct virtio_pci *vpci = ptr;
-	void *pba = &vpci->msix_pba;
+	if (addr > vpci->msix_io_block + PCI_IO_SIZE) {
+		table	= &vpci->msix_pba;
+		offset	= vpci->msix_io_block + PCI_IO_SIZE;
+	} else {
+		table	= &vpci->msix_table;
+		offset	= vpci->msix_io_block;
+	}
 
 	if (is_write)
-		memcpy(pba + addr - vpci->msix_pba_block, data, len);
+		memcpy(table + addr - offset, data, len);
 	else
-		memcpy(data, pba + addr - vpci->msix_pba_block, len);
+		memcpy(data, table + addr - offset, len);
 }
 
 int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_trans *vtrans, u32 vq)
@@ -283,12 +281,10 @@ int virtio_pci__init(struct kvm *kvm, struct virtio_trans *vtrans, void *dev,
 	u8 pin, line, ndev;
 
 	vpci->dev = dev;
-	vpci->msix_io_block = pci_get_io_space_block(PCI_IO_SIZE);
-	vpci->msix_pba_block = pci_get_io_space_block(PCI_IO_SIZE);
+	vpci->msix_io_block = pci_get_io_space_block(PCI_IO_SIZE * 2);
 
 	vpci->base_addr = ioport__register(IOPORT_EMPTY, &virtio_pci__io_ops, IOPORT_SIZE, vtrans);
 	kvm__register_mmio(kvm, vpci->msix_io_block, 0x100, callback_mmio_table, vpci);
-	kvm__register_mmio(kvm, vpci->msix_pba_block, 0x100, callback_mmio_pba, vpci);
 
 	vpci->pci_hdr = (struct pci_device_header) {
 		.vendor_id		= PCI_VENDOR_ID_REDHAT_QUMRANET,
@@ -299,10 +295,7 @@ int virtio_pci__init(struct kvm *kvm, struct virtio_trans *vtrans, void *dev,
 		.subsys_vendor_id	= PCI_SUBSYSTEM_VENDOR_ID_REDHAT_QUMRANET,
 		.subsys_id		= subsys_id,
 		.bar[0]			= vpci->base_addr | PCI_BASE_ADDRESS_SPACE_IO,
-		.bar[1]			= vpci->msix_io_block | PCI_BASE_ADDRESS_SPACE_MEMORY
-					| PCI_BASE_ADDRESS_MEM_TYPE_64,
-		.bar[3]			= vpci->msix_pba_block | PCI_BASE_ADDRESS_SPACE_MEMORY
-					| PCI_BASE_ADDRESS_MEM_TYPE_64,
+		.bar[1]			= vpci->msix_io_block | PCI_BASE_ADDRESS_SPACE_MEMORY,
 		.status			= PCI_STATUS_CAP_LIST,
 		.capabilities		= (void *)&vpci->pci_hdr.msix - (void *)&vpci->pci_hdr,
 	};
@@ -327,7 +320,7 @@ int virtio_pci__init(struct kvm *kvm, struct virtio_trans *vtrans, void *dev,
 	 * we're not in short of BARs
 	 */
 	vpci->pci_hdr.msix.table_offset = 1; /* Use BAR 1 */
-	vpci->pci_hdr.msix.pba_offset = 3; /* Use BAR 3 */
+	vpci->pci_hdr.msix.pba_offset = 1 | PCI_IO_SIZE; /* Use BAR 1 with offset */
 	vpci->config_vector = 0;
 
 	if (irq__register_device(subsys_id, &ndev, &pin, &line) < 0)
-- 
1.7.8.rc1

^ permalink raw reply related

* [RFC 5/5] kvm tools: Support new virtio-pci configuration layout
From: Sasha Levin @ 2011-11-14 23:43 UTC (permalink / raw)
  To: linux-kernel; +Cc: kvm, mst, virtualization, penberg, Sasha Levin, mingo
In-Reply-To: <1321314197-5265-1-git-send-email-levinsasha928@gmail.com>

Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
---
 tools/kvm/include/kvm/pci.h        |   13 ++-
 tools/kvm/include/kvm/virtio-pci.h |    4 +
 tools/kvm/virtio/pci.c             |  248 ++++++++++++++++++++++++++++++++++--
 3 files changed, 254 insertions(+), 11 deletions(-)

diff --git a/tools/kvm/include/kvm/pci.h b/tools/kvm/include/kvm/pci.h
index f71af0b..4f5a09f 100644
--- a/tools/kvm/include/kvm/pci.h
+++ b/tools/kvm/include/kvm/pci.h
@@ -39,6 +39,16 @@ struct msix_cap {
 	u32 pba_offset;
 };
 
+struct virtio_cap {
+	u8 cap;
+	u8 next;
+	u8 cap_len;
+	u8 structure_id;
+	u8 bir;
+	u32 size:24;
+	u32 offset;
+};
+
 struct pci_device_header {
 	u16		vendor_id;
 	u16		device_id;
@@ -63,7 +73,8 @@ struct pci_device_header {
 	u8		min_gnt;
 	u8		max_lat;
 	struct msix_cap msix;
-	u8		empty[136]; /* Rest of PCI config space */
+	struct virtio_cap virtio[4];
+	u8		empty[90]; /* Rest of PCI config space */
 	u32		bar_size[6];
 };
 
diff --git a/tools/kvm/include/kvm/virtio-pci.h b/tools/kvm/include/kvm/virtio-pci.h
index 73f7486..fc3c03f 100644
--- a/tools/kvm/include/kvm/virtio-pci.h
+++ b/tools/kvm/include/kvm/virtio-pci.h
@@ -19,6 +19,7 @@ struct virtio_pci_ioevent_param {
 struct virtio_pci {
 	struct pci_device_header pci_hdr;
 	void			*dev;
+	struct kvm		*kvm;
 
 	u16			base_addr;
 	u8			status;
@@ -36,6 +37,9 @@ struct virtio_pci {
 	/* virtio queue */
 	u16			queue_selector;
 	struct virtio_pci_ioevent_param ioeventfds[VIRTIO_PCI_MAX_VQ];
+
+	u32			virtio_mmio;
+	u16			virtio_pio;
 };
 
 int virtio_pci__init(struct kvm *kvm, struct virtio_trans *vtrans, void *dev,
diff --git a/tools/kvm/virtio/pci.c b/tools/kvm/virtio/pci.c
index da38ba5..8606d87 100644
--- a/tools/kvm/virtio/pci.c
+++ b/tools/kvm/virtio/pci.c
@@ -40,7 +40,7 @@ static int virtio_pci__init_ioeventfd(struct kvm *kvm, struct virtio_trans *vtra
 	};
 
 	ioevent = (struct ioevent) {
-		.io_addr	= vpci->base_addr + VIRTIO_PCI_QUEUE_NOTIFY,
+		.io_addr	= vpci->virtio_pio,
 		.io_len		= sizeof(u16),
 		.fn		= virtio_pci__ioevent_callback,
 		.fn_ptr		= &vpci->ioeventfds[vq],
@@ -59,7 +59,7 @@ static inline bool virtio_pci__msix_enabled(struct virtio_pci *vpci)
 	return vpci->pci_hdr.msix.ctrl & PCI_MSIX_FLAGS_ENABLE;
 }
 
-static bool virtio_pci__specific_io_in(struct kvm *kvm, struct virtio_trans *vtrans, u16 port,
+static bool virtio_pci_legacy__specific_io_in(struct kvm *kvm, struct virtio_trans *vtrans, u16 port,
 					void *data, int size, int offset)
 {
 	u32 config_offset;
@@ -89,7 +89,8 @@ static bool virtio_pci__specific_io_in(struct kvm *kvm, struct virtio_trans *vtr
 	return false;
 }
 
-static bool virtio_pci__io_in(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+static bool virtio_pci_legacy__io_in(struct ioport *ioport, struct kvm *kvm,
+					u16 port, void *data, int size)
 {
 	unsigned long offset;
 	bool ret = true;
@@ -124,15 +125,15 @@ static bool virtio_pci__io_in(struct ioport *ioport, struct kvm *kvm, u16 port,
 		vpci->isr = VIRTIO_IRQ_LOW;
 		break;
 	default:
-		ret = virtio_pci__specific_io_in(kvm, vtrans, port, data, size, offset);
+		ret = virtio_pci_legacy__specific_io_in(kvm, vtrans, port, data, size, offset);
 		break;
 	};
 
 	return ret;
 }
 
-static bool virtio_pci__specific_io_out(struct kvm *kvm, struct virtio_trans *vtrans, u16 port,
-					void *data, int size, int offset)
+static bool virtio_pci_legacy__specific_io_out(struct kvm *kvm, struct virtio_trans *vtrans,
+					u16 port, void *data, int size, int offset)
 {
 	struct virtio_pci *vpci = vtrans->virtio;
 	u32 config_offset, gsi, vec;
@@ -166,7 +167,8 @@ static bool virtio_pci__specific_io_out(struct kvm *kvm, struct virtio_trans *vt
 	return false;
 }
 
-static bool virtio_pci__io_out(struct ioport *ioport, struct kvm *kvm, u16 port, void *data, int size)
+static bool virtio_pci_legacy__io_out(struct ioport *ioport,
+				struct kvm *kvm, u16 port, void *data, int size)
 {
 	unsigned long offset;
 	bool ret = true;
@@ -199,7 +201,76 @@ static bool virtio_pci__io_out(struct ioport *ioport, struct kvm *kvm, u16 port,
 		vpci->status		= ioport__read8(data);
 		break;
 	default:
-		ret = virtio_pci__specific_io_out(kvm, vtrans, port, data, size, offset);
+		ret = virtio_pci_legacy__specific_io_out(kvm, vtrans, port,
+							data, size, offset);
+		break;
+	};
+
+	return ret;
+}
+
+static struct ioport_operations virtio_pci_legacy__io_ops = {
+	.io_in	= virtio_pci_legacy__io_in,
+	.io_out	= virtio_pci_legacy__io_out,
+};
+
+static bool virtio_pci__io_out(struct ioport *ioport,
+				struct kvm *kvm, u16 port, void *data, int size)
+{
+	unsigned long offset;
+	bool ret = true;
+	u16 val;
+	struct virtio_pci *vpci;
+	struct virtio_trans *vtrans;
+
+	vtrans = ioport->priv;
+	vpci = vtrans->virtio;
+	offset = port - vpci->virtio_pio;
+
+	/*
+	 * 0-1: Notifications
+	 * 2: ISR
+	 */
+	switch (offset) {
+	case 0:
+		val		= ioport__read16(data);
+		vtrans->virtio_ops->notify_vq(kvm, vpci->dev, val);
+		break;
+	case 2:
+		vpci->isr	= ioport__read8(data);
+		break;
+	default:
+		ret = false;
+		break;
+	}
+
+	return ret;
+}
+
+static bool virtio_pci__io_in(struct ioport *ioport, struct kvm *kvm,
+					u16 port, void *data, int size)
+{
+	unsigned long offset;
+	bool ret = true;
+	struct virtio_pci *vpci;
+
+	vpci = ioport->priv;
+	offset = port - vpci->virtio_pio;
+
+	/*
+	 * 0-1: Notifications
+	 * 2: ISR
+	 */
+	switch (offset) {
+	case 0:
+		/* Shouldn't happen */
+	case 2:
+		ioport__write8(data, vpci->isr);
+		kvm__irq_line(kvm, vpci->pci_hdr.irq_line, VIRTIO_IRQ_LOW);
+		vpci->isr = VIRTIO_IRQ_LOW;
+		break;
+	default:
+		ret = false;
 		break;
 	};
 
@@ -231,6 +302,113 @@ static void callback_mmio_table(u64 addr, u8 *data, u32 len, u8 is_write, void *
 		memcpy(data, table + addr - offset, len);
 }
 
+static void virtio_mmio_dev_specific(u64 addr, u8 *data, u32 len, u8 is_write,
+					struct virtio_trans *vtrans)
+{
+	struct virtio_pci *vpci = vtrans->virtio;
+	u32 i;
+
+	for (i = 0; i < len; i++) {
+		if (is_write)
+			vtrans->virtio_ops->set_config(vpci->kvm, vpci->dev,
+							*(u8 *)data + i, addr + i);
+		else
+			data[i] =
+				vtrans->virtio_ops->get_config(vpci->kvm, vpci->dev, addr + i);
+	}
+}
+
+static void virtio_mmio_in(u64 addr, u8 *data, u32 len, u8 is_write,
+					struct virtio_trans *vtrans)
+{
+	struct virtio_pci *vpci = vtrans->virtio;
+
+	switch (addr) {
+	case VIRTIO_PCI_HOST_FEATURES:
+		*(u32 *)data  = vtrans->virtio_ops->get_host_features(vpci->kvm, vpci->dev);
+		break;
+	case VIRTIO_PCI_QUEUE_PFN:
+		*(u32 *)data  = vtrans->virtio_ops->get_pfn_vq(vpci->kvm,
+							vpci->dev, vpci->queue_selector);
+		break;
+	case VIRTIO_PCI_QUEUE_NUM:
+		*(u32 *)data  = vtrans->virtio_ops->get_size_vq(vpci->kvm,
+							vpci->dev, vpci->queue_selector);
+		break;
+	case VIRTIO_PCI_STATUS:
+		*(u8 *)data  = vpci->status;
+		break;
+	case VIRTIO_MSI_CONFIG_VECTOR:
+		*(u8 *)data  = vpci->config_vector;
+		break;
+	case VIRTIO_MSI_QUEUE_VECTOR:
+		*(u8 *)data = vpci->vq_vector[vpci->queue_selector];
+		break;
+	};
+}
+
+static void virtio_mmio_out(u64 addr, u8 *data, u32 len, u8 is_write,
+					struct virtio_trans *vtrans)
+{
+	struct virtio_pci *vpci = vtrans->virtio;
+	u32 val;
+
+	switch (addr) {
+	case VIRTIO_PCI_GUEST_FEATURES:
+		val = *(u32 *)data;
+		vtrans->virtio_ops->set_guest_features(vpci->kvm, vpci, val);
+		break;
+	case VIRTIO_PCI_QUEUE_PFN:
+		val = *(u32 *)data;
+		virtio_pci__init_ioeventfd(vpci->kvm, vtrans, vpci->queue_selector);
+		vtrans->virtio_ops->init_vq(vpci->kvm, vpci->dev, vpci->queue_selector, val);
+		break;
+	case VIRTIO_PCI_QUEUE_SEL:
+		vpci->queue_selector	= *(u16 *)data;
+		break;
+	case VIRTIO_PCI_QUEUE_NOTIFY:
+		val			= *(u16 *)data;
+		vtrans->virtio_ops->notify_vq(vpci->kvm, vpci->dev, val);
+		break;
+	case VIRTIO_PCI_STATUS:
+		vpci->status		= *(u8 *)data;
+		break;
+	case VIRTIO_MSI_CONFIG_VECTOR: {
+		u16 vec, gsi;
+
+		vec = *(u16 *)data;
+		gsi = irq__add_msix_route(vpci->kvm, &vpci->msix_table[vec].msg);
+		vpci->config_gsi = gsi;
+		break;
+	}
+	case VIRTIO_MSI_QUEUE_VECTOR: {
+		u16 vec, gsi;
+
+		vec = vpci->vq_vector[vpci->queue_selector] = *(u16 *)data;
+		gsi = irq__add_msix_route(vpci->kvm, &vpci->msix_table[vec].msg);
+		vpci->gsis[vpci->queue_selector] = gsi;
+		break;
+	}
+	};
+}
+
+static void callback_virtio_mmio(u64 addr, u8 *data, u32 len, u8 is_write, void *ptr)
+{
+	struct virtio_trans *vtrans = ptr;
+	struct virtio_pci *vpci = vtrans->virtio;
+	u32 offset = addr - vpci->virtio_mmio;
+
+	if (offset >= 0x100) {
+		virtio_mmio_dev_specific(offset - 0x100, data, len, is_write, vtrans);
+		return;
+	}
+
+	if (is_write == 0)
+		virtio_mmio_in(offset, data, len, is_write, vtrans);
+	else
+		virtio_mmio_out(offset, data, len, is_write, vtrans);
+}
+
 int virtio_pci__signal_vq(struct kvm *kvm, struct virtio_trans *vtrans, u32 vq)
 {
 	struct virtio_pci *vpci = vtrans->virtio;
@@ -282,10 +460,16 @@ int virtio_pci__init(struct kvm *kvm, struct virtio_trans *vtrans, void *dev,
 
 	vpci->dev = dev;
 	vpci->msix_io_block = pci_get_io_space_block(PCI_IO_SIZE * 2);
+	vpci->virtio_mmio = pci_get_io_space_block(PCI_IO_SIZE * 2);
 
-	vpci->base_addr = ioport__register(IOPORT_EMPTY, &virtio_pci__io_ops, IOPORT_SIZE, vtrans);
+	vpci->base_addr = ioport__register(IOPORT_EMPTY,
+				&virtio_pci_legacy__io_ops, IOPORT_SIZE, vtrans);
+	vpci->virtio_pio = ioport__register(IOPORT_EMPTY,
+				&virtio_pci__io_ops, IOPORT_SIZE, vtrans);
 	kvm__register_mmio(kvm, vpci->msix_io_block, 0x100, callback_mmio_table, vpci);
+	kvm__register_mmio(kvm, vpci->virtio_mmio, 0x200, callback_virtio_mmio, vtrans);
 
+	vpci->kvm = kvm;
 	vpci->pci_hdr = (struct pci_device_header) {
 		.vendor_id		= PCI_VENDOR_ID_REDHAT_QUMRANET,
 		.device_id		= device_id,
@@ -296,12 +480,16 @@ int virtio_pci__init(struct kvm *kvm, struct virtio_trans *vtrans, void *dev,
 		.subsys_id		= subsys_id,
 		.bar[0]			= vpci->base_addr | PCI_BASE_ADDRESS_SPACE_IO,
 		.bar[1]			= vpci->msix_io_block | PCI_BASE_ADDRESS_SPACE_MEMORY,
+		.bar[2]			= vpci->virtio_mmio | PCI_BASE_ADDRESS_SPACE_MEMORY,
+		.bar_size[2]		= PCI_IO_SIZE * 2,
+		.bar[3]			= vpci->virtio_pio | PCI_BASE_ADDRESS_SPACE_IO,
+		.bar_size[3]		= PCI_IO_SIZE * 2,
 		.status			= PCI_STATUS_CAP_LIST,
 		.capabilities		= (void *)&vpci->pci_hdr.msix - (void *)&vpci->pci_hdr,
 	};
 
 	vpci->pci_hdr.msix.cap = PCI_CAP_ID_MSIX;
-	vpci->pci_hdr.msix.next = 0;
+	vpci->pci_hdr.msix.next = (void *)&vpci->pci_hdr.virtio[0] - (void *)&vpci->pci_hdr;
 	/*
 	 * We at most have VIRTIO_PCI_MAX_VQ entries for virt queue,
 	 * VIRTIO_PCI_MAX_CONFIG entries for config.
@@ -321,6 +509,46 @@ int virtio_pci__init(struct kvm *kvm, struct virtio_trans *vtrans, void *dev,
 	 */
 	vpci->pci_hdr.msix.table_offset = 1; /* Use BAR 1 */
 	vpci->pci_hdr.msix.pba_offset = 1 | PCI_IO_SIZE; /* Use BAR 1 with offset */
+
+	vpci->pci_hdr.virtio[0] = (struct virtio_cap) {
+		.cap = PCI_CAP_ID_VNDR,
+		.next = (void *)&vpci->pci_hdr.virtio[1] - (void *)&vpci->pci_hdr,
+		.cap_len = sizeof(struct virtio_cap),
+		.structure_id = VIRTIO_PCI_CAP_COMMON_CFG,
+		.bir = 2,
+		.size = PCI_IO_SIZE,
+		.offset = 0,
+	};
+
+	vpci->pci_hdr.virtio[1] = (struct virtio_cap) {
+		.cap = PCI_CAP_ID_VNDR,
+		.next = (void *)&vpci->pci_hdr.virtio[2] - (void *)&vpci->pci_hdr,
+		.cap_len = sizeof(struct virtio_cap),
+		.structure_id = VIRTIO_PCI_CAP_ISR_CFG,
+		.bir = 3,
+		.size = 1,
+		.offset = 2,
+	};
+
+	vpci->pci_hdr.virtio[2] = (struct virtio_cap) {
+		.cap = PCI_CAP_ID_VNDR,
+		.next = (void *)&vpci->pci_hdr.virtio[3] - (void *)&vpci->pci_hdr,
+		.cap_len = sizeof(struct virtio_cap),
+		.structure_id = VIRTIO_PCI_CAP_NOTIFY_CFG,
+		.bir = 3,
+		.size = 2,
+		.offset = 0,
+	};
+
+	vpci->pci_hdr.virtio[3] = (struct virtio_cap) {
+		.cap = PCI_CAP_ID_VNDR,
+		.next = 0,
+		.cap_len = sizeof(struct virtio_cap),
+		.structure_id = VIRTIO_PCI_CAP_DEVICE_CFG,
+		.bir = 2,
+		.size = PCI_IO_SIZE,
+		.offset = PCI_IO_SIZE,
+	};
 	vpci->config_vector = 0;
 
 	if (irq__register_device(subsys_id, &ndev, &pin, &line) < 0)
-- 
1.7.8.rc1

^ permalink raw reply related

* Re: [RFC PATCH net-next] enable virtio_net to return bus_info in ethtool -i consistent with emulated NICs
From: Rick Jones @ 2011-11-15  0:06 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, Rick Jones, virtualization, Michael Tsirkin
In-Reply-To: <1321309800.2827.22.camel@bwh-desktop>

On 11/14/2011 02:30 PM, Ben Hutchings wrote:
> On Mon, 2011-11-14 at 13:52 -0800, Rick Jones wrote:
>> From: Rick Jones<rick.jones2@hp.com>
>>
>> Add a new .bus_name to virtio_config_ops then modify virtio_net to
>> call through to it in an ethtool .get_drvinfo routine to report
>> bus_info in ethtool -i output which is consistent with other
>> emulated NICs and the output of lspci.
> [...]
>> diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
>> index 0dc30ff..3724d45 100644
>> --- a/drivers/lguest/lguest_device.c
>> +++ b/drivers/lguest/lguest_device.c
>> @@ -381,6 +381,11 @@ error:
>>   	return PTR_ERR(vqs[i]);
>>   }
>>
>> +static const char *lg_bus_name(struct virtio_device *vdev)
>> +{
>> +	return "Not Implemented";
>> +}
> [...]
>> +static const char *kvm_bus_name(struct virtio_device *vdev)
>> +{
>> +	return "Not Implemented";
>> +}
> [...]
>
> Please use the existing 'not implemented' value, which is the empty
> string.   If you think ethtool should print some helpful message instead
> of an empty string, please submit a patch for ethtool.


One question - will those actually be called via an ethtool path?  In my 
poking about through the virtio code, I got the impression those modules 
were for "other than networking" sorts of things.

rick

^ permalink raw reply

* Re: [RFC PATCH net-next] enable virtio_net to return bus_info in ethtool -i consistent with emulated NICs
From: Ben Hutchings @ 2011-11-15  0:10 UTC (permalink / raw)
  To: Rick Jones; +Cc: netdev, Rick Jones, virtualization, Michael Tsirkin
In-Reply-To: <4EC1ACF6.9060908@hp.com>

On Mon, 2011-11-14 at 16:06 -0800, Rick Jones wrote:
> On 11/14/2011 02:30 PM, Ben Hutchings wrote:
> > On Mon, 2011-11-14 at 13:52 -0800, Rick Jones wrote:
> >> From: Rick Jones<rick.jones2@hp.com>
> >>
> >> Add a new .bus_name to virtio_config_ops then modify virtio_net to
> >> call through to it in an ethtool .get_drvinfo routine to report
> >> bus_info in ethtool -i output which is consistent with other
> >> emulated NICs and the output of lspci.
> > [...]
> >> diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
> >> index 0dc30ff..3724d45 100644
> >> --- a/drivers/lguest/lguest_device.c
> >> +++ b/drivers/lguest/lguest_device.c
> >> @@ -381,6 +381,11 @@ error:
> >>   	return PTR_ERR(vqs[i]);
> >>   }
> >>
> >> +static const char *lg_bus_name(struct virtio_device *vdev)
> >> +{
> >> +	return "Not Implemented";
> >> +}
> > [...]
> >> +static const char *kvm_bus_name(struct virtio_device *vdev)
> >> +{
> >> +	return "Not Implemented";
> >> +}
> > [...]
> >
> > Please use the existing 'not implemented' value, which is the empty
> > string.   If you think ethtool should print some helpful message instead
> > of an empty string, please submit a patch for ethtool.
> 
> 
> One question - will those actually be called via an ethtool path?  In my 
> poking about through the virtio code, I got the impression those modules 
> were for "other than networking" sorts of things.

I don't know; I just assumed that was why you were adding them!  In
other contexts such as dev_printk() this string would make even less
sense.

Ben.

-- 
Ben Hutchings, Staff Engineer, Solarflare
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.

^ permalink raw reply

* Re: [RFC PATCH net-next] enable virtio_net to return bus_info in ethtool -i consistent with emulated NICs
From: Rick Jones @ 2011-11-15  0:14 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: netdev, virtualization, Michael Tsirkin
In-Reply-To: <1321315839.2827.25.camel@bwh-desktop>

>>> Please use the existing 'not implemented' value, which is the empty
>>> string.   If you think ethtool should print some helpful message instead
>>> of an empty string, please submit a patch for ethtool.
>>
>>
>> One question - will those actually be called via an ethtool path?  In my
>> poking about through the virtio code, I got the impression those modules
>> were for "other than networking" sorts of things.
>
> I don't know; I just assumed that was why you were adding them!  In
> other contexts such as dev_printk() this string would make even less
> sense.

Those were added to make sure there were no dangling references in the 
config_ops structure defined in those files and that the code calling 
through wouldn't go off into la-la land.  Perhaps it isn't necessary 
with Rusty's suggestion that I check ".bus_info" against NULL? But that 
is why those were there, and not simply the instance in virtio_pci.c. 
I'll spin a v2 regardless.

rick

^ permalink raw reply

* [RFC PATCH net-next v2] enable virtio_net to return bus_info in ethtool -i consistent with emulated NICs
From: Rick Jones @ 2011-11-15  0:17 UTC (permalink / raw)
  To: netdev, rusty, mst, virtualization

From: Rick Jones <rick.jones2@hp.com>

Add a new .bus_name to virtio_config_ops then modify virtio_net to 
call through to it in an ethtool .get_drvinfo routine to report
bus_info in ethtool -i output which is consistent with other
emulated NICs and the output of lspci.  

Signed-off-by: Rick Jones <rick.jones2@hp.com>

---

The changes to drivers/lguest/lguest_device.c, drivers/s390/kvm/kvm_virtio.c,
and drivers/virtio/virtio_mmio.c code inspected only, not compiled.

j-ubuntu-guest:~$ ethtool -i eth0
driver: virtio_net
version: 1.0.0
firmware-version: 
bus-info: 0000:00:03.0
raj@raj-ubuntu-guest:~$ lspci | grep Ether
00:03.0 Ethernet controller: Red Hat, Inc Virtio network device

 drivers/lguest/lguest_device.c |    6 ++++++
 drivers/net/virtio_net.c       |   15 +++++++++++++++
 drivers/s390/kvm/kvm_virtio.c  |    6 ++++++
 drivers/virtio/virtio_mmio.c   |    6 ++++++
 drivers/virtio/virtio_pci.c    |    8 ++++++++
 include/linux/virtio_config.h  |   14 ++++++++++++++
 6 files changed, 55 insertions(+), 0 deletions(-)


diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index 0dc30ff..595d731 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -381,6 +381,11 @@ error:
 	return PTR_ERR(vqs[i]);
 }
 
+static const char *lg_bus_name(struct virtio_device *vdev)
+{
+	return "";
+}
+
 /* The ops structure which hooks everything together. */
 static struct virtio_config_ops lguest_config_ops = {
 	.get_features = lg_get_features,
@@ -392,6 +397,7 @@ static struct virtio_config_ops lguest_config_ops = {
 	.reset = lg_reset,
 	.find_vqs = lg_find_vqs,
 	.del_vqs = lg_del_vqs,
+	.bus_name = lg_bus_name,
 };
 
 /*
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 6ee8410..4dc9d84 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -39,6 +39,7 @@ module_param(gso, bool, 0444);
 #define GOOD_COPY_LEN	128
 
 #define VIRTNET_SEND_COMMAND_SG_MAX    2
+#define VIRTNET_DRIVER_VERSION "1.0.0"
 
 struct virtnet_stats {
 	struct u64_stats_sync syncp;
@@ -889,7 +890,21 @@ static void virtnet_get_ringparam(struct net_device *dev,
 
 }
 
+
+static void virtnet_get_drvinfo(struct net_device *dev,
+				struct ethtool_drvinfo *info)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct virtio_device *vdev = vi->vdev;
+
+	strlcpy(info->driver, KBUILD_MODNAME, sizeof(info->driver));
+	strlcpy(info->version, VIRTNET_DRIVER_VERSION, sizeof(info->version));
+	strlcpy(info->bus_info, virtio_bus_name(vdev), sizeof(info->bus_info));
+
+}
+
 static const struct ethtool_ops virtnet_ethtool_ops = {
+	.get_drvinfo = virtnet_get_drvinfo,
 	.get_link = ethtool_op_get_link,
 	.get_ringparam = virtnet_get_ringparam,
 };
diff --git a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
index 94f49ff..8af868b 100644
--- a/drivers/s390/kvm/kvm_virtio.c
+++ b/drivers/s390/kvm/kvm_virtio.c
@@ -263,6 +263,11 @@ error:
 	return PTR_ERR(vqs[i]);
 }
 
+static const char *kvm_bus_name(struct virtio_device *vdev)
+{
+	return "";
+}
+
 /*
  * The config ops structure as defined by virtio config
  */
@@ -276,6 +281,7 @@ static struct virtio_config_ops kvm_vq_configspace_ops = {
 	.reset = kvm_reset,
 	.find_vqs = kvm_find_vqs,
 	.del_vqs = kvm_del_vqs,
+	.bus_name = kvm_bus_name,
 };
 
 /*
diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
index acc5e43..2f57380 100644
--- a/drivers/virtio/virtio_mmio.c
+++ b/drivers/virtio/virtio_mmio.c
@@ -361,7 +361,12 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 	return 0;
 }
 
+static const char *vm_bus_name(struct virtio_device *vdev)
+{
+	struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev);
 
+	return vm_dev->pdev->name;
+}
 
 static struct virtio_config_ops virtio_mmio_config_ops = {
 	.get		= vm_get,
@@ -373,6 +378,7 @@ static struct virtio_config_ops virtio_mmio_config_ops = {
 	.del_vqs	= vm_del_vqs,
 	.get_features	= vm_get_features,
 	.finalize_features = vm_finalize_features,
+	.bus_name	= vm_bus_name,
 };
 
 
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index 79a31e5..764ec05 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -580,6 +580,13 @@ static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
 				  false, false);
 }
 
+static const char *vp_bus_name(struct virtio_device *vdev)
+{
+	struct virtio_pci_device *vp_dev = to_vp_device(vdev);
+
+	return pci_name(vp_dev->pci_dev);
+}
+
 static struct virtio_config_ops virtio_pci_config_ops = {
 	.get		= vp_get,
 	.set		= vp_set,
@@ -590,6 +597,7 @@ static struct virtio_config_ops virtio_pci_config_ops = {
 	.del_vqs	= vp_del_vqs,
 	.get_features	= vp_get_features,
 	.finalize_features = vp_finalize_features,
+	.bus_name	= vp_bus_name,
 };
 
 static void virtio_pci_release_dev(struct device *_d)
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index add4790..63f98d0 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -100,6 +100,10 @@
  *	vdev: the virtio_device
  *	This gives the final feature bits for the device: it can change
  *	the dev->feature bits if it wants.
+ * @bus_name: return the bus name associated with the device
+ *	vdev: the virtio_device
+ *      This returns a pointer to the bus name a la pci_name from which
+ *      the caller can then copy.
  */
 typedef void vq_callback_t(struct virtqueue *);
 struct virtio_config_ops {
@@ -117,6 +121,7 @@ struct virtio_config_ops {
 	void (*del_vqs)(struct virtio_device *);
 	u32 (*get_features)(struct virtio_device *vdev);
 	void (*finalize_features)(struct virtio_device *vdev);
+	const char *(*bus_name)(struct virtio_device *vdev);
 };
 
 /* If driver didn't advertise the feature, it will never appear. */
@@ -182,5 +187,14 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
 		return ERR_PTR(err);
 	return vq;
 }
+
+static inline
+const char *virtio_bus_name(struct virtio_device *vdev)
+{
+	if (!vdev->config->bus_name)
+		return "virtio";
+	return vdev->config->bus_name(vdev);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_VIRTIO_CONFIG_H */

^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox