Kernel KVM virtualization development

Kernel KVM virtualization development
 help / color / mirror / Atom feed

* [PATCH 02/38] KVM: PPC: use definitions in epapr header for hcalls
From: Alexander Graf @ 2012-08-14 23:04 UTC (permalink / raw)
  To: kvm-ppc; +Cc: KVM list, Stuart Yoder
In-Reply-To: <1344985483-7440-1-git-send-email-agraf@suse.de>

From: Stuart Yoder <stuart.yoder@freescale.com>

Signed-off-by: Stuart Yoder <stuart.yoder@freescale.com>
Signed-off-by: Alexander Graf <agraf@suse.de>
---
 arch/powerpc/include/asm/kvm_para.h |   21 +++++++++++----------
 arch/powerpc/kernel/kvm.c           |    2 +-
 arch/powerpc/kvm/powerpc.c          |   10 +++++-----
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_para.h b/arch/powerpc/include/asm/kvm_para.h
index c18916b..a168ce3 100644
--- a/arch/powerpc/include/asm/kvm_para.h
+++ b/arch/powerpc/include/asm/kvm_para.h
@@ -75,9 +75,10 @@ struct kvm_vcpu_arch_shared {
 };
 
 #define KVM_SC_MAGIC_R0		0x4b564d21 /* "KVM!" */
-#define HC_VENDOR_KVM		(42 << 16)
-#define HC_EV_SUCCESS		0
-#define HC_EV_UNIMPLEMENTED	12
+
+#define KVM_HCALL_TOKEN(num)     _EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num)
+
+#include <asm/epapr_hcalls.h>
 
 #define KVM_FEATURE_MAGIC_PAGE	1
 
@@ -121,7 +122,7 @@ static unsigned long kvm_hypercall(unsigned long *in,
 				   unsigned long *out,
 				   unsigned long nr)
 {
-	return HC_EV_UNIMPLEMENTED;
+	return EV_UNIMPLEMENTED;
 }
 
 #endif
@@ -132,7 +133,7 @@ static inline long kvm_hypercall0_1(unsigned int nr, unsigned long *r2)
 	unsigned long out[8];
 	unsigned long r;
 
-	r = kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	r = kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 	*r2 = out[0];
 
 	return r;
@@ -143,7 +144,7 @@ static inline long kvm_hypercall0(unsigned int nr)
 	unsigned long in[8];
 	unsigned long out[8];
 
-	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
@@ -152,7 +153,7 @@ static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
 	unsigned long out[8];
 
 	in[0] = p1;
-	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
@@ -163,7 +164,7 @@ static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
 
 	in[0] = p1;
 	in[1] = p2;
-	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
@@ -175,7 +176,7 @@ static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
 	in[0] = p1;
 	in[1] = p2;
 	in[2] = p3;
-	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
@@ -189,7 +190,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
 	in[1] = p2;
 	in[2] = p3;
 	in[3] = p4;
-	return kvm_hypercall(in, out, nr | HC_VENDOR_KVM);
+	return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr));
 }
 
 
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 867db1d..a61b133 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -419,7 +419,7 @@ static void kvm_map_magic_page(void *data)
 	in[0] = KVM_MAGIC_PAGE;
 	in[1] = KVM_MAGIC_PAGE;
 
-	kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE);
+	kvm_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE));
 
 	*features = out[0];
 }
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 879b14a..62165cc 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -67,18 +67,18 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 	}
 
 	switch (nr) {
-	case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE:
+	case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE):
 	{
 		vcpu->arch.magic_page_pa = param1;
 		vcpu->arch.magic_page_ea = param2;
 
 		r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7;
 
-		r = HC_EV_SUCCESS;
+		r = EV_SUCCESS;
 		break;
 	}
-	case HC_VENDOR_KVM | KVM_HC_FEATURES:
-		r = HC_EV_SUCCESS;
+	case KVM_HCALL_TOKEN(KVM_HC_FEATURES):
+		r = EV_SUCCESS;
 #if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2)
 		/* XXX Missing magic page on 44x */
 		r2 |= (1 << KVM_FEATURE_MAGIC_PAGE);
@@ -87,7 +87,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu)
 		/* Second return value is in r4 */
 		break;
 	default:
-		r = HC_EV_UNIMPLEMENTED;
+		r = EV_UNIMPLEMENTED;
 		break;
 	}
 
-- 
1.6.0.2

^ permalink raw reply related

* [PULL 00/38] ppc patch queue 2012-08-15
From: Alexander Graf @ 2012-08-14 23:04 UTC (permalink / raw)
  To: kvm-ppc; +Cc: KVM list

Hi Avi,

This is my current patch queue for ppc. It contains the following improvements:

  * add support for idle hcall on booke
  * icache clear on map
  * mmu notifier support for e500 and book3s_pr
  * revive the 440 support slightly (still not 100% happy)
  * unify booke and book3s_pr entry/exit code a bit
  * add watchdog emulation for booke
  * small bug fixes

Please pull.

Alex


The following changes since commit dbcb4e798072d114fe68813f39a9efd239ab99c0:
  Avi Kivity (1):
        KVM: VMX: Advertize RDTSC exiting to nested guests

are available in the git repository at:

  git://github.com/agraf/linux-2.6.git for-upstream

Alan Cox (1):
      ppc: e500_tlb memset clears nothing

Alexander Graf (24):
      KVM: PPC: PR: Use generic tracepoint for guest exit
      KVM: PPC: Expose SYNC cap based on mmu notifiers
      KVM: PPC: BookE: Expose remote TLB flushes in debugfs
      KVM: PPC: E500: Fix clear_tlb_refs
      KVM: PPC: BookE: Add check_requests helper function
      KVM: PPC: BookE: Add support for vcpu->mode
      KVM: PPC: E500: Implement MMU notifiers
      KVM: PPC: Add cache flush on page map
      KVM: PPC: BookE: Add some more trace points
      KVM: PPC: BookE: No duplicate request != 0 check
      KVM: PPC: Use same kvmppc_prepare_to_enter code for booke and book3s_pr
      KVM: PPC: Book3s: PR: Add (dumb) MMU Notifier support
      KVM: PPC: BookE: Drop redundant vcpu->mode set
      KVM: PPC: Book3S: PR: Only do resched check once per exit
      KVM: PPC: Exit guest context while handling exit
      KVM: PPC: Book3S: PR: Indicate we're out of guest mode
      KVM: PPC: Consistentify vcpu exit path
      KVM: PPC: Book3S: PR: Rework irq disabling
      KVM: PPC: Move kvm_guest_enter call into generic code
      KVM: PPC: Ignore EXITING_GUEST_MODE mode
      KVM: PPC: Add return value in prepare_to_enter
      KVM: PPC: Add return value to core_check_requests
      KVM: PPC: 44x: Initialize PVR
      KVM: PPC: BookE: Add MCSR SPR support

Bharat Bhushan (2):
      KVM: PPC: booke: Add watchdog emulation
      booke: Added ONE_REG interface for IAC/DAC debug registers

Liu Yu-B13201 (3):
      KVM: PPC: Add support for ePAPR idle hcall in host kernel
      KVM: PPC: ev_idle hcall support for e500 guests
      PPC: Don't use hardcoded opcode for ePAPR hcall invocation

Paul Mackerras (2):
      KVM: PPC: Book3S HV: Fix incorrect branch in H_CEDE code
      KVM: PPC: Quieten message about allocating linear regions

Scott Wood (2):
      powerpc/fsl-soc: use CONFIG_EPAPR_PARAVIRT for hcalls
      powerpc/epapr: export epapr_hypercall_start

Stuart Yoder (4):
      PPC: epapr: create define for return code value of success
      KVM: PPC: use definitions in epapr header for hcalls
      KVM: PPC: add pvinfo for hcall opcodes on e500mc/e5500
      PPC: select EPAPR_PARAVIRT for all users of epapr hcalls

 Documentation/virtual/kvm/api.txt       |    7 +-
 arch/powerpc/include/asm/Kbuild         |    1 +
 arch/powerpc/include/asm/epapr_hcalls.h |   36 ++--
 arch/powerpc/include/asm/fsl_hcalls.h   |   36 ++--
 arch/powerpc/include/asm/kvm.h          |   12 ++
 arch/powerpc/include/asm/kvm_host.h     |   30 +++-
 arch/powerpc/include/asm/kvm_para.h     |   21 ++-
 arch/powerpc/include/asm/kvm_ppc.h      |   28 +++
 arch/powerpc/include/asm/reg_booke.h    |    7 +
 arch/powerpc/kernel/epapr_hcalls.S      |   28 +++
 arch/powerpc/kernel/epapr_paravirt.c    |   11 +-
 arch/powerpc/kernel/kvm.c               |    2 +-
 arch/powerpc/kernel/ppc_ksyms.c         |    5 +
 arch/powerpc/kvm/44x.c                  |    1 +
 arch/powerpc/kvm/Kconfig                |    3 +
 arch/powerpc/kvm/book3s.c               |    9 +
 arch/powerpc/kvm/book3s_32_mmu_host.c   |    4 +
 arch/powerpc/kvm/book3s_64_mmu_host.c   |    3 +
 arch/powerpc/kvm/book3s_hv_builtin.c    |    4 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   12 +-
 arch/powerpc/kvm/book3s_mmu_hpte.c      |    5 -
 arch/powerpc/kvm/book3s_pr.c            |  109 ++++++++----
 arch/powerpc/kvm/book3s_rmhandlers.S    |   15 +-
 arch/powerpc/kvm/booke.c                |  279 +++++++++++++++++++++++++------
 arch/powerpc/kvm/booke_emulate.c        |   22 ++-
 arch/powerpc/kvm/e500_tlb.c             |   82 ++++++++--
 arch/powerpc/kvm/powerpc.c              |  128 +++++++++++++--
 arch/powerpc/kvm/trace.h                |  146 +++++++++++++---
 arch/powerpc/mm/mem.c                   |    1 +
 arch/powerpc/platforms/Kconfig          |    1 +
 arch/powerpc/sysdev/fsl_msi.c           |    9 +-
 arch/powerpc/sysdev/fsl_soc.c           |    2 +
 drivers/tty/Kconfig                     |    1 +
 drivers/virt/Kconfig                    |    1 +
 include/linux/kvm.h                     |    4 +
 include/linux/kvm_host.h                |    1 +
 36 files changed, 857 insertions(+), 209 deletions(-)

^ permalink raw reply

* Re: [PATCH v8] kvm: notify host when the guest is panicked
From: Anthony Liguori @ 2012-08-14 22:59 UTC (permalink / raw)
  To: Marcelo Tosatti, Wen Congyang
  Cc: kvm list, Gleb Natapov, Jan Kiszka, Yan Vugenfirer,
	linux-kernel@vger.kernel.org, qemu-devel, Avi Kivity,
	KAMEZAWA Hiroyuki
In-Reply-To: <20120814205339.GA14172@amt.cnet>

Marcelo Tosatti <mtosatti@redhat.com> writes:

> On Tue, Aug 14, 2012 at 02:35:34PM -0500, Anthony Liguori wrote:
>> Marcelo Tosatti <mtosatti@redhat.com> writes:
>> 
>> > On Tue, Aug 14, 2012 at 01:53:01PM -0500, Anthony Liguori wrote:
>> >> Marcelo Tosatti <mtosatti@redhat.com> writes:
>> >> 
>> >> > On Tue, Aug 14, 2012 at 05:55:54PM +0300, Yan Vugenfirer wrote:
>> >> >> 
>> >> >> On Aug 14, 2012, at 1:42 PM, Jan Kiszka wrote:
>> >> >> 
>> >> >> > On 2012-08-14 10:56, Daniel P. Berrange wrote:
>> >> >> >> On Mon, Aug 13, 2012 at 03:21:32PM -0300, Marcelo Tosatti wrote:
>> >> >> >>> On Wed, Aug 08, 2012 at 10:43:01AM +0800, Wen Congyang wrote:
>> >> >> >>>> We can know the guest is panicked when the guest runs on xen.
>> >> >> >>>> But we do not have such feature on kvm.
>> >> >> >>>> 
>> >> >> >>>> Another purpose of this feature is: management app(for example:
>> >> >> >>>> libvirt) can do auto dump when the guest is panicked. If management
>> >> >> >>>> app does not do auto dump, the guest's user can do dump by hand if
>> >> >> >>>> he sees the guest is panicked.
>> >> >> >>>> 
>> >> >> >>>> We have three solutions to implement this feature:
>> >> >> >>>> 1. use vmcall
>> >> >> >>>> 2. use I/O port
>> >> >> >>>> 3. use virtio-serial.
>> >> >> >>>> 
>> >> >> >>>> We have decided to avoid touching hypervisor. The reason why I choose
>> >> >> >>>> choose the I/O port is:
>> >> >> >>>> 1. it is easier to implememt
>> >> >> >>>> 2. it does not depend any virtual device
>> >> >> >>>> 3. it can work when starting the kernel
>> >> >> >>> 
>> >> >> >>> How about searching for the "Kernel panic - not syncing" string 
>> >> >> >>> in the guests serial output? Say libvirtd could take an action upon
>> >> >> >>> that?
>> >> >> >> 
>> >> >> >> No, this is not satisfactory. It depends on the guest OS being
>> >> >> >> configured to use the serial port for console output which we
>> >> >> >> cannot mandate, since it may well be required for other purposes.
>> >> >> > 
>> >> >> Please don't forget Windows guests, there is no console and no "Kernel Panic" string ;)
>> >> >> 
>> >> >> What I used for debugging purposes on Windows guest is to register a bugcheck callback in virtio-net driver and write 1 to VIRTIO_PCI_ISR register.
>> >> >> 
>> >> >> Yan. 
>> >> >
>> >> > Considering whether a "panic-device" should cover other OSes is also \
>> >
>> >> > something to consider. Even for Linux, is "panic" the only case which
>> >> > should be reported via the mechanism? What about oopses without panic? 
>> >> >
>> >> > Is the mechanism general enough for supporting new events, etc.
>> >> 
>> >> Hi,
>> >> 
>> >> I think this discussion is gone of the deep end.
>> >> 
>> >> Forget about !x86 platforms.  They have their own way to do this sort of
>> >> thing.  
>> >
>> > The panic function in kernel/panic.c has the following options, which
>> > appear to be arch independent, on panic:
>> >
>> > - reboot 
>> > - blink
>> 
>> Not sure the semantics of blink but that might be a good place for a
>> pvops hook.
>> 
>> >
>> > None are paravirtual interfaces however.
>> >
>> >> Think of this feature like a status LED on a motherboard.  These
>> >> are very common and usually controlled by IO ports.
>> >> 
>> >> We're simply reserving a "status LED" for the guest to indicate that it
>> >> has paniced.  Let's not over engineer this.
>> >
>> > My concern is that you end up with state that is dependant on x86.
>> >
>> > Subject: [PATCH v8 3/6] add a new runstate: RUN_STATE_GUEST_PANICKED
>> >
>> > Having the ability to stop/restart the guest (and even introducing a 
>> > new VM runstate) is more than a status LED analogy.
>> 
>> I must admit, I don't know why a new runstate is necessary/useful.  The
>> kernel shouldn't have to care about the difference between a halted guest
>> and a panicked guest.  That level of information belongs in userspace IMHO.
>> 
>> > Can this new infrastructure be used by other architectures?
>> 
>> I guess I don't understand why the kernel side of this isn't anything
>> more than a paravirt op hook that does a single outb() with the
>> remaining logic handled 100% in QEMU.
>
> From the patch description:
>
> "Another purpose of this feature is: management app(for example:
> libvirt) can do auto dump when the guest is panicked. If management
> app does not do auto dump, the guest's user can do dump by hand if
> he sees the guest is panicked."

Why does this mandated another runstate?  QEMU can simply mark the VCPUs
as stopped and raise a QMP event.  The kernel doesn't care if the VCPUs
are stopped or panicked.

> Wen, auto dump means dump of guest memory?
>
> In that case, the notification should obviously stop the guest 
> otherwise the guest might be reset by the time memdump from QEMU 
> monitor runs.
>
> But kexec supports dumping of memory already (i suppose it can 
> do automatic dump+{reboot,shutdown}).
>
>> > Do you consider allowing support for Windows as overengineering?
>> 
>> I don't think there is a way to hook BSOD on Windows so attempting to
>> engineer something that works with Windows seems odd, no?
>
> Unsure about hooking at BSOD time. But Windows has configurable 
> memory dump/reset/reboot, so yes it should not necessary.

Do you mean it's not necessary to hook BSOD?

I've very often gotten asked: We know 1 person is experiencing this
crash condition, can we figure out from the host how many other VMs are
experiencing this crash too instead of waiting for a user to complain?

That's the primary use-case for this notification IMHO.  Just a simple
status LED from the guest to indicate that it's in a bad state.

Regards,

Anthony Liguori

>
>> 
>> Regards,
>> 
>> Anthony Liguori
>> 
>> >
>> >> Regards,
>> >> 
>> >> Anthony Liguori
>> >> 
>> >> >
>> >> >> 
>> >> >> > Well, we have more than a single serial port, even when leaving
>> >> >> > virtio-serial aside...
>> >> >> > 
>> >> >> > Jan
>> >> >> > 
>> >> >> > -- 
>> >> >> > Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
>> >> >> > Corporate Competence Center Embedded Linux
>> >> >> > --
>> >> >> > To unsubscribe from this list: send the line "unsubscribe kvm" in
>> >> >> > the body of a message to majordomo@vger.kernel.org
>> >> >> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> --
>> To unsubscribe from this list: send the line "unsubscribe kvm" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [patch 1/1] ppc: e500_tlb memset clears nothing
From: Alexander Graf @ 2012-08-14 22:17 UTC (permalink / raw)
  To: akpm; +Cc: benh, kvm-ppc, kvm, alan, avi, dcb314, mtosatti, paulus
In-Reply-To: <20120814221010.7EC405C0065@hpza9.eem.corp.google.com>


On 15.08.2012, at 00:10, akpm@linux-foundation.org wrote:

> From: Alan Cox <alan@linux.intel.com>
> Subject: ppc: e500_tlb memset clears nothing
> 
> Put the parameters the right way around
> 
> Addresses https://bugzilla.kernel.org/show_bug.cgi?id=44031
> 
> Reported-by: David Binderman <dcb314@hotmail.com>
> Signed-off-by: Alan Cox <alan@linux.intel.com>
> Cc: Avi Kivity <avi@redhat.com>
> Cc: Marcelo Tosatti <mtosatti@redhat.com>
> Cc: Alexander Graf <agraf@suse.de>
> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
> Cc: Paul Mackerras <paulus@samba.org>
> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Thanks, applied to kvm-ppc-next.


Alex

^ permalink raw reply

* [patch 1/1] ppc: e500_tlb memset clears nothing
From: akpm @ 2012-08-14 22:10 UTC (permalink / raw)
  To: benh; +Cc: agraf, kvm-ppc, kvm, akpm, alan, avi, dcb314, mtosatti, paulus

From: Alan Cox <alan@linux.intel.com>
Subject: ppc: e500_tlb memset clears nothing

Put the parameters the right way around

Addresses https://bugzilla.kernel.org/show_bug.cgi?id=44031

Reported-by: David Binderman <dcb314@hotmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Cc: Avi Kivity <avi@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Alexander Graf <agraf@suse.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/powerpc/kvm/e500_tlb.c |    8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff -puN arch/powerpc/kvm/e500_tlb.c~ppc-e500_tlb-memset-clears-nothing arch/powerpc/kvm/e500_tlb.c
--- a/arch/powerpc/kvm/e500_tlb.c~ppc-e500_tlb-memset-clears-nothing
+++ a/arch/powerpc/kvm/e500_tlb.c
@@ -320,11 +320,11 @@ static inline void kvmppc_e500_ref_relea
 static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500)
 {
 	if (vcpu_e500->g2h_tlb1_map)
-		memset(vcpu_e500->g2h_tlb1_map,
-		       sizeof(u64) * vcpu_e500->gtlb_params[1].entries, 0);
+		memset(vcpu_e500->g2h_tlb1_map, 0,
+		       sizeof(u64) * vcpu_e500->gtlb_params[1].entries);
 	if (vcpu_e500->h2g_tlb1_rmap)
-		memset(vcpu_e500->h2g_tlb1_rmap,
-		       sizeof(unsigned int) * host_tlb_params[1].entries, 0);
+		memset(vcpu_e500->h2g_tlb1_rmap, 0,
+		       sizeof(unsigned int) * host_tlb_params[1].entries);
 }
 
 static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500)
_

^ permalink raw reply

* Re: [PATCH v7 2/2] kvm: KVM_EOIFD, an eventfd for EOIs
From: Alex Williamson @ 2012-08-14 22:01 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Michael S. Tsirkin, gleb, kvm, linux-kernel, jan.kiszka
In-Reply-To: <502A462A.1000600@redhat.com>

On Tue, 2012-08-14 at 15:35 +0300, Avi Kivity wrote:
> On 08/12/2012 12:33 PM, Michael S. Tsirkin wrote:
> >> 
> >> Michael, would the interface be more acceptable to you if we added
> >> separate ioctls to allocate and free some representation of an irq
> >> source ID, gsi pair?  For instance, an ioctl might return an idr entry
> >> for an irq source ID/gsi object which would then be passed as a
> >> parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> >> representing the source id/gsi isn't magically freed on it's own.  This
> >> would also allow us to deassign/close one end and reconfigure it later.
> >> Thanks,
> >> 
> >> Alex
> > 
> > It's acceptable to me either way. I was only pointing out that as
> > designed, the interface looks simple at first but then you find out some
> > subtle limitations which are implementation driven. This gives
> > an overall feeling the abstraction is too low level.
> > 
> > If we compare to the existing irqfd, isn't the difference
> > simply that irqfd deasserts immediately ATM, while we
> > want to delay this until later?
> > 
> > If yes, then along the lines that you proposed, and combining with my
> > idea of tracking deasserts, how do you like the following:
> > 
> > /* Keep line asserted until guest has handled the interrupt. */
> > #define KVM_IRQFD_FLAG_DEASSERT_ON_ACK (1 << 1)
> > /* Notify after line is deasserted. */
> > #define KVM_IRQFD_FLAG_DEASSERT_EVENTFD (2 << 1)
> > 
> > 	struct kvm_irqfd {
> > 		__u32 fd;
> > 		__u32 gsi;
> > 		__u32 flags;
> > 		/* eventfd to notify when line is deasserted */
> > 		__u32 deassert_eventfd;
> > 		__u8  pad[16];
> > 	};
> > 
> > now the only limitation is that KVM_IRQFD_FLAG_DEASSERT_ON_ACK is only
> > effective for level interrupts.
> > 
> > Notes about lifetime of objects:
> > 	- closing deassert_eventfd does nothing (we can keep
> > 	  reference to it from irqfd so no need for
> >           complex polling/flushing scheme)
> > 	- closing irqfd or deasserting dis-associates
> > 	  deassert_eventfd automatically
> > 	- source id is internal to irqfd and goes away with it
> > 
> > it looks harder to misuse and fits what we want to do nicely,
> > and needs less code to implement.
> > 
> > Avi, what do you think?
> 
> I think given all the complexity in the separate ioctl approach that
> this makes sense.  There are no lifetime issues or code to match the two
> eventfds.  Alex, would this API simplify the code?

It does though I'm concerned that it's a very specific solution that
only addresses this problem.  Generic userspace eoi/ack is not
addressed.  The latest version using separate ioctls does a lot of
simplification by exposing irq sourceids.  The bulk of the code there is
duplicating what irqfd does just so we can catch the POLLHUP for
cleanup.  If there was an easier way to do that, we don't care about
POLLIN/POLLOUT, much of the code could be removed.  Alternatively we
could make some common infrastructure to simplify both irqfd and
irq_ackfd, but how to frame the helpers isn't easy.

> Yet another option was raised in the past, and that was exiling ioapic
> and pic to userspace.  This moves the entire issue to userspace.  The
> cost is a new interface that implements the APIC bus (betweem APIC and
> IOAPIC) and the INTACK sequence (between APIC and PIC), and potential
> for performance regressions due to the PIC, IOAPIC, and PIT being in
> userspace.  We would still have to keep the IOAPIC/PIC in the kernel,
> but no new features would be added.

Doesn't this assure a performance regression or are we assuming anywhere
we care about performance we're using MSI?  Thanks,

Alex

^ permalink raw reply

* Re: [PATCH v7 2/2] kvm: KVM_EOIFD, an eventfd for EOIs
From: Alex Williamson @ 2012-08-14 21:28 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, gleb, kvm, linux-kernel, jan.kiszka
In-Reply-To: <20120814083548.GC3597@redhat.com>

On Tue, 2012-08-14 at 11:35 +0300, Michael S. Tsirkin wrote:
> On Mon, Aug 13, 2012 at 09:09:43PM -0600, Alex Williamson wrote:
> > On Tue, 2012-08-14 at 02:00 +0300, Michael S. Tsirkin wrote:
> > > On Mon, Aug 13, 2012 at 04:41:05PM -0600, Alex Williamson wrote:
> > > > On Tue, 2012-08-14 at 01:06 +0300, Michael S. Tsirkin wrote:
> > > > > On Mon, Aug 13, 2012 at 03:34:01PM -0600, Alex Williamson wrote:
> > > > > > On Sun, 2012-08-12 at 11:36 +0300, Avi Kivity wrote:
> > > > > > > On 08/09/2012 10:26 PM, Alex Williamson wrote:
> > > > > > > > On Mon, 2012-08-06 at 13:40 +0300, Avi Kivity wrote:
> > > > > > > >> On 08/06/2012 01:38 PM, Avi Kivity wrote:
> > > > > > > >> 
> > > > > > > >> > Regarding the implementation, instead of a linked list, would an array
> > > > > > > >> > of counters parallel to the bitmap make it simpler?
> > > > > > > >> 
> > > > > > > >> Or even, replace the bitmap with an array of counters.
> > > > > > > > 
> > > > > > > > I'm not sure a counter array is what we're really after.  That gives us
> > > > > > > > reference counting for the irq source IDs, but not the key->gsi lookup.
> > > > > > > 
> > > > > > > You can look up the gsi while registering the eoifd, so it's accessible
> > > > > > > as eoifd->gsi instead of eoifd->source->gsi.  The irqfd can go away
> > > > > > > while the eoifd is still active, but is this a problem?
> > > > > > 
> > > > > > In my opinion, no, but Michael disagrees.
> > > > > > 
> > > > > > > > It also highlights another issue, that we have a limited set of source
> > > > > > > > IDs.  Looks like we have BITS_PER_LONG IDs, with two already used, one
> > > > > > > > for the shared userspace ID and another for the PIT.  How happy are we
> > > > > > > > going to be with a limit of 62 level interrupts in use at one time?
> > > > > > > 
> > > > > > > When we start being unhappy we can increase that number.  On the other
> > > > > > > hand more locks and lists makes me unhappy now.
> > > > > > 
> > > > > > Yep, good point.  My latest version removes the source ID object lock
> > > > > > and list (and objects).  I still have a lock and list for the ack
> > > > > > notification, but it's hard not to unless we combine them into one
> > > > > > mega-irqfd ioctl as Michael suggests.
> > > > > >
> > > > > > > > It's arguably a reasonable number since the most virtualization friendly
> > > > > > > > devices (sr-iov VFs) don't even support this kind of interrupt.  It's
> > > > > > > > also very wasteful allocating an entire source ID for a single GSI
> > > > > > > > within that source ID.  PCI supports interrupts A, B, C, and D, which,
> > > > > > > > in the most optimal config, each go to different GSIs.  So we could
> > > > > > > > theoretically be more efficient in our use and allocation of irq source
> > > > > > > > IDs if we tracked use by the source ID, gsi pair.
> > > > > > > 
> > > > > > > There are, in one userspace, just three gsis available for PCI links, so
> > > > > > > you're compressing the source id space by 3.
> > > > > > 
> > > > > > I imagine there's a way to put each PCI interrupt pin on a GSI, but
> > > > > > still only 4, not a great expansion of source ID space.  I like
> > > > > > Michael's idea of re-using source IDs if we run out better.
> > > > > > 
> > > > > > > > That probably makes it less practical to replace anything at the top
> > > > > > > > level with a counter array.  The key that we pass back is currently the
> > > > > > > > actual source ID, but we don't specify what it is, so we could split it
> > > > > > > > and have it encode a 16bit source ID plus 16 bit GSI.  It could also be
> > > > > > > > an idr entry.
> > > > > > > 
> > > > > > > We can fix those kinds of problems by adding another layer of
> > > > > > > indirection.  But I doubt they will be needed.  I don't see people
> > > > > > > assigning 60 legacy devices to one guest.
> > > > > > 
> > > > > > Yep, we can ignore it for now and put it in the hands of userspace to
> > > > > > re-use IDs if needed.
> > > > > > 
> > > > > > > > Michael, would the interface be more acceptable to you if we added
> > > > > > > > separate ioctls to allocate and free some representation of an irq
> > > > > > > > source ID, gsi pair?  For instance, an ioctl might return an idr entry
> > > > > > > > for an irq source ID/gsi object which would then be passed as a
> > > > > > > > parameter in struct kvm_irqfd and struct kvm_eoifd so that the object
> > > > > > > > representing the source id/gsi isn't magically freed on it's own.  This
> > > > > > > > would also allow us to deassign/close one end and reconfigure it later.
> > > > > > > > Thanks,
> > > > > > > 
> > > > > > > Another option is to push the responsibility for allocating IDs for the
> > > > > > > association to userspace.  Let userspace both create the irqfd and the
> > > > > > > eoifd with the same ID, the kernel matches them at registration time and
> > > > > > > copies the gsi/sourceid from the first to the second eventfd.
> > > > > > 
> > > > > > Aside from the copying gsi/sourceid bit, you've just described my latest
> > > > > > attempt at this series.  Specifying both a sourceid and gsi also allows
> > > > > > userspace to make better use of the sourceid address space (use more
> > > > > > than one gsi if userspace wants the complexity of managing them).
> > > > > > Thanks,
> > > > > > 
> > > > > > Alex
> > > > > 
> > > > > Turns out per device source ID is a bug copied from existing
> > > > > device assignment. I am amazed we did not notice before.
> > > > > There we have small # of devices so it's not a problem but there's no
> > > > > reason just not to have a source ID for all irqfds.
> > > > > So the problem goes away, and there is no limit on # of level irqfds,
> > > > > and no need to manage IDs in userspace at all.
> > > > > You can still have cookies in userspace if you like but do not map them
> > > > > to source IDs.
> > > > 
> > > > IMHO it's not a bug, it's an implementation decision.  They could be
> > > > shared, but that doesn't make it wrong to not share them.  Given that we
> > > > have 32 memory slots, the only way you could hit this would be to have a
> > > > lot of really slow devices that don't direct-map any BARs.  A reason to
> > > > not have the same source id for everything is that I think we can do ack
> > > > notification filtering more easily using separate source ids (as is done
> > > > in the first patch of the v8 series).
> > > 
> > > Just a thought: can filtering read and clear the irqfd counter?
> > 
> > Sorry, what's "the irqfd counter"?  The eventfd counter?  As I have it
> > in the patch series, the filtering happens where the irq ack notifier
> > calls the individual notifier callbacks.  That's not irqfd/eventfd
> > specific, so it doesn't have access to the eventfd counter there.
> > Taking the filtering into the into the actual callbacks seems to require
> > locking or maybe your proposed test and clear interface (which still
> > requires locking).
> > 
> > > >  As the code is today, I agree,
> > > > there's probably no advantage to using multiple source IDs.  Thanks,
> > > > 
> > > > Alex
> > > 
> > > I think one point worth addressing is, Gleb wanted
> > > to get eoifd without irqfd at all and that works for
> > > timer interrupt.
> > 
> > Right, that's what I'm referring to with the modular components vs
> > pulling eoifd into irqfd.  One gives us interfaces that can easily be
> > extended or already supports a more generic eoifd, the other gives us a
> > very specific use case and we'll have to come up with something else for
> > non-irqfd related eois.  Thanks,
> > 
> > Alex
> 
> Yes that is fine but previous versions tied eoifd to irqfd
> so were not useful alone anyway. Will look at v8.

This is because earlier feedback rejected creating a version of the
ioctl that had no users.  It was only a matter of adding a flag to
indicate kvm_eoifd.key was actually a gsi and some trivial code changes
to enable such an interface.  v8 builds the interface in the other
direction, so I left the notify-only, untied version as the base.
Thanks,

Alex

^ permalink raw reply

* Re: [Qemu-devel] [RFC-v2 3/6] vhost-scsi: add -vhost-scsi host device for use with tcm-vhost
From: Nicholas A. Bellinger @ 2012-08-14 21:17 UTC (permalink / raw)
  To: Blue Swirl
  Cc: target-devel, Anthony Liguori, Stefan Hajnoczi, kvm-devel,
	Michael S. Tsirkin, Jan Kiszka, qemu-devel, Zhi Yong Wu,
	Anthony Liguori, Zhi Yong Wu, Hannes Reinecke, Paolo Bonzini,
	lf-virt, Christoph Hellwig
In-Reply-To: <CAAu8pHttCSW9bUtrdiDG8-PBCpVvW2O=PEdYQoceDDUYydZ+9Q@mail.gmail.com>

On Mon, 2012-08-13 at 19:47 +0000, Blue Swirl wrote:
> On Mon, Aug 13, 2012 at 8:35 AM, Nicholas A. Bellinger
> <nab@linux-iscsi.org> wrote:
> > From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> >
> > This patch adds a new type of host device that drives the vhost_scsi
> > device.  The syntax to add vhost-scsi is:
> >
> >   qemu -vhost-scsi id=vhost-scsi0,wwpn=...,tpgt=123
> >
> > The virtio-scsi emulated device will make use of vhost-scsi to process
> > virtio-scsi requests inside the kernel and hand them to the in-kernel
> > SCSI target stack using the tcm_vhost fabric driver.
> >
> > The tcm_vhost driver was merged into the upstream linux kernel for 3.6-rc2,
> > and the commit can be found here:
> >
> > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commitdiff;h=057cbf49a1f08297
> >
> > Changelog v1 -> v2:
> >
> > - Expose ABI version via VHOST_SCSI_GET_ABI_VERSION + use Rev 0 as
> >   starting point for v3.6-rc code (Stefan + ALiguori + nab)
> > - Fix upstream qemu conflict in hw/qdev-properties.c
> > - Make GET_ABI_VERSION use int (nab + mst)
> > - Fix vhost-scsi case lables in configure (reported by paolo)
> > - Convert qdev_prop_vhost_scsi to use ->get() + ->set() following
> >   qdev_prop_netdev (reported by paolo)
> > - Fix typo in qemu-options.hx definition of vhost-scsi (reported by paolo)
> >
> > Changelog v0 -> v1:
> >
> > - Add VHOST_SCSI_SET_ENDPOINT call (stefan)
> > - Enable vhost notifiers for multiple queues (Zhi)
> > - clear vhost-scsi endpoint on stopped (Zhi)
> > - Add CONFIG_VHOST_SCSI for QEMU build configure (nab)
> > - Rename vhost_vring_target -> vhost_scsi_target (mst + nab)
> > - Add support for VHOST_SCSI_GET_ABI_VERSION ioctl (aliguori + nab)
> >
> > Cc: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> > Cc: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> > Cc: Anthony Liguori <aliguori@us.ibm.com>
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
> > ---
> >  configure            |   10 +++
> >  hw/Makefile.objs     |    1 +
> >  hw/qdev-properties.c |   40 ++++++++++++
> >  hw/qdev.h            |    3 +
> >  hw/vhost-scsi.c      |  170 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  hw/vhost-scsi.h      |   50 +++++++++++++++
> >  qemu-common.h        |    1 +
> >  qemu-config.c        |   16 +++++
> >  qemu-options.hx      |    4 +
> >  vl.c                 |   18 +++++
> >  10 files changed, 313 insertions(+), 0 deletions(-)
> >  create mode 100644 hw/vhost-scsi.c
> >  create mode 100644 hw/vhost-scsi.h
> >

<SNIP>

> >
> > +/* --- vhost-scsi --- */
> > +
> > +static int parse_vhost_scsi_dev(DeviceState *dev, const char *str, void **ptr)
> > +{
> > +   VHostSCSI *p;
> > +
> > +   p = find_vhost_scsi(str);
> > +   if (p == NULL)
> > +       return -ENOENT;
> 
> Braces, please.
> 

Fixed

> > +
> > +   *ptr = p;
> > +   return 0;
> > +}
> > +
> > +static const char *print_vhost_scsi_dev(void *ptr)
> > +{
> > +    VHostSCSI *p = ptr;
> > +
> > +    return (p) ? vhost_scsi_get_id(p) : "<null>";
> > +}
> > +
> > +static void get_vhost_scsi_dev(Object *obj, Visitor *v, void *opaque,
> > +                       const char *name, Error **errp)
> > +{
> > +    get_pointer(obj, v, opaque, print_vhost_scsi_dev, name, errp);
> > +}
> > +
> > +static void set_vhost_scsi_dev(Object *obj, Visitor *v, void *opaque,
> > +                               const char *name, Error **errp)
> > +{
> > +    set_pointer(obj, v, opaque, parse_vhost_scsi_dev, name, errp);
> > +}
> > +
> > +PropertyInfo qdev_prop_vhost_scsi = {
> > +     .name = "vhost-scsi",
> > +     .get  = get_vhost_scsi_dev,
> > +     .set  = set_vhost_scsi_dev,
> > +};
> > +
> >  /* --- pointer --- */
> >
> >  /* Not a proper property, just for dirty hacks.  TODO Remove it!  */
> > diff --git a/hw/qdev.h b/hw/qdev.h
> > index d699194..d5873bb 100644
> > --- a/hw/qdev.h
> > +++ b/hw/qdev.h
> > @@ -238,6 +238,7 @@ extern PropertyInfo qdev_prop_vlan;
> >  extern PropertyInfo qdev_prop_pci_devfn;
> >  extern PropertyInfo qdev_prop_blocksize;
> >  extern PropertyInfo qdev_prop_pci_host_devaddr;
> > +extern PropertyInfo qdev_prop_vhost_scsi;
> >
> >  #define DEFINE_PROP(_name, _state, _field, _prop, _type) { \
> >          .name      = (_name),                                    \
> > @@ -305,6 +306,8 @@ extern PropertyInfo qdev_prop_pci_host_devaddr;
> >      DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_blocksize, uint16_t)
> >  #define DEFINE_PROP_PCI_HOST_DEVADDR(_n, _s, _f) \
> >      DEFINE_PROP(_n, _s, _f, qdev_prop_pci_host_devaddr, PCIHostDeviceAddress)
> > +#define DEFINE_PROP_VHOST_SCSI(_n, _s, _f)       \
> > +    DEFINE_PROP(_n, _s, _f, qdev_prop_vhost_scsi, VHostSCSI*)
> >
> >  #define DEFINE_PROP_END_OF_LIST()               \
> >      {}
> > diff --git a/hw/vhost-scsi.c b/hw/vhost-scsi.c
> > new file mode 100644
> > index 0000000..7145b2d
> > --- /dev/null
> > +++ b/hw/vhost-scsi.c
> > @@ -0,0 +1,170 @@
> > +/*
> > + * vhost_scsi host device
> > + *
> > + * Copyright IBM, Corp. 2011
> > + *
> > + * Authors:
> > + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
> > + *
> > + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> > + * See the COPYING.LIB file in the top-level directory.
> > + *
> > + */
> > +
> > +#include <sys/ioctl.h>
> > +#include "config.h"
> > +#include "qemu-queue.h"
> > +#include "vhost-scsi.h"
> > +#include "vhost.h"
> > +
> > +struct VHostSCSI {
> > +    const char *id;
> > +    const char *wwpn;
> > +    uint16_t tpgt;
> > +    struct vhost_dev dev;
> > +    struct vhost_virtqueue vqs[3];
> > +    QLIST_ENTRY(VHostSCSI) list;
> > +};
> > +
> > +static QLIST_HEAD(, VHostSCSI) vhost_scsi_list =
> > +    QLIST_HEAD_INITIALIZER(vhost_scsi_list);
> > +
> > +VHostSCSI *find_vhost_scsi(const char *id)
> > +{
> > +    VHostSCSI *vs;
> > +
> > +    QLIST_FOREACH(vs, &vhost_scsi_list, list) {
> > +        if (strcmp(id, vs->id) == 0) {
> > +            return vs;
> > +        }
> > +    }
> > +    return NULL;
> > +}
> > +
> > +const char *vhost_scsi_get_id(VHostSCSI *vs)
> > +{
> > +    return vs->id;
> > +}
> > +
> > +int vhost_scsi_start(VHostSCSI *vs, VirtIODevice *vdev)
> > +{
> > +    int ret, abi_version;
> > +    struct vhost_scsi_target backend;
> > +
> > +    if (!vhost_dev_query(&vs->dev, vdev)) {
> > +        return -ENOTSUP;
> > +    }
> > +
> > +    vs->dev.nvqs = 3;
> > +    vs->dev.vqs = vs->vqs;
> > +
> > +    ret = vhost_dev_enable_notifiers(&vs->dev, vdev);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    ret = vhost_dev_start(&vs->dev, vdev);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    memset(&backend, 0, sizeof(backend));
> > +    ret = ioctl(vs->dev.control, VHOST_SCSI_GET_ABI_VERSION, &abi_version);
> > +    if (ret < 0) {
> > +        ret = -errno;
> > +        vhost_dev_stop(&vs->dev, vdev);
> > +        return ret;
> > +    }
> > +    if (abi_version > VHOST_SCSI_ABI_VERSION) {
> > +        fprintf(stderr, "The running tcm_vhost kernel abi_version: %d is greater"
> > +               " than vhost_scsi userspace supports: %d\n", abi_version,
> > +               VHOST_SCSI_ABI_VERSION);
> > +        ret = -ENOSYS;
> > +        vhost_dev_stop(&vs->dev, vdev);
> > +        return ret;
> > +    }
> > +    fprintf(stdout, "TCM_vHost ABI version: %d\n", abi_version);
> > +
> > +    pstrcpy((char *)backend.vhost_wwpn, sizeof(backend.vhost_wwpn), vs->wwpn);
> 
> Please change vhost_wwpn to plain char *, then the cast can be removed.
> 

<nod>, changed to char *, and updating tcm_vhost on the kernel side to
do the same.

> > +    backend.vhost_tpgt = vs->tpgt;
> > +    ret = ioctl(vs->dev.control, VHOST_SCSI_SET_ENDPOINT, &backend);
> > +    if (ret < 0) {
> > +        ret = -errno;
> > +        vhost_dev_stop(&vs->dev, vdev);
> > +        return ret;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +void vhost_scsi_stop(VHostSCSI *vs, VirtIODevice *vdev)
> > +{
> > +    int ret;
> > +    struct vhost_scsi_target backend;
> > +
> > +    pstrcpy((char *)backend.vhost_wwpn, sizeof(backend.vhost_wwpn), vs->wwpn);
> 
> Also here.
> 

Done

Thanks for your review Blue!

--nab

^ permalink raw reply

* Re: [RFC-v2 3/6] vhost-scsi: add -vhost-scsi host device for use with tcm-vhost
From: Nicholas A. Bellinger @ 2012-08-14 21:12 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Anthony Liguori, Stefan Hajnoczi, kvm-devel, Jan Kiszka,
	Zhi Yong Wu, qemu-devel, Zhi Yong Wu, Anthony Liguori,
	target-devel, Hannes Reinecke, Paolo Bonzini, lf-virt,
	Christoph Hellwig
In-Reply-To: <20120813085929.GI14081@redhat.com>

On Mon, 2012-08-13 at 11:59 +0300, Michael S. Tsirkin wrote:
> On Mon, Aug 13, 2012 at 08:35:14AM +0000, Nicholas A. Bellinger wrote:
> > From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> > 
> > This patch adds a new type of host device that drives the vhost_scsi
> > device.  The syntax to add vhost-scsi is:
> > 
> >   qemu -vhost-scsi id=vhost-scsi0,wwpn=...,tpgt=123
> > 
> > The virtio-scsi emulated device will make use of vhost-scsi to process
> > virtio-scsi requests inside the kernel and hand them to the in-kernel
> > SCSI target stack using the tcm_vhost fabric driver.
> > 
> > The tcm_vhost driver was merged into the upstream linux kernel for 3.6-rc2,
> > and the commit can be found here:
> > 
> > http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git;a=commitdiff;h=057cbf49a1f08297
> > 
> > Changelog v1 -> v2:
> > 
> > - Expose ABI version via VHOST_SCSI_GET_ABI_VERSION + use Rev 0 as
> >   starting point for v3.6-rc code (Stefan + ALiguori + nab)
> > - Fix upstream qemu conflict in hw/qdev-properties.c
> > - Make GET_ABI_VERSION use int (nab + mst)
> > - Fix vhost-scsi case lables in configure (reported by paolo)
> > - Convert qdev_prop_vhost_scsi to use ->get() + ->set() following
> >   qdev_prop_netdev (reported by paolo)
> > - Fix typo in qemu-options.hx definition of vhost-scsi (reported by paolo)
> > 
> > Changelog v0 -> v1:
> > 
> > - Add VHOST_SCSI_SET_ENDPOINT call (stefan)
> > - Enable vhost notifiers for multiple queues (Zhi)
> > - clear vhost-scsi endpoint on stopped (Zhi)
> > - Add CONFIG_VHOST_SCSI for QEMU build configure (nab)
> > - Rename vhost_vring_target -> vhost_scsi_target (mst + nab)
> > - Add support for VHOST_SCSI_GET_ABI_VERSION ioctl (aliguori + nab)
> > 
> > Cc: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> > Cc: Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
> > Cc: Anthony Liguori <aliguori@us.ibm.com>
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
> 
> 
> Sent mail too fast, sorry. More comments below.
> 
> > ---
> >  configure            |   10 +++
> >  hw/Makefile.objs     |    1 +
> >  hw/qdev-properties.c |   40 ++++++++++++
> >  hw/qdev.h            |    3 +
> >  hw/vhost-scsi.c      |  170 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  hw/vhost-scsi.h      |   50 +++++++++++++++
> >  qemu-common.h        |    1 +
> >  qemu-config.c        |   16 +++++
> >  qemu-options.hx      |    4 +
> >  vl.c                 |   18 +++++
> >  10 files changed, 313 insertions(+), 0 deletions(-)
> >  create mode 100644 hw/vhost-scsi.c
> >  create mode 100644 hw/vhost-scsi.h
> > 
> > diff --git a/configure b/configure
> > index f0dbc03..1f03202 100755
> > --- a/configure
> > +++ b/configure
> > @@ -168,6 +168,7 @@ libattr=""
> >  xfs=""
> >  
> >  vhost_net="no"
> > +vhost_scsi="no"
> >  kvm="no"
> >  gprof="no"
> >  debug_tcg="no"
> > @@ -513,6 +514,7 @@ Haiku)
> >    usb="linux"
> >    kvm="yes"
> >    vhost_net="yes"
> > +  vhost_scsi="yes"
> >    if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
> >      audio_possible_drivers="$audio_possible_drivers fmod"
> >    fi
> > @@ -818,6 +820,10 @@ for opt do
> >    ;;
> >    --enable-vhost-net) vhost_net="yes"
> >    ;;
> > +  --disable-vhost-scsi) vhost_scsi="no"
> > +  ;;
> > +  --enable-vhost-scsi) vhost_scsi="yes"
> > +  ;;
> >    --disable-opengl) opengl="no"
> >    ;;
> >    --enable-opengl) opengl="yes"
> > @@ -3116,6 +3122,7 @@ echo "posix_madvise     $posix_madvise"
> >  echo "uuid support      $uuid"
> >  echo "libcap-ng support $cap_ng"
> >  echo "vhost-net support $vhost_net"
> > +echo "vhost-scsi support $vhost_scsi"
> >  echo "Trace backend     $trace_backend"
> >  echo "Trace output file $trace_file-<pid>"
> >  echo "spice support     $spice"
> > @@ -3828,6 +3835,9 @@ case "$target_arch2" in
> >        if test "$vhost_net" = "yes" ; then
> >          echo "CONFIG_VHOST_NET=y" >> $config_target_mak
> >        fi
> > +      if test "$vhost_scsi" = "yes" ; then
> > +        echo "CONFIG_VHOST_SCSI=y" >> $config_target_mak
> > +      fi
> >      fi
> >  esac
> >  case "$target_arch2" in
> > diff --git a/hw/Makefile.objs b/hw/Makefile.objs
> > index 3ba5dd0..6ab75ec 100644
> > --- a/hw/Makefile.objs
> > +++ b/hw/Makefile.objs
> > @@ -169,6 +169,7 @@ obj-$(CONFIG_VIRTIO) += virtio.o virtio-blk.o virtio-balloon.o virtio-net.o
> >  obj-$(CONFIG_VIRTIO) += virtio-serial-bus.o virtio-scsi.o
> >  obj-$(CONFIG_SOFTMMU) += vhost_net.o
> >  obj-$(CONFIG_VHOST_NET) += vhost.o
> > +obj-$(CONFIG_VHOST_SCSI) += vhost-scsi.o
> >  obj-$(CONFIG_REALLY_VIRTFS) += 9pfs/
> >  obj-$(CONFIG_NO_PCI) += pci-stub.o
> >  obj-$(CONFIG_VGA) += vga.o
> > diff --git a/hw/qdev-properties.c b/hw/qdev-properties.c
> > index 8aca0d4..0266266 100644
> > --- a/hw/qdev-properties.c
> > +++ b/hw/qdev-properties.c
> > @@ -4,6 +4,7 @@
> >  #include "blockdev.h"
> >  #include "hw/block-common.h"
> >  #include "net/hub.h"
> > +#include "vhost-scsi.h"
> >  
> >  void *qdev_get_prop_ptr(DeviceState *dev, Property *prop)
> >  {
> > @@ -696,6 +697,45 @@ PropertyInfo qdev_prop_vlan = {
> >      .set   = set_vlan,
> >  };
> >  
> > +/* --- vhost-scsi --- */
> > +
> > +static int parse_vhost_scsi_dev(DeviceState *dev, const char *str, void **ptr)
> > +{
> > +   VHostSCSI *p;
> > +
> > +   p = find_vhost_scsi(str);
> > +   if (p == NULL)
> > +       return -ENOENT;
> > +
> > +   *ptr = p;
> > +   return 0;
> > +}
> > +
> > +static const char *print_vhost_scsi_dev(void *ptr)
> > +{
> > +    VHostSCSI *p = ptr;
> > +
> > +    return (p) ? vhost_scsi_get_id(p) : "<null>";
> > +}
> > +
> > +static void get_vhost_scsi_dev(Object *obj, Visitor *v, void *opaque,
> > +                       const char *name, Error **errp)
> > +{
> > +    get_pointer(obj, v, opaque, print_vhost_scsi_dev, name, errp);
> > +}
> > +
> > +static void set_vhost_scsi_dev(Object *obj, Visitor *v, void *opaque,
> > +                               const char *name, Error **errp)
> > +{
> > +    set_pointer(obj, v, opaque, parse_vhost_scsi_dev, name, errp);
> > +}
> > +
> > +PropertyInfo qdev_prop_vhost_scsi = {
> > +     .name = "vhost-scsi",
> > +     .get  = get_vhost_scsi_dev,
> > +     .set  = set_vhost_scsi_dev,
> > +};
> > +
> >  /* --- pointer --- */
> >  
> >  /* Not a proper property, just for dirty hacks.  TODO Remove it!  */
> 
> Why does this make sense in the generic qdev-properties?
> There's exactly one device that can use this, no?
> 

Mmmm, not sure on this one either..  Stefan..?

> > diff --git a/hw/qdev.h b/hw/qdev.h
> > index d699194..d5873bb 100644
> > --- a/hw/qdev.h
> > +++ b/hw/qdev.h
> > @@ -238,6 +238,7 @@ extern PropertyInfo qdev_prop_vlan;
> >  extern PropertyInfo qdev_prop_pci_devfn;
> >  extern PropertyInfo qdev_prop_blocksize;
> >  extern PropertyInfo qdev_prop_pci_host_devaddr;
> > +extern PropertyInfo qdev_prop_vhost_scsi;
> >  
> >  #define DEFINE_PROP(_name, _state, _field, _prop, _type) { \
> >          .name      = (_name),                                    \
> > @@ -305,6 +306,8 @@ extern PropertyInfo qdev_prop_pci_host_devaddr;
> >      DEFINE_PROP_DEFAULT(_n, _s, _f, _d, qdev_prop_blocksize, uint16_t)
> >  #define DEFINE_PROP_PCI_HOST_DEVADDR(_n, _s, _f) \
> >      DEFINE_PROP(_n, _s, _f, qdev_prop_pci_host_devaddr, PCIHostDeviceAddress)
> > +#define DEFINE_PROP_VHOST_SCSI(_n, _s, _f)       \
> > +    DEFINE_PROP(_n, _s, _f, qdev_prop_vhost_scsi, VHostSCSI*)
> >
> 
> Can this move to vhost-scsi.c?
>   

Done

> >  #define DEFINE_PROP_END_OF_LIST()               \
> >      {}
> > diff --git a/hw/vhost-scsi.c b/hw/vhost-scsi.c
> > new file mode 100644
> > index 0000000..7145b2d
> > --- /dev/null
> > +++ b/hw/vhost-scsi.c
> > @@ -0,0 +1,170 @@
> > +/*
> > + * vhost_scsi host device
> > + *
> > + * Copyright IBM, Corp. 2011
> > + *
> > + * Authors:
> > + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
> > + *
> > + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> > + * See the COPYING.LIB file in the top-level directory.
> > + *
> > + */
> > +
> > +#include <sys/ioctl.h>
> > +#include "config.h"
> > +#include "qemu-queue.h"
> > +#include "vhost-scsi.h"
> > +#include "vhost.h"
> > +
> > +struct VHostSCSI {
> > +    const char *id;
> > +    const char *wwpn;
> > +    uint16_t tpgt;
> > +    struct vhost_dev dev;
> > +    struct vhost_virtqueue vqs[3];
> 
> Could you add enum for vq numbers pls?
> 

Done

> > +    QLIST_ENTRY(VHostSCSI) list;
> > +};
> > +
> > +static QLIST_HEAD(, VHostSCSI) vhost_scsi_list =
> > +    QLIST_HEAD_INITIALIZER(vhost_scsi_list);
> > +
> > +VHostSCSI *find_vhost_scsi(const char *id)
> > +{
> > +    VHostSCSI *vs;
> > +
> > +    QLIST_FOREACH(vs, &vhost_scsi_list, list) {
> > +        if (strcmp(id, vs->id) == 0) {
> 
> !strcmp
> 

Done

> > +            return vs;
> > +        }
> > +    }
> > +    return NULL;
> > +}
> > +
> > +const char *vhost_scsi_get_id(VHostSCSI *vs)
> > +{
> > +    return vs->id;
> > +}
> > +
> > +int vhost_scsi_start(VHostSCSI *vs, VirtIODevice *vdev)
> > +{
> > +    int ret, abi_version;
> > +    struct vhost_scsi_target backend;
> > +
> > +    if (!vhost_dev_query(&vs->dev, vdev)) {
> > +        return -ENOTSUP;
> > +    }
> > +
> > +    vs->dev.nvqs = 3;
> > +    vs->dev.vqs = vs->vqs;
> > +
> > +    ret = vhost_dev_enable_notifiers(&vs->dev, vdev);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    ret = vhost_dev_start(&vs->dev, vdev);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    memset(&backend, 0, sizeof(backend));
> > +    ret = ioctl(vs->dev.control, VHOST_SCSI_GET_ABI_VERSION, &abi_version);
> > +    if (ret < 0) {
> > +        ret = -errno;
> > +        vhost_dev_stop(&vs->dev, vdev);
> > +        return ret;
> > +    }
> > +    if (abi_version > VHOST_SCSI_ABI_VERSION) {
> > +        fprintf(stderr, "The running tcm_vhost kernel abi_version: %d is greater"
> > +		" than vhost_scsi userspace supports: %d\n", abi_version,
> > +		VHOST_SCSI_ABI_VERSION);
> > +        ret = -ENOSYS;
> > +        vhost_dev_stop(&vs->dev, vdev);
> > +        return ret;
> > +    }
> > +    fprintf(stdout, "TCM_vHost ABI version: %d\n", abi_version);
> > +
> > +    pstrcpy((char *)backend.vhost_wwpn, sizeof(backend.vhost_wwpn), vs->wwpn);
> > +    backend.vhost_tpgt = vs->tpgt;
> > +    ret = ioctl(vs->dev.control, VHOST_SCSI_SET_ENDPOINT, &backend);
> > +    if (ret < 0) {
> > +        ret = -errno;
> > +        vhost_dev_stop(&vs->dev, vdev);
> > +        return ret;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +void vhost_scsi_stop(VHostSCSI *vs, VirtIODevice *vdev)
> > +{
> > +    int ret;
> > +    struct vhost_scsi_target backend;
> > +
> > +    pstrcpy((char *)backend.vhost_wwpn, sizeof(backend.vhost_wwpn), vs->wwpn);
> > +    backend.vhost_tpgt = vs->tpgt;
> > +    ret = ioctl(vs->dev.control, VHOST_SCSI_CLEAR_ENDPOINT, &backend);
> > +    if (ret < 0) {
> > +        fprintf(stderr, "Failed to clear endpoint\n");
> > +    }
> > +
> > +    vhost_dev_stop(&vs->dev, vdev);
> > +}
> > +
> > +static VHostSCSI *vhost_scsi_add(const char *id, const char *wwpn,
> > +                                 uint16_t tpgt)
> > +{
> > +    VHostSCSI *vs = g_malloc0(sizeof(*vs));
> > +    int ret;
> > +
> > +    /* TODO set up vhost-scsi device and bind to tcm_vhost/$wwpm/tpgt_$tpgt */
> > +    fprintf(stderr, "wwpn = \"%s\" tpgt = \"%u\"\n", id, tpgt);
> > +
> 
> Please do not keep debugging fprintfs around.
> 

Dropped

> > +    ret = vhost_dev_init(&vs->dev, -1, "/dev/vhost-scsi", false);
> 
> commented on this separately
> 

...

> > +    if (ret < 0) {
> > +        fprintf(stderr, "vhost-scsi: vhost initialization failed: %s\n",
> > +                strerror(-ret));
> 
> errors should go to monitor, here and elsewhere.
> 

I think this means using monitor_printf() right..?

Looking at that now..

^ permalink raw reply

* Re: [Qemu-devel] [PATCH v8] kvm: notify host when the guest is panicked
From: Marcelo Tosatti @ 2012-08-14 20:53 UTC (permalink / raw)
  To: Anthony Liguori, Wen Congyang
  Cc: Yan Vugenfirer, kvm list, Jan Kiszka,
	linux-kernel@vger.kernel.org, Gleb Natapov, qemu-devel,
	Avi Kivity, KAMEZAWA Hiroyuki
In-Reply-To: <87txw5clmh.fsf@codemonkey.ws>

On Tue, Aug 14, 2012 at 02:35:34PM -0500, Anthony Liguori wrote:
> Marcelo Tosatti <mtosatti@redhat.com> writes:
> 
> > On Tue, Aug 14, 2012 at 01:53:01PM -0500, Anthony Liguori wrote:
> >> Marcelo Tosatti <mtosatti@redhat.com> writes:
> >> 
> >> > On Tue, Aug 14, 2012 at 05:55:54PM +0300, Yan Vugenfirer wrote:
> >> >> 
> >> >> On Aug 14, 2012, at 1:42 PM, Jan Kiszka wrote:
> >> >> 
> >> >> > On 2012-08-14 10:56, Daniel P. Berrange wrote:
> >> >> >> On Mon, Aug 13, 2012 at 03:21:32PM -0300, Marcelo Tosatti wrote:
> >> >> >>> On Wed, Aug 08, 2012 at 10:43:01AM +0800, Wen Congyang wrote:
> >> >> >>>> We can know the guest is panicked when the guest runs on xen.
> >> >> >>>> But we do not have such feature on kvm.
> >> >> >>>> 
> >> >> >>>> Another purpose of this feature is: management app(for example:
> >> >> >>>> libvirt) can do auto dump when the guest is panicked. If management
> >> >> >>>> app does not do auto dump, the guest's user can do dump by hand if
> >> >> >>>> he sees the guest is panicked.
> >> >> >>>> 
> >> >> >>>> We have three solutions to implement this feature:
> >> >> >>>> 1. use vmcall
> >> >> >>>> 2. use I/O port
> >> >> >>>> 3. use virtio-serial.
> >> >> >>>> 
> >> >> >>>> We have decided to avoid touching hypervisor. The reason why I choose
> >> >> >>>> choose the I/O port is:
> >> >> >>>> 1. it is easier to implememt
> >> >> >>>> 2. it does not depend any virtual device
> >> >> >>>> 3. it can work when starting the kernel
> >> >> >>> 
> >> >> >>> How about searching for the "Kernel panic - not syncing" string 
> >> >> >>> in the guests serial output? Say libvirtd could take an action upon
> >> >> >>> that?
> >> >> >> 
> >> >> >> No, this is not satisfactory. It depends on the guest OS being
> >> >> >> configured to use the serial port for console output which we
> >> >> >> cannot mandate, since it may well be required for other purposes.
> >> >> > 
> >> >> Please don't forget Windows guests, there is no console and no "Kernel Panic" string ;)
> >> >> 
> >> >> What I used for debugging purposes on Windows guest is to register a bugcheck callback in virtio-net driver and write 1 to VIRTIO_PCI_ISR register.
> >> >> 
> >> >> Yan. 
> >> >
> >> > Considering whether a "panic-device" should cover other OSes is also \
> >
> >> > something to consider. Even for Linux, is "panic" the only case which
> >> > should be reported via the mechanism? What about oopses without panic? 
> >> >
> >> > Is the mechanism general enough for supporting new events, etc.
> >> 
> >> Hi,
> >> 
> >> I think this discussion is gone of the deep end.
> >> 
> >> Forget about !x86 platforms.  They have their own way to do this sort of
> >> thing.  
> >
> > The panic function in kernel/panic.c has the following options, which
> > appear to be arch independent, on panic:
> >
> > - reboot 
> > - blink
> 
> Not sure the semantics of blink but that might be a good place for a
> pvops hook.
> 
> >
> > None are paravirtual interfaces however.
> >
> >> Think of this feature like a status LED on a motherboard.  These
> >> are very common and usually controlled by IO ports.
> >> 
> >> We're simply reserving a "status LED" for the guest to indicate that it
> >> has paniced.  Let's not over engineer this.
> >
> > My concern is that you end up with state that is dependant on x86.
> >
> > Subject: [PATCH v8 3/6] add a new runstate: RUN_STATE_GUEST_PANICKED
> >
> > Having the ability to stop/restart the guest (and even introducing a 
> > new VM runstate) is more than a status LED analogy.
> 
> I must admit, I don't know why a new runstate is necessary/useful.  The
> kernel shouldn't have to care about the difference between a halted guest
> and a panicked guest.  That level of information belongs in userspace IMHO.
> 
> > Can this new infrastructure be used by other architectures?
> 
> I guess I don't understand why the kernel side of this isn't anything
> more than a paravirt op hook that does a single outb() with the
> remaining logic handled 100% in QEMU.

>From the patch description:

"Another purpose of this feature is: management app(for example:
libvirt) can do auto dump when the guest is panicked. If management
app does not do auto dump, the guest's user can do dump by hand if
he sees the guest is panicked."

Wen, auto dump means dump of guest memory?

In that case, the notification should obviously stop the guest 
otherwise the guest might be reset by the time memdump from QEMU 
monitor runs.

But kexec supports dumping of memory already (i suppose it can 
do automatic dump+{reboot,shutdown}).

> > Do you consider allowing support for Windows as overengineering?
> 
> I don't think there is a way to hook BSOD on Windows so attempting to
> engineer something that works with Windows seems odd, no?

Unsure about hooking at BSOD time. But Windows has configurable 
memory dump/reset/reboot, so yes it should not necessary.

> 
> Regards,
> 
> Anthony Liguori
> 
> >
> >> Regards,
> >> 
> >> Anthony Liguori
> >> 
> >> >
> >> >> 
> >> >> > Well, we have more than a single serial port, even when leaving
> >> >> > virtio-serial aside...
> >> >> > 
> >> >> > Jan
> >> >> > 
> >> >> > -- 
> >> >> > Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
> >> >> > Corporate Competence Center Embedded Linux
> >> >> > --
> >> >> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> >> >> > the body of a message to majordomo@vger.kernel.org
> >> >> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 3/4] vfio: vfio-pci device assignment driver
From: Jan Kiszka @ 2012-08-14 20:33 UTC (permalink / raw)
  To: Alex Williamson
  Cc: aliguori@us.ibm.com, qemu-devel@nongnu.org, kvm@vger.kernel.org,
	aik@ozlabs.ru
In-Reply-To: <1344971368.4683.314.camel@ul30vt.home>

On 2012-08-14 21:09, Alex Williamson wrote:
>>> +static void vfio_map_bar(VFIODevice *vdev, int nr)
>>> +{
>>> +    VFIOBAR *bar = &vdev->bars[nr];
>>> +    unsigned size = bar->size;
>>> +    char name[64];
>>> +    uint32_t pci_bar;
>>> +    uint8_t type;
>>> +    int ret;
>>> +
>>> +    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
>>> +    if (!size) {
>>> +        return;
>>> +    }
>>> +
>>> +    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
>>> +             vdev->host.domain, vdev->host.bus, vdev->host.slot,
>>> +             vdev->host.function, nr);
>>> +
>>> +    /* Determine what type of BAR this is for registration */
>>> +    ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
>>> +                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
>>> +    if (ret != sizeof(pci_bar)) {
>>> +        error_report("vfio: Failed to read BAR %d (%s)\n", nr, strerror(errno));
>>> +        return;
>>> +    }
>>> +
>>> +    pci_bar = le32_to_cpu(pci_bar);
>>> +    type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ?
>>> +           ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK);
>>> +
>>> +    /* A "slow" read/write mapping underlies all BARs */
>>> +    memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
>>> +    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
>>> +
>>> +    /*
>>> +     * We can't mmap areas overlapping the MSIX vector table, so we
>>> +     * potentially insert a direct-mapped subregion before and after it.
>>> +     */
>>> +    if (vdev->msix && vdev->msix->table_bar == nr) {
>>> +        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
>>> +    }
>>> +
>>> +    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
>>
>> This could generate an unterminated name if we actually have to cut the
>> appended string. You could set name[sizeof(name)-1] = 0.
> 
> strncat adds the terminator, that's why we have the -1 so that there's
> space for it.  strlen does not include the terminator.

Yep, you are right, forget what I said.

Jan

-- 
Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
Corporate Competence Center Embedded Linux

^ permalink raw reply

* [PATCH v3 3/4] vfio: vfio-pci device assignment driver
From: Alex Williamson @ 2012-08-14 20:33 UTC (permalink / raw)
  To: aliguori; +Cc: qemu-devel, kvm, jan.kiszka, avi, blauwirbel, aik
In-Reply-To: <20120814202141.11522.78340.stgit@bling.home>

This adds the core of the QEMU VFIO-based PCI device assignment driver.
To make use of this driver, enable CONFIG_VFIO, CONFIG_VFIO_IOMMU_TYPE1,
and CONFIG_VFIO_PCI in your host Linux kernel config.  Load the vfio-pci
module.  To assign device 0000:05:00.0 to a guest, do the following:

for dev in $(ls /sys/bus/pci/devices/0000:05:00.0/iommu_group/devices); do
    vendor=$(cat /sys/bus/pci/devices/$dev/vendor)
    device=$(cat /sys/bus/pci/devices/$dev/device)
    if [ -e /sys/bus/pci/devices/$dev/driver ]; then
        echo $dev > /sys/bus/pci/devices/$dev/driver/unbind
    fi
    echo $vendor $device > /sys/bus/pci/drivers/vfio-pci/new_id
done

See Documentation/vfio.txt in the Linux kernel tree for further
description of IOMMU groups and VFIO.

Then launch qemu including the option:

-device vfio-pci,host=0000:05:00.0

Support for legacy PCI interrupts (INTx) is not yet included and will
be added in a future update.  Both MSI and MSI-X are supported here.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 hw/vfio_pci.c     | 1870 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/vfio_pci_int.h |  112 +++
 2 files changed, 1982 insertions(+)
 create mode 100644 hw/vfio_pci.c
 create mode 100644 hw/vfio_pci_int.h

diff --git a/hw/vfio_pci.c b/hw/vfio_pci.c
new file mode 100644
index 0000000..3343479
--- /dev/null
+++ b/hw/vfio_pci.c
@@ -0,0 +1,1870 @@
+/*
+ * vfio based device assignment support
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Based on qemu-kvm device-assignment:
+ *  Adapted for KVM by Qumranet.
+ *  Copyright (c) 2007, Neocleus, Alex Novik (alex@neocleus.com)
+ *  Copyright (c) 2007, Neocleus, Guy Zana (guy@neocleus.com)
+ *  Copyright (C) 2008, Qumranet, Amit Shah (amit.shah@qumranet.com)
+ *  Copyright (C) 2008, Red Hat, Amit Shah (amit.shah@redhat.com)
+ *  Copyright (C) 2008, IBM, Muli Ben-Yehuda (muli@il.ibm.com)
+ */
+
+#include <dirent.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <linux/vfio.h>
+
+#include "config.h"
+#include "event_notifier.h"
+#include "exec-memory.h"
+#include "kvm.h"
+#include "memory.h"
+#include "msi.h"
+#include "msix.h"
+#include "qemu-error.h"
+#include "range.h"
+#include "vfio_pci_int.h"
+
+/* #define DEBUG_VFIO */
+#ifdef DEBUG_VFIO
+#define DPRINTF(fmt, ...) \
+    do { fprintf(stderr, "vfio: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+#define MSIX_CAP_LENGTH 12
+
+static QLIST_HEAD(, VFIOContainer)
+    container_list = QLIST_HEAD_INITIALIZER(container_list);
+
+static QLIST_HEAD(, VFIOGroup)
+    group_list = QLIST_HEAD_INITIALIZER(group_list);
+
+static void vfio_disable_interrupts(VFIODevice *vdev);
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len);
+
+/*
+ * Common VFIO interrupt disable
+ */
+static void vfio_disable_irqindex(VFIODevice *vdev, int index)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER,
+        .index = index,
+        .start = 0,
+        .count = 0,
+    };
+
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+
+    vdev->interrupt = VFIO_INT_NONE;
+}
+
+/*
+ * INTx
+ */
+static void vfio_unmask_intx(VFIODevice *vdev)
+{
+    struct vfio_irq_set irq_set = {
+        .argsz = sizeof(irq_set),
+        .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK,
+        .index = VFIO_PCI_INTX_IRQ_INDEX,
+        .start = 0,
+        .count = 1,
+    };
+
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set);
+}
+
+static void vfio_intx_interrupt(void *opaque)
+{
+    VFIODevice *vdev = opaque;
+
+    if (!event_notifier_test_and_clear(&vdev->intx.interrupt)) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) Pin %c\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function,
+            'A' + vdev->intx.pin);
+
+    vdev->intx.pending = true;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 1);
+}
+
+static void vfio_eoi(VFIODevice *vdev)
+{
+    if (!vdev->intx.pending) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) EOI\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+    vdev->intx.pending = false;
+    qemu_set_irq(vdev->pdev.irq[vdev->intx.pin], 0);
+    vfio_unmask_intx(vdev);
+}
+
+struct vfio_irq_set_fd {
+    struct vfio_irq_set irq_set;
+    int32_t fd;
+} QEMU_PACKED;
+
+static void vfio_enable_intx_kvm(VFIODevice *vdev)
+{
+#ifdef CONFIG_KVM
+    /*
+     * VFIO supports an eventfd for INTx notification and an irqfd-like
+     * mechanism for unmasking INTx.  If we could get a level irqfd in
+     * KVM and an eventfd triggered on EOI from guest, we could interlock
+     * these and avoid userspace for INTx.  Work in progress.
+     */
+#endif
+}
+
+static void vfio_disable_intx_kvm(VFIODevice *vdev)
+{
+#ifdef CONFIG_KVM
+    /* Same. */
+#endif
+}
+
+/* TODO: Move this helper out to generic PCI code */
+static bool vfio_intx_route_changed(PCIINTxRoute *old, PCIINTxRoute *new)
+{
+    return old->mode != new->mode || old->irq != new->irq;
+}
+
+static void vfio_update_irq(PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    PCIINTxRoute route;
+
+    if (vdev->interrupt != VFIO_INT_INTx) {
+        return;
+    }
+
+    route = pci_device_route_intx_to_irq(&vdev->pdev, vdev->intx.pin);
+    if (!vfio_intx_route_changed(&vdev->intx.route, &route)) {
+        return; /* Nothing changed */
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) IRQ moved %d -> %d\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vdev->intx.route.irq, route.irq);
+
+    vfio_disable_intx_kvm(vdev);
+    /* TBD - Disable QEMU eoi notifier */
+
+    vdev->intx.route = route;
+
+    if (route.mode == PCI_INTX_DISABLED) {
+        return;
+    }
+
+    /* TBD - Enable QEMU eoi notifier */
+    vfio_enable_intx_kvm(vdev);
+
+    /* Re-enable the interrupt in cased we missed an EOI */
+    vfio_eoi(vdev);
+}
+
+static int vfio_enable_intx(VFIODevice *vdev)
+{
+    struct vfio_irq_set_fd irq_set_fd = {
+        .irq_set = {
+            .argsz = sizeof(irq_set_fd),
+            .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+            .index = VFIO_PCI_INTX_IRQ_INDEX,
+            .start = 0,
+            .count = 1,
+        },
+    };
+    uint8_t pin = vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1);
+    int ret;
+
+    if (!pin) {
+        return 0;
+    }
+
+    vfio_disable_interrupts(vdev);
+
+    vdev->intx.pin = pin - 1; /* Pin A (1) -> irq[0] */
+    vdev->intx.route = pci_device_route_intx_to_irq(&vdev->pdev,
+                                                    vdev->intx.pin);
+    /* TBD - Enable QEMU eoi notifier */
+
+    ret = event_notifier_init(&vdev->intx.interrupt, 0);
+    if (ret) {
+        error_report("vfio: Error: event_notifier_init failed\n");
+        return ret;
+    }
+
+    irq_set_fd.fd = event_notifier_get_fd(&vdev->intx.interrupt);
+    qemu_set_fd_handler(irq_set_fd.fd, vfio_intx_interrupt, NULL, vdev);
+
+    if (ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd)) {
+        error_report("vfio: Error: Failed to setup INTx fd: %m\n");
+        return -errno;
+    }
+
+    vfio_enable_intx_kvm(vdev);
+
+    vdev->interrupt = VFIO_INT_INTx;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+    return 0;
+}
+
+static void vfio_disable_intx(VFIODevice *vdev)
+{
+    int fd;
+
+    vfio_disable_intx_kvm(vdev);
+    vfio_disable_irqindex(vdev, VFIO_PCI_INTX_IRQ_INDEX);
+
+    /* TBD - Disable QEMU eoi notifier */
+
+    fd = event_notifier_get_fd(&vdev->intx.interrupt);
+    qemu_set_fd_handler(fd, NULL, NULL, vdev);
+    event_notifier_cleanup(&vdev->intx.interrupt);
+
+    vdev->interrupt = VFIO_INT_NONE;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+}
+
+/*
+ * MSI/X
+ */
+static void vfio_msi_interrupt(void *opaque)
+{
+    VFIOMSIVector *vec = opaque;
+    VFIODevice *vdev = vec->vdev;
+
+    if (!event_notifier_test_and_clear(&vec->interrupt)) {
+        return;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) vector %ld\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vec - vdev->msi_vectors);
+
+    if (vdev->interrupt == VFIO_INT_MSIX) {
+        msix_notify(&vdev->pdev, vec - vdev->msi_vectors);
+    } else if (vdev->interrupt == VFIO_INT_MSI) {
+        msi_notify(&vdev->pdev, vec - vdev->msi_vectors);
+    } else {
+        error_report("vfio: MSI interrupt receieved, but not enabled?\n");
+    }
+}
+
+static int vfio_enable_vectors(VFIODevice *vdev, bool msix)
+{
+    struct vfio_irq_set *irq_set;
+    int ret = 0, i, argsz;
+    int32_t *fds;
+
+    argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
+
+    irq_set = g_malloc0(argsz);
+    irq_set->argsz = argsz;
+    irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
+    irq_set->index = msix ? VFIO_PCI_MSIX_IRQ_INDEX : VFIO_PCI_MSI_IRQ_INDEX;
+    irq_set->start = 0;
+    irq_set->count = vdev->nr_vectors;
+    fds = (int32_t *)&irq_set->data;
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        if (!vdev->msi_vectors[i].use) {
+            fds[i] = -1;
+            continue;
+        }
+
+        fds[i] = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
+    }
+
+    ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+    g_free(irq_set);
+
+    if (!ret) {
+        vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;
+    }
+
+    return ret;
+}
+
+static int vfio_msix_vector_use(PCIDevice *pdev,
+                                unsigned int vector, MSIMessage msg)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    int ret, fd;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vector);
+
+    if (vdev->interrupt != VFIO_INT_MSIX) {
+        vfio_disable_interrupts(vdev);
+    }
+
+    if (!vdev->msi_vectors) {
+        vdev->msi_vectors = g_malloc0(vdev->msix->entries *
+                                      sizeof(VFIOMSIVector));
+    }
+
+    vdev->msi_vectors[vector].vdev = vdev;
+    vdev->msi_vectors[vector].use = true;
+
+    msix_vector_use(pdev, vector);
+
+    if (event_notifier_init(&vdev->msi_vectors[vector].interrupt, 0)) {
+        error_report("vfio: Error: event_notifier_init failed\n");
+    }
+
+    fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
+
+    /*
+     * Attempt to enable route through KVM irqchip,
+     * default to userspace handling if unavailable.
+     */
+    vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
+    if (vdev->msi_vectors[vector].virq < 0 ||
+        kvm_irqchip_add_irqfd(kvm_state, fd,
+                              vdev->msi_vectors[vector].virq) < 0) {
+        if (vdev->msi_vectors[vector].virq >= 0) {
+            kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[vector].virq);
+            vdev->msi_vectors[vector].virq = -1;
+        }
+        qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
+                            &vdev->msi_vectors[vector]);
+    }
+
+    /*
+     * We don't want to have the host allocate all possible MSI vectors
+     * for a device if they're not in use, so we shutdown and incrementally
+     * increase them as needed.
+     */
+    if (vdev->nr_vectors < vector + 1) {
+        int i;
+
+        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
+        vdev->nr_vectors = vector + 1;
+        ret = vfio_enable_vectors(vdev, true);
+        if (ret) {
+            error_report("vfio: failed to enable vectors, %d\n", ret);
+        }
+
+        /* We don't know if we've missed interrupts in the interim... */
+        for (i = 0; i < vdev->msix->entries; i++) {
+            if (vdev->msi_vectors[i].use) {
+                msix_notify(&vdev->pdev, i);
+            }
+        }
+    } else {
+        struct vfio_irq_set_fd irq_set_fd = {
+            .irq_set = {
+                .argsz = sizeof(irq_set_fd),
+                .flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                         VFIO_IRQ_SET_ACTION_TRIGGER,
+                .index = VFIO_PCI_MSIX_IRQ_INDEX,
+                .start = vector,
+                .count = 1,
+            },
+            .fd = fd,
+        };
+        ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd);
+        if (ret) {
+            error_report("vfio: failed to modify vector, %d\n", ret);
+        }
+
+        /*
+         * If we were connected to the hardware PBA we could skip this,
+         * until then, a spurious interrupt is better than starvation.
+         */
+        msix_notify(&vdev->pdev, vector);
+    }
+
+    return 0;
+}
+
+static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int vector)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    struct vfio_irq_set_fd irq_set_fd = {
+        .irq_set = {
+            .argsz = sizeof(irq_set_fd),
+            .flags = VFIO_IRQ_SET_DATA_EVENTFD |
+                     VFIO_IRQ_SET_ACTION_TRIGGER,
+            .index = VFIO_PCI_MSIX_IRQ_INDEX,
+            .start = vector,
+            .count = 1,
+        },
+        .fd = -1,
+    };
+    int fd;
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vector);
+
+    /*
+     * XXX What's the right thing to do here?  This turns off the interrupt
+     * completely, but do we really just want to switch the interrupt to
+     * bouncing through userspace and let msix.c drop it?  Not sure.
+     */
+    msix_vector_unuse(pdev, vector);
+    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd);
+
+    fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
+
+    if (vdev->msi_vectors[vector].virq < 0) {
+        qemu_set_fd_handler(fd, NULL, NULL, NULL);
+    } else {
+        kvm_irqchip_remove_irqfd(kvm_state, fd, vdev->msi_vectors[vector].virq);
+        kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[vector].virq);
+        vdev->msi_vectors[vector].virq = -1;
+    }
+
+    event_notifier_cleanup(&vdev->msi_vectors[vector].interrupt);
+    vdev->msi_vectors[vector].use = false;
+}
+
+/* TODO This should move to msi.c */
+static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector)
+{
+    uint16_t flags = pci_get_word(pdev->config + pdev->msi_cap + PCI_MSI_FLAGS);
+    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
+    MSIMessage msg;
+
+    if (msi64bit) {
+        msg.address = pci_get_quad(pdev->config +
+                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
+    } else {
+        msg.address = pci_get_long(pdev->config +
+                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
+    }
+
+    msg.data = pci_get_word(pdev->config + pdev->msi_cap +
+                            (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32));
+    msg.data += vector;
+
+    return msg;
+}
+
+/* So should this */
+static void msi_set_qsize(PCIDevice *pdev, uint8_t size)
+{
+    uint8_t *config = pdev->config + pdev->msi_cap;
+    uint16_t flags;
+
+    flags = pci_get_word(config + PCI_MSI_FLAGS);
+    flags = le16_to_cpu(flags);
+    flags &= ~PCI_MSI_FLAGS_QSIZE;
+    flags |= (size & 0x7) << 4;
+    flags = cpu_to_le16(flags);
+    pci_set_word(config + PCI_MSI_FLAGS, flags);
+}
+
+static void vfio_enable_msi(VFIODevice *vdev)
+{
+    int ret, i;
+
+    vfio_disable_interrupts(vdev);
+
+    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
+retry:
+    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        MSIMessage msg;
+        int fd;
+
+        vdev->msi_vectors[i].vdev = vdev;
+        vdev->msi_vectors[i].use = true;
+
+        if (event_notifier_init(&vdev->msi_vectors[i].interrupt, 0)) {
+            error_report("vfio: Error: event_notifier_init failed\n");
+        }
+
+        fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
+
+        msg = msi_get_msg(&vdev->pdev, i);
+
+        /*
+         * Attempt to enable route through KVM irqchip,
+         * default to userspace handling if unavailable.
+         */
+        vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
+        if (vdev->msi_vectors[i].virq < 0 ||
+            kvm_irqchip_add_irqfd(kvm_state, fd,
+                                  vdev->msi_vectors[i].virq) < 0) {
+            qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
+                                &vdev->msi_vectors[i]);
+        }
+    }
+
+    ret = vfio_enable_vectors(vdev, false);
+    if (ret) {
+        if (ret < 0) {
+            error_report("vfio: Error: Failed to setup MSI fds: %m\n");
+        } else if (ret != vdev->nr_vectors) {
+            error_report("vfio: Error: Failed to enable %d "
+                         "MSI vectors, retry with %d\n", vdev->nr_vectors, ret);
+        }
+
+        for (i = 0; i < vdev->nr_vectors; i++) {
+            int fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
+            if (vdev->msi_vectors[i].virq >= 0) {
+                kvm_irqchip_remove_irqfd(kvm_state, fd,
+                                         vdev->msi_vectors[i].virq);
+                kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[i].virq);
+                vdev->msi_vectors[i].virq = -1;
+            } else {
+                qemu_set_fd_handler(fd, NULL, NULL, NULL);
+            }
+            event_notifier_cleanup(&vdev->msi_vectors[i].interrupt);
+        }
+
+        g_free(vdev->msi_vectors);
+
+        if (ret > 0 && ret != vdev->nr_vectors) {
+            vdev->nr_vectors = ret;
+            goto retry;
+        }
+        vdev->nr_vectors = 0;
+
+        return;
+    }
+
+    msi_set_qsize(&vdev->pdev, vdev->nr_vectors);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, vdev->nr_vectors);
+}
+
+static void vfio_disable_msi_x(VFIODevice *vdev, bool msix)
+{
+    int i;
+
+    vfio_disable_irqindex(vdev, msix ? VFIO_PCI_MSIX_IRQ_INDEX :
+                                       VFIO_PCI_MSI_IRQ_INDEX);
+
+    for (i = 0; i < vdev->nr_vectors; i++) {
+        int fd;
+
+        if (!vdev->msi_vectors[i].use) {
+            continue;
+        }
+
+        fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
+
+        if (vdev->msi_vectors[i].virq >= 0) {
+            kvm_irqchip_remove_irqfd(kvm_state, fd, vdev->msi_vectors[i].virq);
+            kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[i].virq);
+            vdev->msi_vectors[i].virq = -1;
+        } else {
+            qemu_set_fd_handler(fd, NULL, NULL, NULL);
+        }
+
+        if (msix) {
+            msix_vector_unuse(&vdev->pdev, i);
+        }
+
+        event_notifier_cleanup(&vdev->msi_vectors[i].interrupt);
+    }
+
+    g_free(vdev->msi_vectors);
+    vdev->msi_vectors = NULL;
+    vdev->nr_vectors = 0;
+
+    if (!msix) {
+        msi_set_qsize(&vdev->pdev, 0); /* Actually still means 1 vector */
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, msi%s)\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, msix ? "x" : "");
+
+    vfio_enable_intx(vdev);
+}
+
+/*
+ * IO Port/MMIO - Beware of the endians, VFIO is always little endian
+ */
+static void vfio_bar_write(void *opaque, target_phys_addr_t addr,
+                           uint64_t data, unsigned size)
+{
+    VFIOBAR *bar = opaque;
+    union {
+        uint8_t buf[8];
+        uint64_t foo; /* For alignment */
+    } u;
+
+    switch (size) {
+    case 1:
+        *u.buf = data & 0xff;
+        break;
+    case 2:
+        *(uint16_t *)u.buf = cpu_to_le16(data);
+        break;
+    case 4:
+        *(uint32_t *)u.buf = cpu_to_le32(data);
+        break;
+    default:
+        hw_error("vfio: unsupported write size, %d bytes\n", size);
+        break;
+    }
+
+    if (pwrite(bar->fd, u.buf, size, bar->fd_offset + addr) != size) {
+        error_report("%s(,0x%"PRIx64", 0x%"PRIx64", %d) failed: %m\n",
+                     __func__, addr, data, size);
+    }
+
+    DPRINTF("%s(BAR%d+0x%"PRIx64", 0x%"PRIx64", %d)\n",
+            __func__, bar->nr, addr, data, size);
+}
+
+static uint64_t vfio_bar_read(void *opaque,
+                              target_phys_addr_t addr, unsigned size)
+{
+    VFIOBAR *bar = opaque;
+    union {
+        uint8_t buf[8];
+        uint64_t foo; /* For alignment */
+    } u;
+    uint64_t data = 0;
+
+    if (pread(bar->fd, u.buf, size, bar->fd_offset + addr) != size) {
+        error_report("%s(,0x%"PRIx64", %d) failed: %m\n",
+                     __func__, addr, size);
+        return (uint64_t)-1;
+    }
+
+    switch (size) {
+    case 1:
+        data = u.buf[0];
+        break;
+    case 2:
+        data = le16_to_cpu(*(uint16_t *)u.buf);
+        break;
+    case 4:
+        data = le32_to_cpu(*(uint32_t *)u.buf);
+        break;
+    default:
+        hw_error("vfio: unsupported read size, %d bytes\n", size);
+        break;
+    }
+
+    DPRINTF("%s(BAR%d+0x%"PRIx64", %d) = 0x%"PRIx64"\n",
+            __func__, bar->nr, addr, size, data);
+
+    return data;
+}
+
+static const MemoryRegionOps vfio_bar_ops = {
+    .read = vfio_bar_read,
+    .write = vfio_bar_write,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+/*
+ * PCI config space
+ */
+static uint32_t vfio_pci_read_config(PCIDevice *pdev, uint32_t addr, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    uint32_t val = 0;
+
+    /*
+     * We only need QEMU PCI config support for the ROM BAR, the MSI and MSIX
+     * capabilities, and the multifunction bit below.  We let VFIO handle
+     * virtualizing everything else.  Performance is not a concern here.
+     */
+    if (ranges_overlap(addr, len, PCI_ROM_ADDRESS, 4) ||
+        (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
+         ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) ||
+        (pdev->cap_present & QEMU_PCI_CAP_MSI &&
+         ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size))) {
+
+        val = pci_default_read_config(pdev, addr, len);
+    } else {
+        if (pread(vdev->fd, &val, len, vdev->config_offset + addr) != len) {
+            error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x) failed: %m\n",
+                         __func__, vdev->host.domain, vdev->host.bus,
+                         vdev->host.slot, vdev->host.function, addr, len);
+            return -errno;
+        }
+        val = le32_to_cpu(val);
+    }
+
+    /* Multifunction bit is virualized in QEMU */
+    if (unlikely(ranges_overlap(addr, len, PCI_HEADER_TYPE, 1))) {
+        uint32_t mask = PCI_HEADER_TYPE_MULTI_FUNCTION;
+
+        if (len == 4) {
+            mask <<= 16;
+        }
+
+        if (pdev->cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+            val |= mask;
+        } else {
+            val &= ~mask;
+        }
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, len=0x%x) %x\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, addr, len, val);
+
+    return val;
+}
+
+static void vfio_pci_write_config(PCIDevice *pdev, uint32_t addr,
+                                  uint32_t val, int len)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    uint32_t val_le = cpu_to_le32(val);
+
+    DPRINTF("%s(%04x:%02x:%02x.%x, @0x%x, 0x%x, len=0x%x)\n", __func__,
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, addr, val, len);
+
+    /* Write everything to VFIO, let it filter out what we can't write */
+    if (pwrite(vdev->fd, &val_le, len, vdev->config_offset + addr) != len) {
+        error_report("%s(%04x:%02x:%02x.%x, 0x%x, 0x%x, 0x%x) failed: %m\n",
+                     __func__, vdev->host.domain, vdev->host.bus,
+                     vdev->host.slot, vdev->host.function, addr, val, len);
+    }
+
+    /* Write standard header bits to emulation */
+    if (addr < PCI_CONFIG_HEADER_SIZE) {
+        pci_default_write_config(pdev, addr, val, len);
+        return;
+    }
+
+    /* MSI/MSI-X Enabling/Disabling */
+    if (pdev->cap_present & QEMU_PCI_CAP_MSI &&
+        ranges_overlap(addr, len, pdev->msi_cap, vdev->msi_cap_size)) {
+        int is_enabled, was_enabled = msi_enabled(pdev);
+
+        pci_default_write_config(pdev, addr, val, len);
+
+        is_enabled = msi_enabled(pdev);
+
+        if (!was_enabled && is_enabled) {
+            vfio_enable_msi(vdev);
+        } else if (was_enabled && !is_enabled) {
+            vfio_disable_msi_x(vdev, false);
+        }
+    }
+
+    if (pdev->cap_present & QEMU_PCI_CAP_MSIX &&
+        ranges_overlap(addr, len, pdev->msix_cap, MSIX_CAP_LENGTH)) {
+        int is_enabled, was_enabled = msix_enabled(pdev);
+
+        pci_default_write_config(pdev, addr, val, len);
+
+        is_enabled = msix_enabled(pdev);
+
+        if (!was_enabled && is_enabled) {
+            /* vfio_msix_vector_use handles this automatically */
+        } else if (was_enabled && !is_enabled) {
+            vfio_disable_msi_x(vdev, true);
+        }
+    }
+}
+
+/*
+ * DMA - Mapping and unmapping for the "type1" IOMMU interface used on x86
+ */
+static int vfio_dma_map(VFIOContainer *container, target_phys_addr_t iova,
+                        ram_addr_t size, void *vaddr, bool readonly)
+{
+    struct vfio_iommu_type1_dma_map map = {
+        .argsz = sizeof(map),
+        .flags = VFIO_DMA_MAP_FLAG_READ,
+        .vaddr = (__u64)vaddr,
+        .iova = iova,
+        .size = size,
+    };
+
+    if (!readonly) {
+        map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
+    }
+
+    if (ioctl(container->fd, VFIO_IOMMU_MAP_DMA, &map)) {
+        DPRINTF("VFIO_MAP_DMA: %d\n", -errno);
+        return -errno;
+    }
+
+    return 0;
+}
+
+static int vfio_dma_unmap(VFIOContainer *container,
+                          target_phys_addr_t iova, ram_addr_t size)
+{
+    struct vfio_iommu_type1_dma_unmap unmap = {
+        .argsz = sizeof(unmap),
+        .flags = 0,
+        .iova = iova,
+        .size = size,
+    };
+
+    if (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        DPRINTF("VFIO_UNMAP_DMA: %d\n", -errno);
+        return -errno;
+    }
+
+    return 0;
+}
+
+static void vfio_listener_dummy1(MemoryListener *listener)
+{
+    /* We don't do batching (begin/commit) or care about logging */
+}
+
+static void vfio_listener_dummy2(MemoryListener *listener,
+                                 MemoryRegionSection *section)
+{
+    /* We don't do logging or care about nops */
+}
+
+static void vfio_listener_dummy3(MemoryListener *listener,
+                                 MemoryRegionSection *section,
+                                 bool match_data, uint64_t data,
+                                 EventNotifier *e)
+{
+    /* We don't care about eventfds */
+}
+
+static bool vfio_listener_skipped_section(MemoryRegionSection *section)
+{
+    return !memory_region_is_ram(section->mr);
+}
+
+static void vfio_listener_region_add(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer,
+                                            iommu_data.listener);
+    target_phys_addr_t iova, end;
+    void *vaddr;
+    int ret;
+
+    if (vfio_listener_skipped_section(section)) {
+        DPRINTF("vfio: SKIPPING region_add %016lx - %016lx\n",
+                section->offset_within_address_space,
+                section->offset_within_address_space + section->size - 1);
+        return;
+    }
+
+    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+        error_report("%s received unaligned region\n", __func__);
+        return;
+    }
+
+    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    end = (section->offset_within_address_space + section->size) &
+          TARGET_PAGE_MASK;
+
+    if (iova >= end) {
+        return;
+    }
+
+    vaddr = memory_region_get_ram_ptr(section->mr) +
+            section->offset_within_region +
+            (iova - section->offset_within_address_space);
+
+    DPRINTF("vfio: region_add %016lx - %016lx [%p]\n",
+            iova, end - 1, vaddr);
+
+    ret = vfio_dma_map(container, iova, end - iova, vaddr, section->readonly);
+    if (ret) {
+        error_report("vfio_dma_map(%p, 0x%016lx, 0x%lx, %p) = %d (%m)\n",
+                     container, iova, end - iova, vaddr, ret);
+    }
+}
+
+static void vfio_listener_region_del(MemoryListener *listener,
+                                     MemoryRegionSection *section)
+{
+    VFIOContainer *container = container_of(listener, VFIOContainer,
+                                            iommu_data.listener);
+    target_phys_addr_t iova, end;
+    int ret;
+
+    if (vfio_listener_skipped_section(section)) {
+        DPRINTF("vfio: SKIPPING region_del %016lx - %016lx\n",
+                section->offset_within_address_space,
+                section->offset_within_address_space + section->size - 1);
+        return;
+    }
+
+    if (unlikely((section->offset_within_address_space & ~TARGET_PAGE_MASK) !=
+                 (section->offset_within_region & ~TARGET_PAGE_MASK))) {
+        error_report("%s received unaligned region\n", __func__);
+        return;
+    }
+
+    iova = TARGET_PAGE_ALIGN(section->offset_within_address_space);
+    end = (section->offset_within_address_space + section->size) &
+          TARGET_PAGE_MASK;
+
+    if (iova >= end) {
+        return;
+    }
+
+    DPRINTF("vfio: region_del %016lx - %016lx\n", iova, end - 1);
+
+    ret = vfio_dma_unmap(container, iova, end - iova);
+    if (ret) {
+        error_report("vfio_dma_unmap(%p, 0x%016lx, 0x%lx) = %d (%m)\n",
+                     container, iova, end - iova, ret);
+    }
+}
+
+static MemoryListener vfio_memory_listener = {
+    .begin = vfio_listener_dummy1,
+    .commit = vfio_listener_dummy1,
+    .region_add = vfio_listener_region_add,
+    .region_del = vfio_listener_region_del,
+    .region_nop = vfio_listener_dummy2,
+    .log_start = vfio_listener_dummy2,
+    .log_stop = vfio_listener_dummy2,
+    .log_sync = vfio_listener_dummy2,
+    .log_global_start = vfio_listener_dummy1,
+    .log_global_stop = vfio_listener_dummy1,
+    .eventfd_add = vfio_listener_dummy3,
+    .eventfd_del = vfio_listener_dummy3,
+};
+
+static void vfio_listener_release(VFIOContainer *container)
+{
+    memory_listener_unregister(&container->iommu_data.listener);
+}
+
+/*
+ * Interrupt setup
+ */
+static void vfio_disable_interrupts(VFIODevice *vdev)
+{
+    switch (vdev->interrupt) {
+    case VFIO_INT_INTx:
+        vfio_disable_intx(vdev);
+        break;
+    case VFIO_INT_MSI:
+        vfio_disable_msi_x(vdev, false);
+        break;
+    case VFIO_INT_MSIX:
+        vfio_disable_msi_x(vdev, true);
+        break;
+    }
+}
+
+static int vfio_setup_msi(VFIODevice *vdev, int pos)
+{
+    uint16_t ctrl;
+    bool msi_64bit, msi_maskbit;
+    int ret, entries;
+
+    /*
+     * TODO: don't peek into msi_supported, let msi_init fail and
+     * check for ENOTSUP
+     */
+    if (!msi_supported) {
+        return 0;
+    }
+
+    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
+              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
+        return -errno;
+    }
+    ctrl = le16_to_cpu(ctrl);
+
+    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
+    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
+    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
+
+    DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function, pos);
+
+    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
+    if (ret < 0) {
+        error_report("vfio: msi_init failed\n");
+        return ret;
+    }
+    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
+
+    return 0;
+}
+
+/*
+ * We don't have any control over how pci_add_capability() inserts
+ * capabilities into the chain.  In order to setup MSI-X we need a
+ * MemoryRegion for the BAR.  In order to setup the BAR and not
+ * attempt to mmap the MSI-X table area, which VFIO won't allow, we
+ * need to first look for where the MSI-X table lives.  So we
+ * unfortunately split MSI-X setup across two functions.
+ */
+static int vfio_early_setup_msix(VFIODevice *vdev)
+{
+    uint8_t pos;
+    uint16_t ctrl;
+    uint32_t table, pba;
+
+    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
+    if (!pos) {
+        return 0;
+    }
+
+    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
+              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
+        return -errno;
+    }
+
+    if (pread(vdev->fd, &table, sizeof(table),
+              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
+        return -errno;
+    }
+
+    if (pread(vdev->fd, &pba, sizeof(pba),
+              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
+        return -errno;
+    }
+
+    ctrl = le16_to_cpu(ctrl);
+    table = le32_to_cpu(table);
+    pba = le32_to_cpu(pba);
+
+    vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
+    vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
+    vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
+
+    DPRINTF("%04x:%02x:%02x.%x "
+            "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function, pos, vdev->msix->table_bar,
+            vdev->msix->table_offset, vdev->msix->entries);
+
+    return 0;
+}
+
+static int vfio_setup_msix(VFIODevice *vdev, int pos)
+{
+    int ret;
+
+    /*
+     * TODO: don't peek into msi_supported, let msix_init fail and
+     * check for ENOTSUP
+     */
+    if (!msi_supported) {
+        return 0;
+    }
+
+    ret = msix_init(&vdev->pdev, vdev->msix->entries,
+                    &vdev->bars[vdev->msix->table_bar].mem,
+                    vdev->msix->table_bar, vdev->msix->table_offset,
+                    &vdev->bars[vdev->msix->pba_bar].mem,
+                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
+    if (ret < 0) {
+        error_report("vfio: msix_init failed\n");
+        return ret;
+    }
+
+    ret = msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
+                                    vfio_msix_vector_release);
+    if (ret) {
+        error_report("vfio: msix_set_vector_notifiers failed %d\n", ret);
+        msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
+                    &vdev->bars[vdev->msix->pba_bar].mem);
+        return ret;
+    }
+
+    return 0;
+}
+
+static void vfio_teardown_msi(VFIODevice *vdev)
+{
+    msi_uninit(&vdev->pdev);
+
+    if (vdev->msix) {
+        /* FIXME: Why can't unset just silently do nothing?? */
+        if (vdev->pdev.msix_vector_use_notifier &&
+            vdev->pdev.msix_vector_release_notifier) {
+            msix_unset_vector_notifiers(&vdev->pdev);
+        }
+
+        msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
+                    &vdev->bars[vdev->msix->pba_bar].mem);
+    }
+}
+
+/*
+ * Resource setup
+ */
+static void vfio_unmap_bar(VFIODevice *vdev, int nr)
+{
+    VFIOBAR *bar = &vdev->bars[nr];
+
+    if (!bar->size) {
+        return;
+    }
+
+    memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
+    munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
+
+    if (vdev->msix && vdev->msix->table_bar == nr) {
+        memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
+        munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
+    }
+
+    memory_region_destroy(&bar->mem);
+}
+
+static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem,
+                         void **map, size_t size, off_t offset,
+                         const char *name)
+{
+    int ret = 0;
+
+    if (size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
+        int prot = 0;
+
+        if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
+            prot |= PROT_READ;
+        }
+
+        if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
+            prot |= PROT_WRITE;
+        }
+
+        *map = mmap(NULL, size, prot, MAP_SHARED,
+                    bar->fd, bar->fd_offset + offset);
+        if (*map == MAP_FAILED) {
+            *map = NULL;
+            ret = -errno;
+            goto empty_region;
+        }
+
+        memory_region_init_ram_ptr(submem, name, size, *map);
+    } else {
+empty_region:
+        /* Create a zero sized sub-region to make cleanup easy. */
+        memory_region_init(submem, name, 0);
+    }
+
+    memory_region_add_subregion(mem, offset, submem);
+
+    return ret;
+}
+
+static void vfio_map_bar(VFIODevice *vdev, int nr)
+{
+    VFIOBAR *bar = &vdev->bars[nr];
+    unsigned size = bar->size;
+    char name[64];
+    uint32_t pci_bar;
+    uint8_t type;
+    int ret;
+
+    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
+    if (!size) {
+        return;
+    }
+
+    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
+             vdev->host.domain, vdev->host.bus, vdev->host.slot,
+             vdev->host.function, nr);
+
+    /* Determine what type of BAR this is for registration */
+    ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
+                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
+    if (ret != sizeof(pci_bar)) {
+        error_report("vfio: Failed to read BAR %d (%m)\n", nr);
+        return;
+    }
+
+    pci_bar = le32_to_cpu(pci_bar);
+    type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ?
+           ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK);
+
+    /* A "slow" read/write mapping underlies all BARs */
+    memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
+    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
+
+    /*
+     * We can't mmap areas overlapping the MSIX vector table, so we
+     * potentially insert a direct-mapped subregion before and after it.
+     */
+    if (vdev->msix && vdev->msix->table_bar == nr) {
+        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
+    }
+
+    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
+    if (vfio_mmap_bar(bar, &bar->mem,
+                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
+        error_report("%s unsupported. Performance may be slow\n", name);
+    }
+
+    if (vdev->msix && vdev->msix->table_bar == nr) {
+        unsigned start;
+
+        start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
+                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
+
+        size = start < bar->size ? bar->size - start : 0;
+        strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
+        /* VFIOMSIXInfo contains another MemoryRegion for this mapping */
+        if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
+                          &vdev->msix->mmap, size, start, name)) {
+            error_report("%s unsupported. Performance may be slow\n", name);
+        }
+    }
+}
+
+static void vfio_map_bars(VFIODevice *vdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        vfio_map_bar(vdev, i);
+    }
+}
+
+static void vfio_unmap_bars(VFIODevice *vdev)
+{
+    int i;
+
+    for (i = 0; i < PCI_ROM_SLOT; i++) {
+        vfio_unmap_bar(vdev, i);
+    }
+}
+
+/*
+ * General setup
+ */
+static uint8_t vfio_std_cap_max_size(PCIDevice *pdev, uint8_t pos)
+{
+    uint8_t tmp, next = 0xff;
+
+    for (tmp = pdev->config[PCI_CAPABILITY_LIST]; tmp;
+         tmp = pdev->config[tmp + 1]) {
+        if (tmp > pos && tmp < next) {
+            next = tmp;
+        }
+    }
+
+    return next - pos;
+}
+
+static int vfio_add_std_cap(VFIODevice *vdev, uint8_t pos)
+{
+    PCIDevice *pdev = &vdev->pdev;
+    uint8_t cap_id, next, size;
+    int ret;
+
+    cap_id = pdev->config[pos];
+    next = pdev->config[pos + 1];
+
+    /*
+     * If it becomes important to configure capabilities to their actual
+     * size, use this as the default when it's something we don't recognize.
+     * Since QEMU doesn't actually handle many of the config accesses,
+     * exact size doesn't seem worthwhile.
+     */
+    size = vfio_std_cap_max_size(pdev, pos);
+
+    /*
+     * pci_add_capability always inserts the new capability at the head
+     * of the chain.  Therefore to end up with a chain that matches the
+     * physical device, we insert from the end by making this recursive.
+     * This is also why we pre-caclulate size above as cached config space
+     * will be changed as we unwind the stack.
+     */
+    if (next) {
+        ret = vfio_add_std_cap(vdev, next);
+        if (ret) {
+            return ret;
+        }
+    } else {
+        pdev->config[PCI_CAPABILITY_LIST] = 0; /* Begin the rebuild */
+    }
+
+    switch (cap_id) {
+    case PCI_CAP_ID_MSI:
+        ret = vfio_setup_msi(vdev, pos);
+        break;
+    case PCI_CAP_ID_MSIX:
+        ret = vfio_setup_msix(vdev, pos);
+        break;
+    default:
+        ret = pci_add_capability(pdev, cap_id, pos, size);
+        break;
+    }
+
+    if (ret < 0) {
+        error_report("vfio: %04x:%02x:%02x.%x Error adding PCI capability "
+                     "0x%x[0x%x]@0x%x: %d\n", vdev->host.domain,
+                     vdev->host.bus, vdev->host.slot, vdev->host.function,
+                     cap_id, size, pos, ret);
+        return ret;
+    }
+
+    return 0;
+}
+
+static int vfio_add_capabilities(VFIODevice *vdev)
+{
+    PCIDevice *pdev = &vdev->pdev;
+
+    if (!(pdev->config[PCI_STATUS] & PCI_STATUS_CAP_LIST) ||
+        !pdev->config[PCI_CAPABILITY_LIST]) {
+        return 0; /* Nothing to add */
+    }
+
+    return vfio_add_std_cap(vdev, pdev->config[PCI_CAPABILITY_LIST]);
+}
+
+static int vfio_load_rom(VFIODevice *vdev)
+{
+    uint64_t size = vdev->rom_size;
+    char name[32];
+    off_t off = 0, voff = vdev->rom_offset;
+    ssize_t bytes;
+    void *ptr;
+
+    /* If loading ROM from file, pci handles it */
+    if (vdev->pdev.romfile || !vdev->pdev.rom_bar || !size) {
+        return 0;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x)\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function);
+
+    snprintf(name, sizeof(name), "vfio[%04x:%02x:%02x.%x].rom",
+             vdev->host.domain, vdev->host.bus, vdev->host.slot,
+             vdev->host.function);
+    memory_region_init_ram(&vdev->pdev.rom, name, size);
+    ptr = memory_region_get_ram_ptr(&vdev->pdev.rom);
+    memset(ptr, 0xff, size);
+
+    while (size) {
+        bytes = pread(vdev->fd, ptr + off, size, voff + off);
+        if (bytes == 0) {
+            break; /* expect that we could get back less than the ROM BAR */
+        } else if (bytes > 0) {
+            off += bytes;
+            size -= bytes;
+        } else {
+            if (errno == EINTR || errno == EAGAIN) {
+                continue;
+            }
+            error_report("vfio: Error reading device ROM: %m\n");
+            memory_region_destroy(&vdev->pdev.rom);
+            return -errno;
+        }
+    }
+
+    pci_register_bar(&vdev->pdev, PCI_ROM_SLOT, 0, &vdev->pdev.rom);
+    vdev->pdev.has_rom = true;
+    return 0;
+}
+
+static int vfio_connect_container(VFIOGroup *group)
+{
+    VFIOContainer *container;
+    int ret, fd;
+
+    if (group->container) {
+        return 0;
+    }
+
+    QLIST_FOREACH(container, &container_list, next) {
+        if (!ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container->fd)) {
+            group->container = container;
+            QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+            return 0;
+        }
+    }
+
+    fd = qemu_open("/dev/vfio/vfio", O_RDWR);
+    if (fd < 0) {
+        error_report("vfio: failed to open /dev/vfio/vfio: %m\n");
+        return -errno;
+    }
+
+    ret = ioctl(fd, VFIO_GET_API_VERSION);
+    if (ret != VFIO_API_VERSION) {
+        error_report("vfio: supported vfio version: %d, "
+                     "reported version: %d\n", VFIO_API_VERSION, ret);
+        close(fd);
+        return -EINVAL;
+    }
+
+    container = g_malloc0(sizeof(*container));
+    container->fd = fd;
+
+    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
+        ret = ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &fd);
+        if (ret) {
+            error_report("vfio: failed to set group container: %m\n");
+            g_free(container);
+            close(fd);
+            return -errno;
+        }
+
+        ret = ioctl(fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
+        if (ret) {
+            error_report("vfio: failed to set iommu for container: %m\n");
+            g_free(container);
+            close(fd);
+            return -errno;
+        }
+
+        container->iommu_data.listener = vfio_memory_listener;
+        container->iommu_data.release = vfio_listener_release;
+
+        memory_listener_register(&container->iommu_data.listener,
+                                 get_system_memory());
+    } else {
+        error_report("vfio: No available IOMMU models\n");
+        g_free(container);
+        close(fd);
+        return -EINVAL;
+    }
+
+    QLIST_INIT(&container->group_list);
+    QLIST_INSERT_HEAD(&container_list, container, next);
+
+    group->container = container;
+    QLIST_INSERT_HEAD(&container->group_list, group, container_next);
+
+    return 0;
+}
+
+static void vfio_disconnect_container(VFIOGroup *group)
+{
+    VFIOContainer *container = group->container;
+
+    if (ioctl(group->fd, VFIO_GROUP_UNSET_CONTAINER, &container->fd)) {
+        error_report("vfio: error disconnecting group %d from container\n",
+                     group->groupid);
+    }
+
+    QLIST_REMOVE(group, container_next);
+    group->container = NULL;
+
+    if (QLIST_EMPTY(&container->group_list)) {
+        if (container->iommu_data.release) {
+            container->iommu_data.release(container);
+        }
+        QLIST_REMOVE(container, next);
+        DPRINTF("vfio_disconnect_container: close container->fd\n");
+        close(container->fd);
+        g_free(container);
+    }
+}
+
+static VFIOGroup *vfio_get_group(int groupid)
+{
+    VFIOGroup *group;
+    char path[32];
+    struct vfio_group_status status = { .argsz = sizeof(status) };
+
+    QLIST_FOREACH(group, &group_list, next) {
+        if (group->groupid == groupid) {
+            return group;
+        }
+    }
+
+    group = g_malloc0(sizeof(*group));
+
+    snprintf(path, sizeof(path), "/dev/vfio/%d", groupid);
+    group->fd = qemu_open(path, O_RDWR);
+    if (group->fd < 0) {
+        error_report("vfio: error opening %s: %m\n", path);
+        g_free(group);
+        return NULL;
+    }
+
+    if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &status)) {
+        error_report("vfio: error getting group status: %m\n");
+        close(group->fd);
+        g_free(group);
+        return NULL;
+    }
+
+    if (!(status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+        error_report("vfio: error, group %d is not viable, please ensure "
+                     "all devices within the iommu_group are bound to their "
+                     "vfio bus driver.\n", groupid);
+        close(group->fd);
+        g_free(group);
+        return NULL;
+    }
+
+    group->groupid = groupid;
+    QLIST_INIT(&group->device_list);
+
+    if (vfio_connect_container(group)) {
+        error_report("vfio: failed to setup container for group %d\n", groupid);
+        close(group->fd);
+        g_free(group);
+        return NULL;
+    }
+
+    QLIST_INSERT_HEAD(&group_list, group, next);
+
+    return group;
+}
+
+static void vfio_put_group(VFIOGroup *group)
+{
+    if (!QLIST_EMPTY(&group->device_list)) {
+        return;
+    }
+
+    vfio_disconnect_container(group);
+    QLIST_REMOVE(group, next);
+    DPRINTF("vfio_put_group: close group->fd\n");
+    close(group->fd);
+    g_free(group);
+}
+
+static int vfio_get_device(VFIOGroup *group, const char *name, VFIODevice *vdev)
+{
+    struct vfio_device_info dev_info = { .argsz = sizeof(dev_info) };
+    struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
+    int ret, i;
+
+    ret = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+    if (ret < 0) {
+        error_report("vfio: error getting device %s from group %d: %m\n",
+                     name, group->groupid);
+        error_report("Verify all devices in group %d are bound to vfio-pci "
+                     "or pci-stub and not already in use\n", group->groupid);
+        return ret;
+    }
+
+    vdev->fd = ret;
+    vdev->group = group;
+    QLIST_INSERT_HEAD(&group->device_list, vdev, next);
+
+    /* Sanity check device */
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_INFO, &dev_info);
+    if (ret) {
+        error_report("vfio: error getting device info: %m\n");
+        goto error;
+    }
+
+    DPRINTF("Device %s flags: %u, regions: %u, irgs: %u\n", name,
+            dev_info.flags, dev_info.num_regions, dev_info.num_irqs);
+
+    if (!(dev_info.flags & VFIO_DEVICE_FLAGS_PCI)) {
+        error_report("vfio: Um, this isn't a PCI device\n");
+        goto error;
+    }
+
+    vdev->reset_works = !!(dev_info.flags & VFIO_DEVICE_FLAGS_RESET);
+    if (!vdev->reset_works) {
+        error_report("Warning, device %s does not support reset\n", name);
+    }
+
+    if (dev_info.num_regions != VFIO_PCI_NUM_REGIONS) {
+        error_report("vfio: unexpected number of io regions %u\n",
+                     dev_info.num_regions);
+        goto error;
+    }
+
+    if (dev_info.num_irqs != VFIO_PCI_NUM_IRQS) {
+        error_report("vfio: unexpected number of irqs %u\n", dev_info.num_irqs);
+        goto error;
+    }
+
+    for (i = VFIO_PCI_BAR0_REGION_INDEX; i < VFIO_PCI_ROM_REGION_INDEX; i++) {
+        reg_info.index = i;
+
+        ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
+        if (ret) {
+            error_report("vfio: Error getting region %d info: %m\n", i);
+            goto error;
+        }
+
+        DPRINTF("Device %s region %d:\n", name, i);
+        DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
+                (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
+                (unsigned long)reg_info.flags);
+
+        vdev->bars[i].flags = reg_info.flags;
+        vdev->bars[i].size = reg_info.size;
+        vdev->bars[i].fd_offset = reg_info.offset;
+        vdev->bars[i].fd = vdev->fd;
+        vdev->bars[i].nr = i;
+    }
+
+    reg_info.index = VFIO_PCI_ROM_REGION_INDEX;
+
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
+    if (ret) {
+        error_report("vfio: Error getting ROM info: %m\n");
+        goto error;
+    }
+
+    DPRINTF("Device %s ROM:\n", name);
+    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
+            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
+            (unsigned long)reg_info.flags);
+
+    vdev->rom_size = reg_info.size;
+    vdev->rom_offset = reg_info.offset;
+
+    reg_info.index = VFIO_PCI_CONFIG_REGION_INDEX;
+
+    ret = ioctl(vdev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
+    if (ret) {
+        error_report("vfio: Error getting config info: %m\n");
+        goto error;
+    }
+
+    DPRINTF("Device %s config:\n", name);
+    DPRINTF("  size: 0x%lx, offset: 0x%lx, flags: 0x%lx\n",
+            (unsigned long)reg_info.size, (unsigned long)reg_info.offset,
+            (unsigned long)reg_info.flags);
+
+    vdev->config_size = reg_info.size;
+    vdev->config_offset = reg_info.offset;
+
+error:
+    if (ret) {
+        QLIST_REMOVE(vdev, next);
+        vdev->group = NULL;
+        close(vdev->fd);
+    }
+    return ret;
+}
+
+static void vfio_put_device(VFIODevice *vdev)
+{
+    QLIST_REMOVE(vdev, next);
+    vdev->group = NULL;
+    DPRINTF("vfio_put_device: close vdev->fd\n");
+    close(vdev->fd);
+    if (vdev->msix) {
+        g_free(vdev->msix);
+        vdev->msix = NULL;
+    }
+}
+
+static int vfio_initfn(PCIDevice *pdev)
+{
+    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    VFIOGroup *group;
+    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
+    ssize_t len;
+    struct stat st;
+    int groupid;
+    int ret;
+
+    /* Check that the host device exists */
+    snprintf(path, sizeof(path),
+             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+             vdev->host.domain, vdev->host.bus, vdev->host.slot,
+             vdev->host.function);
+    if (stat(path, &st) < 0) {
+        error_report("vfio: error: no such host device: %s\n", path);
+        return -errno;
+    }
+
+    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
+
+    len = readlink(path, iommu_group_path, PATH_MAX);
+    if (len <= 0) {
+        error_report("vfio: error no iommu_group for device\n");
+        return -errno;
+    }
+
+    iommu_group_path[len] = 0;
+    group_name = basename(iommu_group_path);
+
+    if (sscanf(group_name, "%d", &groupid) != 1) {
+        error_report("vfio: error reading %s: %m\n", path);
+        return -errno;
+    }
+
+    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
+            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
+
+    group = vfio_get_group(groupid);
+    if (!group) {
+        error_report("vfio: failed to get group %d\n", groupid);
+        return -ENOENT;
+    }
+
+    snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
+            vdev->host.domain, vdev->host.bus, vdev->host.slot,
+            vdev->host.function);
+
+    QLIST_FOREACH(pvdev, &group->device_list, next) {
+        if (pvdev->host.domain == vdev->host.domain &&
+            pvdev->host.bus == vdev->host.bus &&
+            pvdev->host.slot == vdev->host.slot &&
+            pvdev->host.function == vdev->host.function) {
+
+            error_report("vfio: error: device %s is already attached\n", path);
+            vfio_put_group(group);
+            return -EBUSY;
+        }
+    }
+
+    ret = vfio_get_device(group, path, vdev);
+    if (ret) {
+        error_report("vfio: failed to get device %s\n", path);
+        vfio_put_group(group);
+        return ret;
+    }
+
+    /* Get a copy of config space */
+    assert(pci_config_size(&vdev->pdev) <= vdev->config_size);
+    ret = pread(vdev->fd, vdev->pdev.config,
+                pci_config_size(&vdev->pdev), vdev->config_offset);
+    if (ret < (int)pci_config_size(&vdev->pdev)) {
+        ret = ret < 0 ? -errno : -EFAULT;
+        error_report("vfio: Failed to read device config space\n");
+        goto out_put;
+    }
+
+    /*
+     * Clear host resource mapping info.  If we choose not to register a
+     * BAR, such as might be the case with the option ROM, we can get
+     * confusing, unwritable, residual addresses from the host here.
+     */
+    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
+    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
+
+    vfio_load_rom(vdev);
+
+    ret = vfio_early_setup_msix(vdev);
+    if (ret) {
+        goto out_put;
+    }
+
+    vfio_map_bars(vdev);
+
+    ret = vfio_add_capabilities(vdev);
+    if (ret) {
+        goto out_teardown;
+    }
+
+    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
+        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
+
+        ret = vfio_enable_intx(vdev);
+        if (ret) {
+            goto out_teardown;
+        }
+    }
+
+    return 0;
+
+out_teardown:
+    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+    vfio_teardown_msi(vdev);
+    vfio_unmap_bars(vdev);
+out_put:
+    vfio_put_device(vdev);
+    vfio_put_group(group);
+    return ret;
+}
+
+static void vfio_exitfn(PCIDevice *pdev)
+{
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+    VFIOGroup *group = vdev->group;
+
+    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
+    vfio_disable_interrupts(vdev);
+    vfio_teardown_msi(vdev);
+    vfio_unmap_bars(vdev);
+    vfio_put_device(vdev);
+    vfio_put_group(group);
+}
+
+static void vfio_pci_reset(DeviceState *dev)
+{
+    PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
+    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
+
+    if (!vdev->reset_works) {
+        return;
+    }
+
+    if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
+        error_report("vfio: Error unable to reset physical device "
+                     "(%04x:%02x:%02x.%x): %m\n", vdev->host.domain,
+                     vdev->host.bus, vdev->host.slot, vdev->host.function);
+    }
+}
+
+static Property vfio_pci_dev_properties[] = {
+    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
+    /*
+     * TODO - support passed fds... is this necessary?
+     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
+     * DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name),
+     */
+    DEFINE_PROP_END_OF_LIST(),
+};
+
+
+static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+    PCIDeviceClass *pdc = PCI_DEVICE_CLASS(klass);
+
+    dc->reset = vfio_pci_reset;
+    dc->props = vfio_pci_dev_properties;
+    pdc->init = vfio_initfn;
+    pdc->exit = vfio_exitfn;
+    pdc->config_read = vfio_pci_read_config;
+    pdc->config_write = vfio_pci_write_config;
+}
+
+static TypeInfo vfio_pci_dev_info = {
+    .name = "vfio-pci",
+    .parent = TYPE_PCI_DEVICE,
+    .instance_size = sizeof(VFIODevice),
+    .class_init = vfio_pci_dev_class_init,
+};
+
+static void register_vfio_pci_dev_type(void)
+{
+    type_register_static(&vfio_pci_dev_info);
+}
+
+type_init(register_vfio_pci_dev_type)
diff --git a/hw/vfio_pci_int.h b/hw/vfio_pci_int.h
new file mode 100644
index 0000000..e664741
--- /dev/null
+++ b/hw/vfio_pci_int.h
@@ -0,0 +1,112 @@
+/*
+ * vfio based device assignment support
+ *
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_VFIO_PCI_INT_H
+#define HW_VFIO_PCI_INT_H
+
+#include "qemu-common.h"
+#include "qemu-queue.h"
+#include "pci.h"
+#include "event_notifier.h"
+
+typedef struct VFIOBAR {
+    off_t fd_offset; /* offset of BAR within device fd */
+    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
+    MemoryRegion mem; /* slow, read/write access */
+    MemoryRegion mmap_mem; /* direct mapped access */
+    void *mmap;
+    size_t size;
+    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
+    uint8_t nr; /* cache the BAR number for debug */
+} VFIOBAR;
+
+typedef struct VFIOINTx {
+    bool pending; /* interrupt pending */
+    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
+    uint8_t pin; /* which pin to pull for qemu_set_irq */
+    EventNotifier interrupt; /* eventfd triggered on interrupt */
+    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
+    PCIINTxRoute route; /* routing info for QEMU bypass */
+} VFIOINTx;
+
+struct VFIODevice;
+
+typedef struct VFIOMSIVector {
+    EventNotifier interrupt; /* eventfd triggered on interrupt */
+    struct VFIODevice *vdev; /* back pointer to device */
+    int virq; /* KVM irqchip route for QEMU bypass */
+    bool use;
+} VFIOMSIVector;
+
+enum {
+    VFIO_INT_NONE = 0,
+    VFIO_INT_INTx = 1,
+    VFIO_INT_MSI  = 2,
+    VFIO_INT_MSIX = 3,
+};
+
+struct VFIOGroup;
+
+typedef struct VFIOContainer {
+    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
+    struct {
+        /* enable abstraction to support various iommu backends */
+        union {
+            MemoryListener listener; /* Used by type1 iommu */
+        };
+        void (*release)(struct VFIOContainer *);
+    } iommu_data;
+    QLIST_HEAD(, VFIOGroup) group_list;
+    QLIST_ENTRY(VFIOContainer) next;
+} VFIOContainer;
+
+/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
+typedef struct VFIOMSIXInfo {
+    uint8_t table_bar;
+    uint8_t pba_bar;
+    uint16_t entries;
+    uint32_t table_offset;
+    uint32_t pba_offset;
+    MemoryRegion mmap_mem;
+    void *mmap;
+} VFIOMSIXInfo;
+
+typedef struct VFIODevice {
+    PCIDevice pdev;
+    int fd;
+    VFIOINTx intx;
+    unsigned int config_size;
+    off_t config_offset; /* Offset of config space region within device fd */
+    unsigned int rom_size;
+    off_t rom_offset; /* Offset of ROM region within device fd */
+    int msi_cap_size;
+    VFIOMSIVector *msi_vectors;
+    VFIOMSIXInfo *msix;
+    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
+    int interrupt; /* Current interrupt type */
+    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
+    PCIHostDeviceAddress host;
+    QLIST_ENTRY(VFIODevice) next;
+    struct VFIOGroup *group;
+    bool reset_works;
+} VFIODevice;
+
+typedef struct VFIOGroup {
+    int fd;
+    int groupid;
+    VFIOContainer *container;
+    QLIST_HEAD(, VFIODevice) device_list;
+    QLIST_ENTRY(VFIOGroup) next;
+    QLIST_ENTRY(VFIOGroup) container_next;
+} VFIOGroup;
+
+#endif /* HW_VFIO_PCI_INT_H */


^ permalink raw reply related

* [PATCH v3 4/4] vfio: Enable vfio-pci and mark supported
From: Alex Williamson @ 2012-08-14 20:33 UTC (permalink / raw)
  To: aliguori; +Cc: kvm, aik, jan.kiszka, qemu-devel, blauwirbel, avi
In-Reply-To: <20120814202141.11522.78340.stgit@bling.home>

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 MAINTAINERS           |    5 +++++
 configure             |    6 ++++++
 hw/i386/Makefile.objs |    1 +
 3 files changed, 12 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 708ad54..327b219 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -468,6 +468,11 @@ M: Gerd Hoffmann <kraxel@redhat.com>
 S: Maintained
 F: hw/usb*
 
+VFIO
+M: Alex Williamson <alex.williamson@redhat.com>
+S: Supported
+F: hw/vfio*
+
 vhost
 M: Michael S. Tsirkin <mst@redhat.com>
 S: Supported
diff --git a/configure b/configure
index fea62f1..ca04f76 100755
--- a/configure
+++ b/configure
@@ -167,6 +167,7 @@ attr=""
 libattr=""
 xfs=""
 
+vfio_pci="no"
 vhost_net="no"
 kvm="no"
 gprof="no"
@@ -512,6 +513,7 @@ Haiku)
   usb="linux"
   kvm="yes"
   vhost_net="yes"
+  vfio_pci="yes"
   if [ "$cpu" = "i386" -o "$cpu" = "x86_64" ] ; then
     audio_possible_drivers="$audio_possible_drivers fmod"
   fi
@@ -3120,6 +3122,7 @@ echo "OpenGL support    $opengl"
 echo "libiscsi support  $libiscsi"
 echo "build guest agent $guest_agent"
 echo "coroutine backend $coroutine_backend"
+echo "VFIO PCI support  $vfio_pci"
 
 if test "$sdl_too_old" = "yes"; then
 echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -3811,6 +3814,9 @@ case "$target_arch2" in
   *)
     echo "CONFIG_NO_XEN=y" >> $config_target_mak
 esac
+if test "$vfio_pci" = "yes" -a "$target_softmmu" = "yes" ; then
+  echo "CONFIG_VFIO_PCI=y" >> $config_target_mak
+fi
 case "$target_arch2" in
   i386|x86_64|ppcemb|ppc|ppc64|s390x)
     # Make sure the target and host cpus are compatible
diff --git a/hw/i386/Makefile.objs b/hw/i386/Makefile.objs
index 8c764bb..a2783ef 100644
--- a/hw/i386/Makefile.objs
+++ b/hw/i386/Makefile.objs
@@ -11,5 +11,6 @@ obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen-host-pci-device.o
 obj-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pt.o xen_pt_config_init.o xen_pt_msi.o
 obj-y += kvm/
 obj-$(CONFIG_SPICE) += qxl.o qxl-logger.o qxl-render.o
+obj-$(CONFIG_VFIO_PCI) += vfio_pci.o
 
 obj-y := $(addprefix ../,$(obj-y))

^ permalink raw reply related

* [PATCH v3 2/4] Update Linux kernel headers
From: Alex Williamson @ 2012-08-14 20:32 UTC (permalink / raw)
  To: aliguori; +Cc: kvm, aik, jan.kiszka, qemu-devel, blauwirbel, avi
In-Reply-To: <20120814202141.11522.78340.stgit@bling.home>

Based on Linux as of ddf343f6.  Note that vfio.h isn't yet an
installed header at this commit, but is fixed by trivial update to
include/linux/Kbuild.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 linux-headers/asm-s390/kvm.h      |    2 
 linux-headers/asm-s390/kvm_para.h |    2 
 linux-headers/asm-x86/kvm.h       |    1 
 linux-headers/asm-x86/kvm_para.h  |    7 +
 linux-headers/linux/kvm.h         |    3 
 linux-headers/linux/vfio.h        |  368 +++++++++++++++++++++++++++++++++++++
 6 files changed, 381 insertions(+), 2 deletions(-)
 create mode 100644 linux-headers/linux/vfio.h

diff --git a/linux-headers/asm-s390/kvm.h b/linux-headers/asm-s390/kvm.h
index bdcbe0f..d25da59 100644
--- a/linux-headers/asm-s390/kvm.h
+++ b/linux-headers/asm-s390/kvm.h
@@ -1,7 +1,7 @@
 #ifndef __LINUX_KVM_S390_H
 #define __LINUX_KVM_S390_H
 /*
- * asm-s390/kvm.h - KVM s390 specific structures and definitions
+ * KVM s390 specific structures and definitions
  *
  * Copyright IBM Corp. 2008
  *
diff --git a/linux-headers/asm-s390/kvm_para.h b/linux-headers/asm-s390/kvm_para.h
index 8e2dd67..870051f 100644
--- a/linux-headers/asm-s390/kvm_para.h
+++ b/linux-headers/asm-s390/kvm_para.h
@@ -1,5 +1,5 @@
 /*
- * asm-s390/kvm_para.h - definition for paravirtual devices on s390
+ * definition for paravirtual devices on s390
  *
  * Copyright IBM Corp. 2008
  *
diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h
index e7d1c19..246617e 100644
--- a/linux-headers/asm-x86/kvm.h
+++ b/linux-headers/asm-x86/kvm.h
@@ -12,6 +12,7 @@
 /* Select x86 specific features in <linux/kvm.h> */
 #define __KVM_HAVE_PIT
 #define __KVM_HAVE_IOAPIC
+#define __KVM_HAVE_IRQ_LINE
 #define __KVM_HAVE_DEVICE_ASSIGNMENT
 #define __KVM_HAVE_MSI
 #define __KVM_HAVE_USER_NMI
diff --git a/linux-headers/asm-x86/kvm_para.h b/linux-headers/asm-x86/kvm_para.h
index f2ac46a..a1c3d72 100644
--- a/linux-headers/asm-x86/kvm_para.h
+++ b/linux-headers/asm-x86/kvm_para.h
@@ -22,6 +22,7 @@
 #define KVM_FEATURE_CLOCKSOURCE2        3
 #define KVM_FEATURE_ASYNC_PF		4
 #define KVM_FEATURE_STEAL_TIME		5
+#define KVM_FEATURE_PV_EOI		6
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
@@ -37,6 +38,7 @@
 #define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
 #define MSR_KVM_ASYNC_PF_EN 0x4b564d02
 #define MSR_KVM_STEAL_TIME  0x4b564d03
+#define MSR_KVM_PV_EOI_EN      0x4b564d04
 
 struct kvm_steal_time {
 	__u64 steal;
@@ -89,5 +91,10 @@ struct kvm_vcpu_pv_apf_data {
 	__u32 enabled;
 };
 
+#define KVM_PV_EOI_BIT 0
+#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT)
+#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
+#define KVM_PV_EOI_DISABLED 0x0
+
 
 #endif /* _ASM_X86_KVM_PARA_H */
diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
index 5a9d4e3..4b9e575 100644
--- a/linux-headers/linux/kvm.h
+++ b/linux-headers/linux/kvm.h
@@ -617,6 +617,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_SIGNAL_MSI 77
 #define KVM_CAP_PPC_GET_SMMU_INFO 78
 #define KVM_CAP_S390_COW 79
+#define KVM_CAP_PPC_ALLOC_HTAB 80
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -828,6 +829,8 @@ struct kvm_s390_ucas_mapping {
 #define KVM_SIGNAL_MSI            _IOW(KVMIO,  0xa5, struct kvm_msi)
 /* Available with KVM_CAP_PPC_GET_SMMU_INFO */
 #define KVM_PPC_GET_SMMU_INFO	  _IOR(KVMIO,  0xa6, struct kvm_ppc_smmu_info)
+/* Available with KVM_CAP_PPC_ALLOC_HTAB */
+#define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
 
 /*
  * ioctls for vcpu fds
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
new file mode 100644
index 0000000..f787b72
--- /dev/null
+++ b/linux-headers/linux/vfio.h
@@ -0,0 +1,368 @@
+/*
+ * VFIO API definition
+ *
+ * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
+ *     Author: Alex Williamson <alex.williamson@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef VFIO_H
+#define VFIO_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define VFIO_API_VERSION	0
+
+
+/* Kernel & User level defines for VFIO IOCTLs. */
+
+/* Extensions */
+
+#define VFIO_TYPE1_IOMMU		1
+
+/*
+ * The IOCTL interface is designed for extensibility by embedding the
+ * structure length (argsz) and flags into structures passed between
+ * kernel and userspace.  We therefore use the _IO() macro for these
+ * defines to avoid implicitly embedding a size into the ioctl request.
+ * As structure fields are added, argsz will increase to match and flag
+ * bits will be defined to indicate additional fields with valid data.
+ * It's *always* the caller's responsibility to indicate the size of
+ * the structure passed by setting argsz appropriately.
+ */
+
+#define VFIO_TYPE	(';')
+#define VFIO_BASE	100
+
+/* -------- IOCTLs for VFIO file descriptor (/dev/vfio/vfio) -------- */
+
+/**
+ * VFIO_GET_API_VERSION - _IO(VFIO_TYPE, VFIO_BASE + 0)
+ *
+ * Report the version of the VFIO API.  This allows us to bump the entire
+ * API version should we later need to add or change features in incompatible
+ * ways.
+ * Return: VFIO_API_VERSION
+ * Availability: Always
+ */
+#define VFIO_GET_API_VERSION		_IO(VFIO_TYPE, VFIO_BASE + 0)
+
+/**
+ * VFIO_CHECK_EXTENSION - _IOW(VFIO_TYPE, VFIO_BASE + 1, __u32)
+ *
+ * Check whether an extension is supported.
+ * Return: 0 if not supported, 1 (or some other positive integer) if supported.
+ * Availability: Always
+ */
+#define VFIO_CHECK_EXTENSION		_IO(VFIO_TYPE, VFIO_BASE + 1)
+
+/**
+ * VFIO_SET_IOMMU - _IOW(VFIO_TYPE, VFIO_BASE + 2, __s32)
+ *
+ * Set the iommu to the given type.  The type must be supported by an
+ * iommu driver as verified by calling CHECK_EXTENSION using the same
+ * type.  A group must be set to this file descriptor before this
+ * ioctl is available.  The IOMMU interfaces enabled by this call are
+ * specific to the value set.
+ * Return: 0 on success, -errno on failure
+ * Availability: When VFIO group attached
+ */
+#define VFIO_SET_IOMMU			_IO(VFIO_TYPE, VFIO_BASE + 2)
+
+/* -------- IOCTLs for GROUP file descriptors (/dev/vfio/$GROUP) -------- */
+
+/**
+ * VFIO_GROUP_GET_STATUS - _IOR(VFIO_TYPE, VFIO_BASE + 3,
+ *						struct vfio_group_status)
+ *
+ * Retrieve information about the group.  Fills in provided
+ * struct vfio_group_info.  Caller sets argsz.
+ * Return: 0 on succes, -errno on failure.
+ * Availability: Always
+ */
+struct vfio_group_status {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_GROUP_FLAGS_VIABLE		(1 << 0)
+#define VFIO_GROUP_FLAGS_CONTAINER_SET	(1 << 1)
+};
+#define VFIO_GROUP_GET_STATUS		_IO(VFIO_TYPE, VFIO_BASE + 3)
+
+/**
+ * VFIO_GROUP_SET_CONTAINER - _IOW(VFIO_TYPE, VFIO_BASE + 4, __s32)
+ *
+ * Set the container for the VFIO group to the open VFIO file
+ * descriptor provided.  Groups may only belong to a single
+ * container.  Containers may, at their discretion, support multiple
+ * groups.  Only when a container is set are all of the interfaces
+ * of the VFIO file descriptor and the VFIO group file descriptor
+ * available to the user.
+ * Return: 0 on success, -errno on failure.
+ * Availability: Always
+ */
+#define VFIO_GROUP_SET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 4)
+
+/**
+ * VFIO_GROUP_UNSET_CONTAINER - _IO(VFIO_TYPE, VFIO_BASE + 5)
+ *
+ * Remove the group from the attached container.  This is the
+ * opposite of the SET_CONTAINER call and returns the group to
+ * an initial state.  All device file descriptors must be released
+ * prior to calling this interface.  When removing the last group
+ * from a container, the IOMMU will be disabled and all state lost,
+ * effectively also returning the VFIO file descriptor to an initial
+ * state.
+ * Return: 0 on success, -errno on failure.
+ * Availability: When attached to container
+ */
+#define VFIO_GROUP_UNSET_CONTAINER	_IO(VFIO_TYPE, VFIO_BASE + 5)
+
+/**
+ * VFIO_GROUP_GET_DEVICE_FD - _IOW(VFIO_TYPE, VFIO_BASE + 6, char)
+ *
+ * Return a new file descriptor for the device object described by
+ * the provided string.  The string should match a device listed in
+ * the devices subdirectory of the IOMMU group sysfs entry.  The
+ * group containing the device must already be added to this context.
+ * Return: new file descriptor on success, -errno on failure.
+ * Availability: When attached to container
+ */
+#define VFIO_GROUP_GET_DEVICE_FD	_IO(VFIO_TYPE, VFIO_BASE + 6)
+
+/* --------------- IOCTLs for DEVICE file descriptors --------------- */
+
+/**
+ * VFIO_DEVICE_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 7,
+ *						struct vfio_device_info)
+ *
+ * Retrieve information about the device.  Fills in provided
+ * struct vfio_device_info.  Caller sets argsz.
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_device_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_DEVICE_FLAGS_RESET	(1 << 0)	/* Device supports reset */
+#define VFIO_DEVICE_FLAGS_PCI	(1 << 1)	/* vfio-pci device */
+	__u32	num_regions;	/* Max region index + 1 */
+	__u32	num_irqs;	/* Max IRQ index + 1 */
+};
+#define VFIO_DEVICE_GET_INFO		_IO(VFIO_TYPE, VFIO_BASE + 7)
+
+/**
+ * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8,
+ *				       struct vfio_region_info)
+ *
+ * Retrieve information about a device region.  Caller provides
+ * struct vfio_region_info with index value set.  Caller sets argsz.
+ * Implementation of region mapping is bus driver specific.  This is
+ * intended to describe MMIO, I/O port, as well as bus specific
+ * regions (ex. PCI config space).  Zero sized regions may be used
+ * to describe unimplemented regions (ex. unimplemented PCI BARs).
+ * Return: 0 on success, -errno on failure.
+ */
+struct vfio_region_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_REGION_INFO_FLAG_READ	(1 << 0) /* Region supports read */
+#define VFIO_REGION_INFO_FLAG_WRITE	(1 << 1) /* Region supports write */
+#define VFIO_REGION_INFO_FLAG_MMAP	(1 << 2) /* Region supports mmap */
+	__u32	index;		/* Region index */
+	__u32	resv;		/* Reserved for alignment */
+	__u64	size;		/* Region size (bytes) */
+	__u64	offset;		/* Region offset from start of device fd */
+};
+#define VFIO_DEVICE_GET_REGION_INFO	_IO(VFIO_TYPE, VFIO_BASE + 8)
+
+/**
+ * VFIO_DEVICE_GET_IRQ_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 9,
+ *				    struct vfio_irq_info)
+ *
+ * Retrieve information about a device IRQ.  Caller provides
+ * struct vfio_irq_info with index value set.  Caller sets argsz.
+ * Implementation of IRQ mapping is bus driver specific.  Indexes
+ * using multiple IRQs are primarily intended to support MSI-like
+ * interrupt blocks.  Zero count irq blocks may be used to describe
+ * unimplemented interrupt types.
+ *
+ * The EVENTFD flag indicates the interrupt index supports eventfd based
+ * signaling.
+ *
+ * The MASKABLE flags indicates the index supports MASK and UNMASK
+ * actions described below.
+ *
+ * AUTOMASKED indicates that after signaling, the interrupt line is
+ * automatically masked by VFIO and the user needs to unmask the line
+ * to receive new interrupts.  This is primarily intended to distinguish
+ * level triggered interrupts.
+ *
+ * The NORESIZE flag indicates that the interrupt lines within the index
+ * are setup as a set and new subindexes cannot be enabled without first
+ * disabling the entire index.  This is used for interrupts like PCI MSI
+ * and MSI-X where the driver may only use a subset of the available
+ * indexes, but VFIO needs to enable a specific number of vectors
+ * upfront.  In the case of MSI-X, where the user can enable MSI-X and
+ * then add and unmask vectors, it's up to userspace to make the decision
+ * whether to allocate the maximum supported number of vectors or tear
+ * down setup and incrementally increase the vectors as each is enabled.
+ */
+struct vfio_irq_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IRQ_INFO_EVENTFD		(1 << 0)
+#define VFIO_IRQ_INFO_MASKABLE		(1 << 1)
+#define VFIO_IRQ_INFO_AUTOMASKED	(1 << 2)
+#define VFIO_IRQ_INFO_NORESIZE		(1 << 3)
+	__u32	index;		/* IRQ index */
+	__u32	count;		/* Number of IRQs within this index */
+};
+#define VFIO_DEVICE_GET_IRQ_INFO	_IO(VFIO_TYPE, VFIO_BASE + 9)
+
+/**
+ * VFIO_DEVICE_SET_IRQS - _IOW(VFIO_TYPE, VFIO_BASE + 10, struct vfio_irq_set)
+ *
+ * Set signaling, masking, and unmasking of interrupts.  Caller provides
+ * struct vfio_irq_set with all fields set.  'start' and 'count' indicate
+ * the range of subindexes being specified.
+ *
+ * The DATA flags specify the type of data provided.  If DATA_NONE, the
+ * operation performs the specified action immediately on the specified
+ * interrupt(s).  For example, to unmask AUTOMASKED interrupt [0,0]:
+ * flags = (DATA_NONE|ACTION_UNMASK), index = 0, start = 0, count = 1.
+ *
+ * DATA_BOOL allows sparse support for the same on arrays of interrupts.
+ * For example, to mask interrupts [0,1] and [0,3] (but not [0,2]):
+ * flags = (DATA_BOOL|ACTION_MASK), index = 0, start = 1, count = 3,
+ * data = {1,0,1}
+ *
+ * DATA_EVENTFD binds the specified ACTION to the provided __s32 eventfd.
+ * A value of -1 can be used to either de-assign interrupts if already
+ * assigned or skip un-assigned interrupts.  For example, to set an eventfd
+ * to be trigger for interrupts [0,0] and [0,2]:
+ * flags = (DATA_EVENTFD|ACTION_TRIGGER), index = 0, start = 0, count = 3,
+ * data = {fd1, -1, fd2}
+ * If index [0,1] is previously set, two count = 1 ioctls calls would be
+ * required to set [0,0] and [0,2] without changing [0,1].
+ *
+ * Once a signaling mechanism is set, DATA_BOOL or DATA_NONE can be used
+ * with ACTION_TRIGGER to perform kernel level interrupt loopback testing
+ * from userspace (ie. simulate hardware triggering).
+ *
+ * Setting of an event triggering mechanism to userspace for ACTION_TRIGGER
+ * enables the interrupt index for the device.  Individual subindex interrupts
+ * can be disabled using the -1 value for DATA_EVENTFD or the index can be
+ * disabled as a whole with: flags = (DATA_NONE|ACTION_TRIGGER), count = 0.
+ *
+ * Note that ACTION_[UN]MASK specify user->kernel signaling (irqfds) while
+ * ACTION_TRIGGER specifies kernel->user signaling.
+ */
+struct vfio_irq_set {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IRQ_SET_DATA_NONE		(1 << 0) /* Data not present */
+#define VFIO_IRQ_SET_DATA_BOOL		(1 << 1) /* Data is bool (u8) */
+#define VFIO_IRQ_SET_DATA_EVENTFD	(1 << 2) /* Data is eventfd (s32) */
+#define VFIO_IRQ_SET_ACTION_MASK	(1 << 3) /* Mask interrupt */
+#define VFIO_IRQ_SET_ACTION_UNMASK	(1 << 4) /* Unmask interrupt */
+#define VFIO_IRQ_SET_ACTION_TRIGGER	(1 << 5) /* Trigger interrupt */
+	__u32	index;
+	__u32	start;
+	__u32	count;
+	__u8	data[];
+};
+#define VFIO_DEVICE_SET_IRQS		_IO(VFIO_TYPE, VFIO_BASE + 10)
+
+#define VFIO_IRQ_SET_DATA_TYPE_MASK	(VFIO_IRQ_SET_DATA_NONE | \
+					 VFIO_IRQ_SET_DATA_BOOL | \
+					 VFIO_IRQ_SET_DATA_EVENTFD)
+#define VFIO_IRQ_SET_ACTION_TYPE_MASK	(VFIO_IRQ_SET_ACTION_MASK | \
+					 VFIO_IRQ_SET_ACTION_UNMASK | \
+					 VFIO_IRQ_SET_ACTION_TRIGGER)
+/**
+ * VFIO_DEVICE_RESET - _IO(VFIO_TYPE, VFIO_BASE + 11)
+ *
+ * Reset a device.
+ */
+#define VFIO_DEVICE_RESET		_IO(VFIO_TYPE, VFIO_BASE + 11)
+
+/*
+ * The VFIO-PCI bus driver makes use of the following fixed region and
+ * IRQ index mapping.  Unimplemented regions return a size of zero.
+ * Unimplemented IRQ types return a count of zero.
+ */
+
+enum {
+	VFIO_PCI_BAR0_REGION_INDEX,
+	VFIO_PCI_BAR1_REGION_INDEX,
+	VFIO_PCI_BAR2_REGION_INDEX,
+	VFIO_PCI_BAR3_REGION_INDEX,
+	VFIO_PCI_BAR4_REGION_INDEX,
+	VFIO_PCI_BAR5_REGION_INDEX,
+	VFIO_PCI_ROM_REGION_INDEX,
+	VFIO_PCI_CONFIG_REGION_INDEX,
+	VFIO_PCI_NUM_REGIONS
+};
+
+enum {
+	VFIO_PCI_INTX_IRQ_INDEX,
+	VFIO_PCI_MSI_IRQ_INDEX,
+	VFIO_PCI_MSIX_IRQ_INDEX,
+	VFIO_PCI_NUM_IRQS
+};
+
+/* -------- API for Type1 VFIO IOMMU -------- */
+
+/**
+ * VFIO_IOMMU_GET_INFO - _IOR(VFIO_TYPE, VFIO_BASE + 12, struct vfio_iommu_info)
+ *
+ * Retrieve information about the IOMMU object. Fills in provided
+ * struct vfio_iommu_info. Caller sets argsz.
+ *
+ * XXX Should we do these by CHECK_EXTENSION too?
+ */
+struct vfio_iommu_type1_info {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_IOMMU_INFO_PGSIZES (1 << 0)	/* supported page sizes info */
+	__u64	iova_pgsizes;		/* Bitmap of supported page sizes */
+};
+
+#define VFIO_IOMMU_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
+
+/**
+ * VFIO_IOMMU_MAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 13, struct vfio_dma_map)
+ *
+ * Map process virtual addresses to IO virtual addresses using the
+ * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
+ */
+struct vfio_iommu_type1_dma_map {
+	__u32	argsz;
+	__u32	flags;
+#define VFIO_DMA_MAP_FLAG_READ (1 << 0)		/* readable from device */
+#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)	/* writable from device */
+	__u64	vaddr;				/* Process virtual address */
+	__u64	iova;				/* IO virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+
+#define VFIO_IOMMU_MAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 13)
+
+/**
+ * VFIO_IOMMU_UNMAP_DMA - _IOW(VFIO_TYPE, VFIO_BASE + 14, struct vfio_dma_unmap)
+ *
+ * Unmap IO virtual addresses using the provided struct vfio_dma_unmap.
+ * Caller sets argsz.
+ */
+struct vfio_iommu_type1_dma_unmap {
+	__u32	argsz;
+	__u32	flags;
+	__u64	iova;				/* IO virtual address */
+	__u64	size;				/* Size of mapping (bytes) */
+};
+
+#define VFIO_IOMMU_UNMAP_DMA _IO(VFIO_TYPE, VFIO_BASE + 14)
+
+#endif /* VFIO_H */

^ permalink raw reply related

* [PATCH v3 1/4] Update kernel header script to include vfio
From: Alex Williamson @ 2012-08-14 20:32 UTC (permalink / raw)
  To: aliguori; +Cc: kvm, aik, jan.kiszka, qemu-devel, blauwirbel, avi
In-Reply-To: <20120814202141.11522.78340.stgit@bling.home>

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---

 scripts/update-linux-headers.sh |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index 9d2a4bc..270d32b 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -43,7 +43,7 @@ done
 
 rm -rf "$output/linux-headers/linux"
 mkdir -p "$output/linux-headers/linux"
-for header in kvm.h kvm_para.h vhost.h virtio_config.h virtio_ring.h; do
+for header in kvm.h kvm_para.h vfio.h vhost.h virtio_config.h virtio_ring.h; do
     cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
 done
 if [ -L "$linux/source" ]; then

^ permalink raw reply related

* [PATCH v3 0/4] VFIO-based PCI device assignment for QEMU 1.2
From: Alex Williamson @ 2012-08-14 20:32 UTC (permalink / raw)
  To: aliguori; +Cc: kvm, aik, jan.kiszka, qemu-devel, blauwirbel, avi

v3:
 - Incorporate feedback from Anthony, Avi, and Jan (Thank you!)

I've also added a new tag to my github tree, so those who have
already reviewed can check the delta by comparing:

git://github.com/awilliam/qemu-vfio.git
	tags/vfio-pci-for-qemu-1.2
	tags/vfio-pci-for-qemu-1.2-v2
	tags/vfio-pci-for-qemu-1.2-v3

Blue Swirl already offered an Ack-by for previous version, I'll
hope, but won't assume that carries forward.  Original description
below with updated tag.  Thanks,

Alex

VFIO kernel support was just merged into Linux, so I'd like to
formally propose inclusion of the QEMU vfio-pci driver for
QEMU 1.2.  Included here is support for x86 PCI device assignment.
PCI INTx is not yet enabled, but devices making use of either MSI
or MSI-X work.  The level irqfd and eoifd support I've proposed
for KVM enable an accelerated patch for this through KVM.  I'd
like to get this base driver in first and enable the remaining
support in-tree.

I've split this version up a little from the RFC to make it a bit
easier to review.  Review comments from Blue Swirl and Avi are
already incorporated, including Avi's requests to simplify both
the PCI BAR mapping and unmapping paths.

This series is also available at:

git://github.com/awilliam/qemu-vfio.git tags/vfio-pci-for-qemu-1.2-v3

Thanks,

Alex

---

Alex Williamson (4):
      vfio: Enable vfio-pci and mark supported
      vfio: vfio-pci device assignment driver
      Update Linux kernel headers
      Update kernel header script to include vfio

 MAINTAINERS                       |    5 
 configure                         |    6 
 hw/i386/Makefile.objs             |    1 
 hw/vfio_pci.c                     | 1870 +++++++++++++++++++++++++++++++++++++
 hw/vfio_pci_int.h                 |  112 ++
 linux-headers/asm-s390/kvm.h      |    2 
 linux-headers/asm-s390/kvm_para.h |    2 
 linux-headers/asm-x86/kvm.h       |    1 
 linux-headers/asm-x86/kvm_para.h  |    7 
 linux-headers/linux/kvm.h         |    3 
 linux-headers/linux/vfio.h        |  368 +++++++
 scripts/update-linux-headers.sh   |    2 
 12 files changed, 2376 insertions(+), 3 deletions(-)
 create mode 100644 hw/vfio_pci.c
 create mode 100644 hw/vfio_pci_int.h
 create mode 100644 linux-headers/linux/vfio.h

^ permalink raw reply

* Re: [RFC-v2 3/6] vhost-scsi: add -vhost-scsi host device for use with tcm-vhost
From: Nicholas A. Bellinger @ 2012-08-14 20:31 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Anthony Liguori, Stefan Hajnoczi, kvm-devel, Jan Kiszka,
	qemu-devel, Zhi Yong Wu, Anthony Liguori, target-devel,
	Paolo Bonzini, lf-virt, Christoph Hellwig
In-Reply-To: <20120813085325.GH14081@redhat.com>

On Mon, 2012-08-13 at 11:53 +0300, Michael S. Tsirkin wrote:
> On Mon, Aug 13, 2012 at 08:35:14AM +0000, Nicholas A. Bellinger wrote:
> > From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> > 
> > This patch adds a new type of host device that drives the vhost_scsi
> > device.  The syntax to add vhost-scsi is:
> > 
> >   qemu -vhost-scsi id=vhost-scsi0,wwpn=...,tpgt=123
> > 
> > The virtio-scsi emulated device will make use of vhost-scsi to process
> > virtio-scsi requests inside the kernel and hand them to the in-kernel
> > SCSI target stack using the tcm_vhost fabric driver.

<SNIP>

> > +static VHostSCSI *vhost_scsi_add(const char *id, const char *wwpn,
> > +                                 uint16_t tpgt)
> > +{
> > +    VHostSCSI *vs = g_malloc0(sizeof(*vs));
> > +    int ret;
> > +
> > +    /* TODO set up vhost-scsi device and bind to tcm_vhost/$wwpm/tpgt_$tpgt */
> > +    fprintf(stderr, "wwpn = \"%s\" tpgt = \"%u\"\n", id, tpgt);
> > +
> > +    ret = vhost_dev_init(&vs->dev, -1, "/dev/vhost-scsi", false);
> 
> This -1 is a hack. You need to support passing in fd from
> the monitor, and pass it here.
> 

Mmm, looking at how vhost_net_init + tap.c does this, but am not quite
what fd needs to be propagated up for virtio-scsi -> vhost-scsi..

Can you please elaborate on this one a bit more..?

--nab

^ permalink raw reply

* Re: [RFC-v2 6/6] virtio-scsi: Fix incorrect VirtIOSCSI->cmd_vqs[0] definition
From: Nicholas A. Bellinger @ 2012-08-14 20:20 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Stefan Hajnoczi, kvm-devel, Jan Kiszka, qemu-devel, Zhi Yong Wu,
	Anthony Liguori, target-devel, Hannes Reinecke, Paolo Bonzini,
	lf-virt, Christoph Hellwig
In-Reply-To: <20120813090259.GJ14081@redhat.com>

On Mon, 2012-08-13 at 12:02 +0300, Michael S. Tsirkin wrote:
> On Mon, Aug 13, 2012 at 08:35:17AM +0000, Nicholas A. Bellinger wrote:
> > From: Nicholas Bellinger <nab@linux-iscsi.org>
> > 
> > This patch fixes bug in the definition of VirtIOSCSI->cmd_vqs[0],
> > where the return of virtio_add_queue() in virtio_scsi_init() ends up
> > overwriting past the end of ->cmd_vqs[0].
> > 
> > Since virtio_scsi currently assumes a single vqs for data, this patch
> > simply changes ->cmd_vqs[1] to handle the single VirtQueue.
> > 
> > Cc: Paolo Bonzini <pbonzini@redhat.com>
> > Cc: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
> 
> This is a bugfix we need even without vhost, right?
> 

I believe so, as it appears to be stomping past the end of memory for
every virtio-scsi initialization regardless of vhost usage.. 

Paolo, can you pickup this fix now for stable so it can be dropped from
RFC-v3..?

--nab

> > ---
> >  hw/virtio-scsi.c |    2 +-
> >  1 files changed, 1 insertions(+), 1 deletions(-)
> > 
> > diff --git a/hw/virtio-scsi.c b/hw/virtio-scsi.c
> > index 5e2ff6b..2c70f89 100644
> > --- a/hw/virtio-scsi.c
> > +++ b/hw/virtio-scsi.c
> > @@ -150,7 +150,7 @@ typedef struct {
> >      bool events_dropped;
> >      VirtQueue *ctrl_vq;
> >      VirtQueue *event_vq;
> > -    VirtQueue *cmd_vqs[0];
> > +    VirtQueue *cmd_vqs[1];
> >  
> >      bool vhost_started;
> >      VHostSCSI *vhost_scsi;
> > -- 
> > 1.7.2.5
> --
> To unsubscribe from this list: send the line "unsubscribe target-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [RFC-v2 1/6] msix: Work-around for vhost-scsi with KVM in-kernel MSI injection
From: Nicholas A. Bellinger @ 2012-08-14 20:10 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Anthony Liguori, Stefan Hajnoczi, kvm-devel, Jan Kiszka,
	qemu-devel, Zhi Yong Wu, Anthony Liguori, target-devel,
	Paolo Bonzini, lf-virt, Christoph Hellwig
In-Reply-To: <20120813181705.GC19460@redhat.com>

On Mon, 2012-08-13 at 21:17 +0300, Michael S. Tsirkin wrote:
> On Mon, Aug 13, 2012 at 08:06:17PM +0200, Jan Kiszka wrote:
> > On 2012-08-13 20:03, Michael S. Tsirkin wrote:
> > > On Mon, Aug 13, 2012 at 02:06:10PM +0200, Jan Kiszka wrote:
> > >> On 2012-08-13 10:35, Nicholas A. Bellinger wrote:
> > >>> From: Nicholas Bellinger <nab@linux-iscsi.org>
> > >>>
> > >>> This is required to get past the following assert with:
> > >>>
> > >>> commit 1523ed9e1d46b0b54540049d491475ccac7e6421
> > >>> Author: Jan Kiszka <jan.kiszka@siemens.com>
> > >>> Date:   Thu May 17 10:32:39 2012 -0300
> > >>>
> > >>>     virtio/vhost: Add support for KVM in-kernel MSI injection
> > >>>
> > >>> Cc: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> > >>> Cc: Jan Kiszka <jan.kiszka@siemens.com>
> > >>> Cc: Paolo Bonzini <pbonzini@redhat.com>
> > >>> Cc: Anthony Liguori <aliguori@us.ibm.com>
> > >>> Signed-off-by: Nicholas Bellinger <nab@linux-iscsi.org>
> > >>> ---
> > >>>  hw/msix.c |    3 +++
> > >>>  1 files changed, 3 insertions(+), 0 deletions(-)
> > >>>
> > >>> diff --git a/hw/msix.c b/hw/msix.c
> > >>> index 800fc32..c1e6dc3 100644
> > >>> --- a/hw/msix.c
> > >>> +++ b/hw/msix.c
> > >>> @@ -544,6 +544,9 @@ void msix_unset_vector_notifiers(PCIDevice *dev)
> > >>>  {
> > >>>      int vector;
> > >>>  
> > >>> +    if (!dev->msix_vector_use_notifier && !dev->msix_vector_release_notifier)
> > >>> +        return;
> > >>> +
> > >>>      assert(dev->msix_vector_use_notifier &&
> > >>>             dev->msix_vector_release_notifier);
> > >>>  
> > >>>
> > >>
> > >> I think to remember pointing out that there is a bug somewhere in the
> > >> reset code which deactivates a non-active vhost instance, no?
> > >>
> > >> Jan
> > > 
> > > Could not find it. Could you dig it up pls?
> > 
> > http://thread.gmane.org/gmane.linux.scsi.target.devel/2277/focus=2309
> > 
> > Jan
> 
> Ah yes. So let's not work around, need to get to the bottom of that.
> 

Ok, so the assert being triggered in msix_unset_vector_notifiers()
appears to have been a side effect of the memory corruption bug in
virtio-scsi fixed in Patch #6, and is no longer required to start
vhost-scsi with the bugfix in place.

That said, dropping this patch for RFC-v3..

^ permalink raw reply

* Re: [GIT PULL] VFIO update for 3.6-rc1
From: Alex Williamson @ 2012-08-14 19:59 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, kvm
In-Reply-To: <1344366349.3441.86.camel@ul30vt.home>

On Tue, 2012-08-07 at 13:05 -0600, Alex Williamson wrote:
> Hi Linus,
> 
> The following changes since commit 42a579a0f960081cd16fc945036e4780c3ad3202:
> 
>   Merge branches 'timers-urgent-for-linus' and 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip (2012-08-05 22:28:49 +0300)
> 
> are available in the git repository at:
> 
> 
>   git://github.com/awilliam/linux-vfio.git tags/vfio-for-v3.6-rc1
> 
> for you to fetch changes up to 817fea2df3c24b22f6123dc0106eb063b7132883:
> 
>   vfio: Include vfio.h in installed headers (2012-08-07 11:48:33 -0600)
> 
> ----------------------------------------------------------------
> VFIO for v3.6-rc1
> 
> Just a trivial patch to include vfio.h in the installed headers
> so we can complete userspace integration into QEMU.  Thanks!
> 
> ----------------------------------------------------------------
> Alex Williamson (1):
>       vfio: Include vfio.h in installed headers
> 
>  include/linux/Kbuild | 1 +
>  1 file changed, 1 insertion(+)
> 

Hi Linus,

I'd really appreciate getting this fix in, I can send as a patch if you
prefer for such a trivial change.  Thanks,

Alex

^ permalink raw reply

* Re: [PATCH v8] kvm: notify host when the guest is panicked
From: Peter Maydell @ 2012-08-14 19:58 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Yan Vugenfirer, kvm list, Gleb Natapov, Jan Kiszka,
	Marcelo Tosatti, linux-kernel@vger.kernel.org, qemu-devel,
	Avi Kivity, KAMEZAWA Hiroyuki
In-Reply-To: <87boidgvaq.fsf@codemonkey.ws>

On 14 August 2012 19:53, Anthony Liguori <anthony@codemonkey.ws> wrote:
> Forget about !x86 platforms.  They have their own way to do this sort of
> thing.  Think of this feature like a status LED on a motherboard.  These
> are very common and usually controlled by IO ports.

Please don't forget !x86 platforms, we are cute and loveable really :-)

> We're simply reserving a "status LED" for the guest to indicate that it
> has paniced.  Let's not over engineer this.

...not that QEMU actually has any kind of "front panel lights and switches"
interface at all, it might be nice to have one. I bet a lot of the embedded
boards have function DIP switches, heartbeat LEDs, etc etc...

-- PMM

^ permalink raw reply

* Re: [PATCH 3/4] s390/kvm: Add a channel I/O based virtio transport driver.
From: Anthony Liguori @ 2012-08-14 19:56 UTC (permalink / raw)
  To: Cornelia Huck, KVM, linux-s390, qemu-devel
  Cc: Avi Kivity, Marcelo Tosatti, Rusty Russell, Christian Borntraeger,
	Carsten Otte, Alexander Graf, Heiko Carstens, Martin Schwidefsky,
	Sebastian Ott
In-Reply-To: <1344351168-2568-4-git-send-email-cornelia.huck@de.ibm.com>

Cornelia Huck <cornelia.huck@de.ibm.com> writes:

> Add a driver for kvm guests that matches virtual ccw devices provided
> by the host as virtio bridge devices.
>
> These virtio-ccw devices use a special set of channel commands in order
> to perform virtio functions.
>
> Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>

Hi,

Have you written an appendix for the virtio specification for
virtio-ccw?  I think it would be good to include in this series for the
purposes of review.

Regards,

Anthony Liguori

> ---
>  arch/s390/include/asm/irq.h   |   1 +
>  arch/s390/kernel/irq.c        |   1 +
>  drivers/s390/kvm/Makefile     |   2 +-
>  drivers/s390/kvm/virtio_ccw.c | 761 ++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 764 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/s390/kvm/virtio_ccw.c
>
> diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h
> index 2b9d418..b4bea53 100644
> --- a/arch/s390/include/asm/irq.h
> +++ b/arch/s390/include/asm/irq.h
> @@ -31,6 +31,7 @@ enum interruption_class {
>  	IOINT_CTC,
>  	IOINT_APB,
>  	IOINT_CSC,
> +	IOINT_VIR,
>  	NMI_NMI,
>  	NR_IRQS,
>  };
> diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
> index dd7630d..2cc7eed 100644
> --- a/arch/s390/kernel/irq.c
> +++ b/arch/s390/kernel/irq.c
> @@ -56,6 +56,7 @@ static const struct irq_class intrclass_names[] = {
>  	{.name = "CTC", .desc = "[I/O] CTC" },
>  	{.name = "APB", .desc = "[I/O] AP Bus" },
>  	{.name = "CSC", .desc = "[I/O] CHSC Subchannel" },
> +	{.name = "VIR", .desc = "[I/O] Virtual I/O Devices" },
>  	{.name = "NMI", .desc = "[NMI] Machine Check" },
>  };
>  
> diff --git a/drivers/s390/kvm/Makefile b/drivers/s390/kvm/Makefile
> index 0815690..241891a 100644
> --- a/drivers/s390/kvm/Makefile
> +++ b/drivers/s390/kvm/Makefile
> @@ -6,4 +6,4 @@
>  # it under the terms of the GNU General Public License (version 2 only)
>  # as published by the Free Software Foundation.
>  
> -obj-$(CONFIG_S390_GUEST) += kvm_virtio.o
> +obj-$(CONFIG_S390_GUEST) += kvm_virtio.o virtio_ccw.o
> diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c
> new file mode 100644
> index 0000000..df0f994
> --- /dev/null
> +++ b/drivers/s390/kvm/virtio_ccw.c
> @@ -0,0 +1,761 @@
> +/*
> + * ccw based virtio transport
> + *
> + * Copyright IBM Corp. 2012
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License (version 2 only)
> + * as published by the Free Software Foundation.
> + *
> + *    Author(s): Cornelia Huck <cornelia.huck@de.ibm.com>
> + */
> +
> +#include <linux/kernel_stat.h>
> +#include <linux/init.h>
> +#include <linux/bootmem.h>
> +#include <linux/err.h>
> +#include <linux/virtio.h>
> +#include <linux/virtio_config.h>
> +#include <linux/slab.h>
> +#include <linux/virtio_console.h>
> +#include <linux/interrupt.h>
> +#include <linux/virtio_ring.h>
> +#include <linux/pfn.h>
> +#include <linux/async.h>
> +#include <linux/wait.h>
> +#include <linux/list.h>
> +#include <linux/bitops.h>
> +#include <linux/module.h>
> +#include <asm/io.h>
> +#include <asm/kvm_para.h>
> +#include <asm/setup.h>
> +#include <asm/irq.h>
> +#include <asm/cio.h>
> +#include <asm/ccwdev.h>
> +
> +/*
> + * virtio related functions
> + */
> +
> +struct vq_config_block {
> +	__u16 index;
> +	__u16 num;
> +} __attribute__ ((packed));
> +
> +#define VIRTIO_CCW_CONFIG_SIZE 0x100
> +/* same as PCI config space size, should be enough for all drivers */
> +
> +struct virtio_ccw_device {
> +	struct virtio_device vdev;
> +	__u8 status;
> +	__u8 config[VIRTIO_CCW_CONFIG_SIZE];
> +	struct ccw_device *cdev;
> +	struct ccw1 ccw;
> +	__u32 area;
> +	__u32 curr_io;
> +	int err;
> +	wait_queue_head_t wait_q;
> +	spinlock_t lock;
> +	struct list_head virtqueues;
> +	unsigned long indicators; /* XXX - works because we're under 64 bit */
> +	struct vq_config_block *config_block;
> +};
> +
> +struct vq_info_block {
> +	__u64 queue;
> +	__u16 num;
> +} __attribute__ ((packed));
> +
> +struct virtio_ccw_vq_info {
> +	struct virtqueue *vq;
> +	int num;
> +	int queue_index;
> +	void *queue;
> +	struct vq_info_block *info_block;
> +	struct list_head node;
> +};
> +
> +#define KVM_VIRTIO_CCW_RING_ALIGN 4096
> +
> +#define CCW_CMD_SET_VQ 0x13
> +#define CCW_CMD_VDEV_RESET 0x33
> +#define CCW_CMD_SET_IND 0x43
> +#define CCW_CMD_READ_FEAT 0x12
> +#define CCW_CMD_WRITE_FEAT 0x11
> +#define CCW_CMD_READ_CONF 0x22
> +#define CCW_CMD_WRITE_CONF 0x21
> +#define CCW_CMD_WRITE_STATUS 0x31
> +#define CCW_CMD_READ_VQ_CONF 0x32
> +
> +#define VIRTIO_CCW_DOING_SET_VQ 0x00010000
> +#define VIRTIO_CCW_DOING_RESET 0x00040000
> +#define VIRTIO_CCW_DOING_READ_FEAT 0x00080000
> +#define VIRTIO_CCW_DOING_WRITE_FEAT 0x00100000
> +#define VIRTIO_CCW_DOING_READ_CONFIG 0x00200000
> +#define VIRTIO_CCW_DOING_WRITE_CONFIG 0x00400000
> +#define VIRTIO_CCW_DOING_WRITE_STATUS 0x00800000
> +#define VIRTIO_CCW_DOING_SET_IND 0x01000000
> +#define VIRTIO_CCW_DOING_READ_VQ_CONF 0x02000000
> +#define VIRTIO_CCW_INTPARM_MASK 0xffff0000
> +
> +static struct virtio_ccw_device *to_vc_device(struct virtio_device *vdev)
> +{
> +	return container_of(vdev, struct virtio_ccw_device, vdev);
> +}
> +
> +static int doing_io(struct virtio_ccw_device *vcdev, __u32 flag)
> +{
> +	unsigned long flags;
> +	__u32 ret;
> +
> +	spin_lock_irqsave(get_ccwdev_lock(vcdev->cdev), flags);
> +	if (vcdev->err)
> +		ret = vcdev->err;
> +	else
> +		ret = vcdev->curr_io & flag;
> +	spin_unlock_irqrestore(get_ccwdev_lock(vcdev->cdev), flags);
> +	return ret;
> +}
> +
> +static int ccw_io_helper(struct virtio_ccw_device *vcdev, __u32 intparm)
> +{
> +	int ret;
> +	unsigned long flags;
> +	int flag = intparm & VIRTIO_CCW_INTPARM_MASK;
> +
> +	spin_lock_irqsave(get_ccwdev_lock(vcdev->cdev), flags);
> +	ret = ccw_device_start(vcdev->cdev, &vcdev->ccw, intparm, 0, 0);
> +	if (!ret)
> +		vcdev->curr_io |= flag;
> +	spin_unlock_irqrestore(get_ccwdev_lock(vcdev->cdev), flags);
> +	wait_event(vcdev->wait_q, doing_io(vcdev, flag) == 0);
> +	return ret ? ret : vcdev->err;
> +}
> +
> +static void virtio_ccw_kvm_notify(struct virtqueue *vq)
> +{
> +	struct virtio_ccw_vq_info *info = vq->priv;
> +	struct virtio_ccw_device *vcdev;
> +	struct subchannel_id schid;
> +	__u32 reg2;
> +
> +	vcdev = to_vc_device(info->vq->vdev);
> +	ccw_device_get_schid(vcdev->cdev, &schid);
> +	reg2 = *(__u32 *)&schid;
> +	kvm_hypercall2(3 /* CCW_NOTIFY */, reg2, info->queue_index);
> +}
> +
> +static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev, int index)
> +{
> +	vcdev->config_block->index = index;
> +	vcdev->ccw.cmd_code = CCW_CMD_READ_VQ_CONF;
> +	vcdev->ccw.flags = 0;
> +	vcdev->ccw.count = sizeof(struct vq_config_block);
> +	vcdev->ccw.cda = (__u32)(unsigned long)(vcdev->config_block);
> +	ccw_io_helper(vcdev, VIRTIO_CCW_DOING_READ_VQ_CONF);
> +	return vcdev->config_block->num;
> +}
> +
> +static void virtio_ccw_del_vq(struct virtqueue *vq)
> +{
> +	struct virtio_ccw_device *vcdev = to_vc_device(vq->vdev);
> +	struct virtio_ccw_vq_info *info = vq->priv;
> +	unsigned long flags;
> +	unsigned long size;
> +	int ret;
> +
> +	/* Remove from our list. */
> +	spin_lock_irqsave(&vcdev->lock, flags);
> +	list_del(&info->node);
> +	spin_unlock_irqrestore(&vcdev->lock, flags);
> +
> +	/* Release from host. */
> +	info->info_block->queue = 0;
> +	info->info_block->num = info->queue_index;
> +	vcdev->ccw.cmd_code = CCW_CMD_SET_VQ;
> +	vcdev->ccw.flags = 0;
> +	vcdev->ccw.count = sizeof(*info->info_block);
> +	vcdev->ccw.cda = (__u32)(unsigned long)(info->info_block);
> +	ret = ccw_io_helper(vcdev, VIRTIO_CCW_DOING_SET_VQ | info->queue_index);
> +	if (ret)
> +		dev_warn(&vq->vdev->dev, "Error %x while deleting queue %d",
> +			 ret, info->queue_index);
> +
> +	vring_del_virtqueue(vq);
> +	size = PAGE_ALIGN(vring_size(info->num, KVM_VIRTIO_CCW_RING_ALIGN));
> +	free_pages_exact(info->queue, size);
> +	kfree(info->info_block);
> +	kfree(info);
> +}
> +
> +static void virtio_ccw_del_vqs(struct virtio_device *vdev)
> +{
> +	struct virtqueue *vq, *n;
> +
> +	list_for_each_entry_safe(vq, n, &vdev->vqs, list)
> +		virtio_ccw_del_vq(vq);
> +}
> +
> +static struct virtqueue *virtio_ccw_setup_vq(struct virtio_device *vdev,
> +					     int i, vq_callback_t *callback,
> +					     const char *name)
> +{
> +	struct virtio_ccw_device *vcdev = to_vc_device(vdev);
> +	int err;
> +	struct virtqueue *vq;
> +	struct virtio_ccw_vq_info *info;
> +	unsigned long size;
> +	unsigned long flags;
> +
> +	/* Allocate queue. */
> +	info = kzalloc(sizeof(struct virtio_ccw_vq_info), GFP_KERNEL);
> +	if (!info) {
> +		dev_warn(&vcdev->cdev->dev, "no info\n");
> +		err = -ENOMEM;
> +		goto out_err;
> +	}
> +	info->info_block = kzalloc(sizeof(*info->info_block),
> +				   GFP_DMA | GFP_KERNEL);
> +	if (!info->info_block) {
> +		dev_warn(&vcdev->cdev->dev, "no info block\n");
> +		err = -ENOMEM;
> +		goto out_err;
> +	}
> +	info->queue_index = i;
> +	info->num = virtio_ccw_read_vq_conf(vcdev, i);
> +	size = PAGE_ALIGN(vring_size(info->num, KVM_VIRTIO_CCW_RING_ALIGN));
> +	info->queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
> +	if (info->queue == NULL) {
> +		dev_warn(&vcdev->cdev->dev, "no queue\n");
> +		err = -ENOMEM;
> +		goto out_err;
> +	}
> +	vq = vring_new_virtqueue(info->num, KVM_VIRTIO_CCW_RING_ALIGN, vdev,
> +				 true, info->queue, virtio_ccw_kvm_notify,
> +				 callback, name);
> +	if (!vq) {
> +		dev_warn(&vcdev->cdev->dev, "no vq\n");
> +		err = -ENOMEM;
> +		free_pages_exact(info->queue, size);
> +		goto out_err;
> +	}
> +	info->vq = vq;
> +	vq->priv = info;
> +
> +	/* Register it with the host. */
> +	info->info_block->queue = (__u64)info->queue;
> +	info->info_block->num = info->queue_index;
> +	vcdev->ccw.cmd_code = CCW_CMD_SET_VQ;
> +	vcdev->ccw.flags = 0;
> +	vcdev->ccw.count = sizeof(*info->info_block);
> +	vcdev->ccw.cda = (__u32)(unsigned long)(info->info_block);
> +	err = ccw_io_helper(vcdev, VIRTIO_CCW_DOING_SET_VQ | info->queue_index);
> +	if (err) {
> +		dev_warn(&vcdev->cdev->dev, "SET_VQ failed\n");
> +		free_pages_exact(info->queue, size);
> +		info->vq = NULL;
> +		vq->priv = NULL;
> +		goto out_err;
> +	}
> +
> +	/* Save it to our list. */
> +	spin_lock_irqsave(&vcdev->lock, flags);
> +	list_add(&info->node, &vcdev->virtqueues);
> +	spin_unlock_irqrestore(&vcdev->lock, flags);
> +
> +	return vq;
> +
> +out_err:
> +	if (info)
> +		kfree(info->info_block);
> +	kfree(info);
> +	return ERR_PTR(err);
> +}
> +
> +static int virtio_ccw_find_vqs(struct virtio_device *vdev, unsigned nvqs,
> +			       struct virtqueue *vqs[],
> +			       vq_callback_t *callbacks[],
> +			       const char *names[])
> +{
> +	struct virtio_ccw_device *vcdev = to_vc_device(vdev);
> +	int ret, i;
> +
> +	for (i = 0; i < nvqs; ++i) {
> +		vqs[i] = virtio_ccw_setup_vq(vdev, i, callbacks[i], names[i]);
> +		if (IS_ERR(vqs[i])) {
> +			ret = PTR_ERR(vqs[i]);
> +			vqs[i] = NULL;
> +			goto out;
> +		}
> +	}
> +	/* Register queue indicators with host. */
> +	vcdev->indicators = 0;
> +	vcdev->ccw.cmd_code = CCW_CMD_SET_IND;
> +	vcdev->ccw.flags = 0;
> +	vcdev->ccw.count = sizeof(vcdev->indicators);
> +	vcdev->ccw.cda = (__u32)(unsigned long)(&vcdev->indicators);
> +	ret = ccw_io_helper(vcdev, VIRTIO_CCW_DOING_SET_IND);
> +	if (ret)
> +		goto out;
> +	return 0;
> +out:
> +	virtio_ccw_del_vqs(vdev);
> +	return ret;
> +}
> +
> +static void virtio_ccw_reset(struct virtio_device *vdev)
> +{
> +	struct virtio_ccw_device *vcdev = to_vc_device(vdev);
> +
> +	/* Send a reset ccw on device. */
> +	vcdev->ccw.cmd_code = CCW_CMD_VDEV_RESET;
> +	vcdev->ccw.flags = 0;
> +	vcdev->ccw.count = 0;
> +	vcdev->ccw.cda = 0;
> +	ccw_io_helper(vcdev, VIRTIO_CCW_DOING_RESET);
> +}
> +
> +static u32 virtio_ccw_get_features(struct virtio_device *vdev)
> +{
> +	struct virtio_ccw_device *vcdev = to_vc_device(vdev);
> +	u32 features;
> +	int ret;
> +
> +	/* Read the feature bits from the host. */
> +	vcdev->ccw.cmd_code = CCW_CMD_READ_FEAT;
> +	vcdev->ccw.flags = 0;
> +	vcdev->ccw.count = sizeof(features);
> +	vcdev->ccw.cda = vcdev->area;
> +	ret = ccw_io_helper(vcdev, VIRTIO_CCW_DOING_READ_FEAT);
> +	if (ret)
> +		return 0;
> +
> +	memcpy(&features, (void *)(unsigned long)vcdev->area,
> +	       sizeof(features));
> +	return le32_to_cpu(features);
> +}
> +
> +static void virtio_ccw_finalize_features(struct virtio_device *vdev)
> +{
> +	struct virtio_ccw_device *vcdev = to_vc_device(vdev);
> +
> +	/* Give virtio_ring a chance to accept features. */
> +	vring_transport_features(vdev);
> +
> +	memcpy((void *)(unsigned long)vcdev->area, vdev->features,
> +	       sizeof(*vdev->features));
> +	/* Write the feature bits to the host. */
> +	vcdev->ccw.cmd_code = CCW_CMD_WRITE_FEAT;
> +	/* Sigh. The kernel's features may be longer than the host's. */
> +	vcdev->ccw.flags = CCW_FLAG_SLI;
> +	vcdev->ccw.count = sizeof(*vdev->features);
> +	vcdev->ccw.cda = vcdev->area;
> +	ccw_io_helper(vcdev, VIRTIO_CCW_DOING_WRITE_FEAT);
> +}
> +
> +static void virtio_ccw_get_config(struct virtio_device *vdev,
> +				  unsigned int offset, void *buf, unsigned len)
> +{
> +	struct virtio_ccw_device *vcdev = to_vc_device(vdev);
> +	int ret;
> +
> +	/* Read the config area from the host. */
> +	vcdev->ccw.cmd_code = CCW_CMD_READ_CONF;
> +	vcdev->ccw.flags = 0;
> +	vcdev->ccw.count = offset + len;
> +	vcdev->ccw.cda = vcdev->area;
> +	ret = ccw_io_helper(vcdev, VIRTIO_CCW_DOING_READ_CONFIG);
> +	if (ret)
> +		return;
> +
> +	memcpy(vcdev->config, (void *)(unsigned long)vcdev->area,
> +	       sizeof(vcdev->config));
> +	memcpy(buf, &vcdev->config[offset], len);
> +}
> +
> +static void virtio_ccw_set_config(struct virtio_device *vdev,
> +				  unsigned int offset, const void *buf,
> +				  unsigned len)
> +{
> +	struct virtio_ccw_device *vcdev = to_vc_device(vdev);
> +
> +	memcpy(&vcdev->config[offset], buf, len);
> +	/* Write the config area to the host. */
> +	memcpy((void *)(unsigned long)vcdev->area, vcdev->config,
> +	       sizeof(vcdev->config));
> +	vcdev->ccw.cmd_code = CCW_CMD_WRITE_CONF;
> +	vcdev->ccw.flags = 0;
> +	vcdev->ccw.count = offset + len;
> +	vcdev->ccw.cda = vcdev->area;
> +	ccw_io_helper(vcdev, VIRTIO_CCW_DOING_WRITE_CONFIG);
> +}
> +
> +static u8 virtio_ccw_get_status(struct virtio_device *vdev)
> +{
> +	struct virtio_ccw_device *vcdev = to_vc_device(vdev);
> +
> +	return vcdev->status;
> +}
> +
> +static void virtio_ccw_set_status(struct virtio_device *vdev, u8 status)
> +{
> +	struct virtio_ccw_device *vcdev = to_vc_device(vdev);
> +
> +	/* Write the status to the host. */
> +	vcdev->status = status;
> +	memcpy((void *)(unsigned long)vcdev->area, &status, sizeof(status));
> +	vcdev->ccw.cmd_code = CCW_CMD_WRITE_STATUS;
> +	vcdev->ccw.flags = 0;
> +	vcdev->ccw.count = sizeof(status);
> +	vcdev->ccw.cda = vcdev->area;
> +	ccw_io_helper(vcdev, VIRTIO_CCW_DOING_WRITE_STATUS);
> +}
> +
> +static struct virtio_config_ops virtio_ccw_config_ops = {
> +	.get_features = virtio_ccw_get_features,
> +	.finalize_features = virtio_ccw_finalize_features,
> +	.get = virtio_ccw_get_config,
> +	.set = virtio_ccw_set_config,
> +	.get_status = virtio_ccw_get_status,
> +	.set_status = virtio_ccw_set_status,
> +	.reset = virtio_ccw_reset,
> +	.find_vqs = virtio_ccw_find_vqs,
> +	.del_vqs = virtio_ccw_del_vqs,
> +};
> +
> +
> +/*
> + * ccw bus driver related functions
> + */
> +
> +static void virtio_ccw_release_dev(struct device *_d)
> +{
> +	struct virtio_device *dev = container_of(_d, struct virtio_device,
> +						 dev);
> +	struct virtio_ccw_device *vcdev = to_vc_device(dev);
> +
> +	kfree((void *)(unsigned long)vcdev->area);
> +	kfree(vcdev->config_block);
> +	kfree(vcdev);
> +}
> +
> +static int irb_is_error(struct irb *irb)
> +{
> +	if (scsw_cstat(&irb->scsw) != 0)
> +		return 1;
> +	if (scsw_dstat(&irb->scsw) & ~(DEV_STAT_CHN_END | DEV_STAT_DEV_END))
> +		return 1;
> +	if (scsw_cc(&irb->scsw) != 0)
> +		return 1;
> +	return 0;
> +}
> +
> +static struct virtqueue *virtio_ccw_vq_by_ind(struct virtio_ccw_device *vcdev,
> +					      int index)
> +{
> +	struct virtio_ccw_vq_info *info;
> +	unsigned long flags;
> +	struct virtqueue *vq;
> +
> +	vq = NULL;
> +	spin_lock_irqsave(&vcdev->lock, flags);
> +	list_for_each_entry(info, &vcdev->virtqueues, node) {
> +		if (info->queue_index == index) {
> +			vq = info->vq;
> +			break;
> +		}
> +	}
> +	spin_unlock_irqrestore(&vcdev->lock, flags);
> +	return vq;
> +}
> +
> +static void virtio_ccw_int_handler(struct ccw_device *cdev,
> +				   unsigned long intparm,
> +				   struct irb *irb)
> +{
> +	__u32 activity = intparm & VIRTIO_CCW_INTPARM_MASK;
> +	struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
> +	int i;
> +	struct virtqueue *vq;
> +
> +	/* Check if it's a notification from the host. */
> +	if ((intparm == 0) &&
> +	    (scsw_stctl(&irb->scsw) ==
> +	     (SCSW_STCTL_ALERT_STATUS | SCSW_STCTL_STATUS_PEND))) {
> +		/* OK */
> +	}
> +	if (irb_is_error(irb))
> +		vcdev->err = -EIO; /* XXX - use real error */
> +	if (vcdev->curr_io & activity) {
> +		switch (activity) {
> +		case VIRTIO_CCW_DOING_READ_FEAT:
> +		case VIRTIO_CCW_DOING_WRITE_FEAT:
> +		case VIRTIO_CCW_DOING_READ_CONFIG:
> +		case VIRTIO_CCW_DOING_WRITE_CONFIG:
> +		case VIRTIO_CCW_DOING_WRITE_STATUS:
> +		case VIRTIO_CCW_DOING_SET_VQ:
> +		case VIRTIO_CCW_DOING_SET_IND:
> +		case VIRTIO_CCW_DOING_RESET:
> +		case VIRTIO_CCW_DOING_READ_VQ_CONF:
> +			vcdev->curr_io &= ~activity;
> +			wake_up(&vcdev->wait_q);
> +			break;
> +		default:
> +			/* don't know what to do... */
> +			dev_warn(&cdev->dev, "Suspicious activity '%08x'\n",
> +				 activity);
> +			WARN_ON(1);
> +			break;
> +		}
> +	}
> +	for_each_set_bit(i, &vcdev->indicators,
> +			 sizeof(vcdev->indicators)) {
> +		vq = virtio_ccw_vq_by_ind(vcdev, i);
> +		vring_interrupt(0, vq);
> +		clear_bit(i, &vcdev->indicators);
> +	}
> +}
> +
> +/*
> + * We usually want to autoonline all devices, but give the admin
> + * a way to exempt devices from this.
> + */
> +#define __DEV_WORDS ((__MAX_SUBCHANNEL + (8*sizeof(long) - 1)) / \
> +		     (8*sizeof(long)))
> +static unsigned long devs_no_auto[__MAX_SSID + 1][__DEV_WORDS];
> +
> +static char *no_auto = "";
> +
> +module_param(no_auto, charp, 0444);
> +MODULE_PARM_DESC(no_auto, "list of ccw bus id ranges not to be auto-onlined");
> +
> +static int virtio_ccw_check_autoonline(struct ccw_device *cdev)
> +{
> +	struct ccw_dev_id id;
> +
> +	ccw_device_get_id(cdev, &id);
> +	if (test_bit(id.devno, devs_no_auto[id.ssid]))
> +		return 0;
> +	return 1;
> +}
> +
> +static void virtio_ccw_auto_online(void *data, async_cookie_t cookie)
> +{
> +	struct ccw_device *cdev = data;
> +	int ret;
> +
> +	ret = ccw_device_set_online(cdev);
> +	if (ret)
> +		dev_warn(&cdev->dev, "Failed to set online: %d\n", ret);
> +}
> +
> +static int virtio_ccw_probe(struct ccw_device *cdev)
> +{
> +	cdev->handler = virtio_ccw_int_handler;
> +
> +	if (virtio_ccw_check_autoonline(cdev))
> +		async_schedule(virtio_ccw_auto_online, cdev);
> +	return 0;
> +}
> +
> +static void virtio_ccw_remove(struct ccw_device *cdev)
> +{
> +	cdev->handler = NULL;
> +}
> +
> +static int virtio_ccw_offline(struct ccw_device *cdev)
> +{
> +	struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev);
> +
> +	unregister_virtio_device(&vcdev->vdev);
> +	dev_set_drvdata(&cdev->dev, NULL);
> +	return 0;
> +}
> +
> +
> +/* Area needs to be big enough to fit status, features or configuration. */
> +#define VIRTIO_AREA_SIZE VIRTIO_CCW_CONFIG_SIZE /* biggest possible use */
> +
> +static int virtio_ccw_online(struct ccw_device *cdev)
> +{
> +	int ret;
> +	struct virtio_ccw_device *vcdev;
> +
> +	vcdev = kzalloc(sizeof(*vcdev), GFP_KERNEL);
> +	if (!vcdev) {
> +		dev_warn(&cdev->dev, "Could not get memory for virtio\n");
> +		ret = -ENOMEM;
> +		goto out_free;
> +	}
> +	vcdev->area = (__u32)(unsigned long)kzalloc(VIRTIO_AREA_SIZE,
> +						    GFP_DMA | GFP_KERNEL);
> +	if (!vcdev->area) {
> +		dev_warn(&cdev->dev, "Cound not get memory for virtio\n");
> +		ret = -ENOMEM;
> +		goto out_free;
> +	}
> +	vcdev->config_block = kzalloc(sizeof(*vcdev->config_block),
> +				   GFP_DMA | GFP_KERNEL);
> +	if (!vcdev->config_block) {
> +		ret = -ENOMEM;
> +		goto out_free;
> +	}
> +	vcdev->vdev.dev.parent = &cdev->dev;
> +	vcdev->vdev.dev.release = virtio_ccw_release_dev;
> +	vcdev->vdev.config = &virtio_ccw_config_ops;
> +	vcdev->cdev = cdev;
> +	init_waitqueue_head(&vcdev->wait_q);
> +	INIT_LIST_HEAD(&vcdev->virtqueues);
> +
> +	dev_set_drvdata(&cdev->dev, vcdev);
> +	vcdev->vdev.id.vendor = cdev->id.cu_type;
> +	vcdev->vdev.id.device = cdev->id.cu_model;
> +	ret = register_virtio_device(&vcdev->vdev);
> +	if (ret) {
> +		dev_warn(&cdev->dev, "Failed to register virtio device: %d\n",
> +			 ret);
> +		goto out_put;
> +	}
> +	return 0;
> +out_put:
> +	dev_set_drvdata(&cdev->dev, NULL);
> +	put_device(&vcdev->vdev.dev);
> +	return ret;
> +out_free:
> +	if (vcdev) {
> +		kfree((void *)(unsigned long)vcdev->area);
> +		kfree(vcdev->config_block);
> +	}
> +	kfree(vcdev);
> +	return ret;
> +}
> +
> +static int virtio_ccw_cio_notify(struct ccw_device *cdev, int event)
> +{
> +	/* TODO: Check whether we need special handling here. */
> +	return 0;
> +}
> +
> +static struct ccw_device_id virtio_ids[] = {
> +	{ CCW_DEVICE(0x3832, 0) },
> +	{},
> +};
> +MODULE_DEVICE_TABLE(ccw, virtio_ids);
> +
> +static struct ccw_driver virtio_ccw_driver = {
> +	.driver = {
> +		.owner = THIS_MODULE,
> +		.name = "virtio_ccw",
> +	},
> +	.ids = virtio_ids,
> +	.probe = virtio_ccw_probe,
> +	.remove = virtio_ccw_remove,
> +	.set_offline = virtio_ccw_offline,
> +	.set_online = virtio_ccw_online,
> +	.notify = virtio_ccw_cio_notify,
> +	.int_class = IOINT_VIR,
> +};
> +
> +static int __init pure_hex(char **cp, unsigned int *val, int min_digit,
> +			   int max_digit, int max_val)
> +{
> +	int diff;
> +
> +	diff = 0;
> +	*val = 0;
> +
> +	while (diff <= max_digit) {
> +		int value = hex_to_bin(**cp);
> +
> +		if (value < 0)
> +			break;
> +		*val = *val * 16 + value;
> +		(*cp)++;
> +		diff++;
> +	}
> +
> +	if ((diff < min_digit) || (diff > max_digit) || (*val > max_val))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +static int __init parse_busid(char *str, unsigned int *cssid,
> +			      unsigned int *ssid, unsigned int *devno)
> +{
> +	char *str_work;
> +	int rc, ret;
> +
> +	rc = 1;
> +
> +	if (*str == '\0')
> +		goto out;
> +
> +	str_work = str;
> +	ret = pure_hex(&str_work, cssid, 1, 2, __MAX_CSSID);
> +	if (ret || (str_work[0] != '.'))
> +		goto out;
> +	str_work++;
> +	ret = pure_hex(&str_work, ssid, 1, 1, __MAX_SSID);
> +	if (ret || (str_work[0] != '.'))
> +		goto out;
> +	str_work++;
> +	ret = pure_hex(&str_work, devno, 4, 4, __MAX_SUBCHANNEL);
> +	if (ret || (str_work[0] != '\0'))
> +		goto out;
> +
> +	rc = 0;
> +out:
> +	return rc;
> +}
> +
> +static void __init no_auto_parse(void)
> +{
> +	unsigned int from_cssid, to_cssid, from_ssid, to_ssid, from, to;
> +	char *parm, *str;
> +	int rc;
> +
> +	str = no_auto;
> +	while ((parm = strsep(&str, ","))) {
> +		rc = parse_busid(strsep(&parm, "-"), &from_cssid,
> +				 &from_ssid, &from);
> +		if (rc)
> +			continue;
> +		if (parm != NULL) {
> +			rc = parse_busid(parm, &to_cssid,
> +					 &to_ssid, &to);
> +			if ((from_ssid > to_ssid) ||
> +			    ((from_ssid == to_ssid) && (from > to)))
> +				rc = -EINVAL;
> +		} else {
> +			to_cssid = from_cssid;
> +			to_ssid = from_ssid;
> +			to = from;
> +		}
> +		if (rc)
> +			continue;
> +		while ((from_ssid < to_ssid) ||
> +		       ((from_ssid == to_ssid) && (from <= to))) {
> +			set_bit(from, devs_no_auto[from_ssid]);
> +			from++;
> +			if (from > __MAX_SUBCHANNEL) {
> +				from_ssid++;
> +				from = 0;
> +			}
> +		}
> +	}
> +}
> +
> +static int __init virtio_ccw_init(void)
> +{
> +	/* parse no_auto string before we do anything further */
> +	no_auto_parse();
> +	return ccw_driver_register(&virtio_ccw_driver);
> +}
> +module_init(virtio_ccw_init);
> +
> +static void __exit virtio_ccw_exit(void)
> +{
> +	ccw_driver_unregister(&virtio_ccw_driver);
> +}
> +module_exit(virtio_ccw_exit);
> -- 
> 1.7.11.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe kvm" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply

* Re: [PATCH v8] kvm: notify host when the guest is panicked
From: Anthony Liguori @ 2012-08-14 19:35 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: kvm list, Gleb Natapov, Jan Kiszka, Yan Vugenfirer,
	linux-kernel@vger.kernel.org, qemu-devel, Avi Kivity,
	KAMEZAWA Hiroyuki
In-Reply-To: <20120814191927.GA6058@amt.cnet>

Marcelo Tosatti <mtosatti@redhat.com> writes:

> On Tue, Aug 14, 2012 at 01:53:01PM -0500, Anthony Liguori wrote:
>> Marcelo Tosatti <mtosatti@redhat.com> writes:
>> 
>> > On Tue, Aug 14, 2012 at 05:55:54PM +0300, Yan Vugenfirer wrote:
>> >> 
>> >> On Aug 14, 2012, at 1:42 PM, Jan Kiszka wrote:
>> >> 
>> >> > On 2012-08-14 10:56, Daniel P. Berrange wrote:
>> >> >> On Mon, Aug 13, 2012 at 03:21:32PM -0300, Marcelo Tosatti wrote:
>> >> >>> On Wed, Aug 08, 2012 at 10:43:01AM +0800, Wen Congyang wrote:
>> >> >>>> We can know the guest is panicked when the guest runs on xen.
>> >> >>>> But we do not have such feature on kvm.
>> >> >>>> 
>> >> >>>> Another purpose of this feature is: management app(for example:
>> >> >>>> libvirt) can do auto dump when the guest is panicked. If management
>> >> >>>> app does not do auto dump, the guest's user can do dump by hand if
>> >> >>>> he sees the guest is panicked.
>> >> >>>> 
>> >> >>>> We have three solutions to implement this feature:
>> >> >>>> 1. use vmcall
>> >> >>>> 2. use I/O port
>> >> >>>> 3. use virtio-serial.
>> >> >>>> 
>> >> >>>> We have decided to avoid touching hypervisor. The reason why I choose
>> >> >>>> choose the I/O port is:
>> >> >>>> 1. it is easier to implememt
>> >> >>>> 2. it does not depend any virtual device
>> >> >>>> 3. it can work when starting the kernel
>> >> >>> 
>> >> >>> How about searching for the "Kernel panic - not syncing" string 
>> >> >>> in the guests serial output? Say libvirtd could take an action upon
>> >> >>> that?
>> >> >> 
>> >> >> No, this is not satisfactory. It depends on the guest OS being
>> >> >> configured to use the serial port for console output which we
>> >> >> cannot mandate, since it may well be required for other purposes.
>> >> > 
>> >> Please don't forget Windows guests, there is no console and no "Kernel Panic" string ;)
>> >> 
>> >> What I used for debugging purposes on Windows guest is to register a bugcheck callback in virtio-net driver and write 1 to VIRTIO_PCI_ISR register.
>> >> 
>> >> Yan. 
>> >
>> > Considering whether a "panic-device" should cover other OSes is also \
>
>> > something to consider. Even for Linux, is "panic" the only case which
>> > should be reported via the mechanism? What about oopses without panic? 
>> >
>> > Is the mechanism general enough for supporting new events, etc.
>> 
>> Hi,
>> 
>> I think this discussion is gone of the deep end.
>> 
>> Forget about !x86 platforms.  They have their own way to do this sort of
>> thing.  
>
> The panic function in kernel/panic.c has the following options, which
> appear to be arch independent, on panic:
>
> - reboot 
> - blink

Not sure the semantics of blink but that might be a good place for a
pvops hook.

>
> None are paravirtual interfaces however.
>
>> Think of this feature like a status LED on a motherboard.  These
>> are very common and usually controlled by IO ports.
>> 
>> We're simply reserving a "status LED" for the guest to indicate that it
>> has paniced.  Let's not over engineer this.
>
> My concern is that you end up with state that is dependant on x86.
>
> Subject: [PATCH v8 3/6] add a new runstate: RUN_STATE_GUEST_PANICKED
>
> Having the ability to stop/restart the guest (and even introducing a 
> new VM runstate) is more than a status LED analogy.

I must admit, I don't know why a new runstate is necessary/useful.  The
kernel shouldn't have to care about the difference between a halted guest
and a panicked guest.  That level of information belongs in userspace IMHO.

> Can this new infrastructure be used by other architectures?

I guess I don't understand why the kernel side of this isn't anything
more than a paravirt op hook that does a single outb() with the
remaining logic handled 100% in QEMU.

> Do you consider allowing support for Windows as overengineering?

I don't think there is a way to hook BSOD on Windows so attempting to
engineer something that works with Windows seems odd, no?

Regards,

Anthony Liguori

>
>> Regards,
>> 
>> Anthony Liguori
>> 
>> >
>> >> 
>> >> > Well, we have more than a single serial port, even when leaving
>> >> > virtio-serial aside...
>> >> > 
>> >> > Jan
>> >> > 
>> >> > -- 
>> >> > Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
>> >> > Corporate Competence Center Embedded Linux
>> >> > --
>> >> > To unsubscribe from this list: send the line "unsubscribe kvm" in
>> >> > the body of a message to majordomo@vger.kernel.org
>> >> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [Qemu-devel] [PATCH v8] kvm: notify host when the guest is panicked
From: Marcelo Tosatti @ 2012-08-14 19:19 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Yan Vugenfirer, kvm list, Jan Kiszka,
	linux-kernel@vger.kernel.org, Gleb Natapov, qemu-devel,
	Avi Kivity, KAMEZAWA Hiroyuki
In-Reply-To: <87boidgvaq.fsf@codemonkey.ws>

On Tue, Aug 14, 2012 at 01:53:01PM -0500, Anthony Liguori wrote:
> Marcelo Tosatti <mtosatti@redhat.com> writes:
> 
> > On Tue, Aug 14, 2012 at 05:55:54PM +0300, Yan Vugenfirer wrote:
> >> 
> >> On Aug 14, 2012, at 1:42 PM, Jan Kiszka wrote:
> >> 
> >> > On 2012-08-14 10:56, Daniel P. Berrange wrote:
> >> >> On Mon, Aug 13, 2012 at 03:21:32PM -0300, Marcelo Tosatti wrote:
> >> >>> On Wed, Aug 08, 2012 at 10:43:01AM +0800, Wen Congyang wrote:
> >> >>>> We can know the guest is panicked when the guest runs on xen.
> >> >>>> But we do not have such feature on kvm.
> >> >>>> 
> >> >>>> Another purpose of this feature is: management app(for example:
> >> >>>> libvirt) can do auto dump when the guest is panicked. If management
> >> >>>> app does not do auto dump, the guest's user can do dump by hand if
> >> >>>> he sees the guest is panicked.
> >> >>>> 
> >> >>>> We have three solutions to implement this feature:
> >> >>>> 1. use vmcall
> >> >>>> 2. use I/O port
> >> >>>> 3. use virtio-serial.
> >> >>>> 
> >> >>>> We have decided to avoid touching hypervisor. The reason why I choose
> >> >>>> choose the I/O port is:
> >> >>>> 1. it is easier to implememt
> >> >>>> 2. it does not depend any virtual device
> >> >>>> 3. it can work when starting the kernel
> >> >>> 
> >> >>> How about searching for the "Kernel panic - not syncing" string 
> >> >>> in the guests serial output? Say libvirtd could take an action upon
> >> >>> that?
> >> >> 
> >> >> No, this is not satisfactory. It depends on the guest OS being
> >> >> configured to use the serial port for console output which we
> >> >> cannot mandate, since it may well be required for other purposes.
> >> > 
> >> Please don't forget Windows guests, there is no console and no "Kernel Panic" string ;)
> >> 
> >> What I used for debugging purposes on Windows guest is to register a bugcheck callback in virtio-net driver and write 1 to VIRTIO_PCI_ISR register.
> >> 
> >> Yan. 
> >
> > Considering whether a "panic-device" should cover other OSes is also \

> > something to consider. Even for Linux, is "panic" the only case which
> > should be reported via the mechanism? What about oopses without panic? 
> >
> > Is the mechanism general enough for supporting new events, etc.
> 
> Hi,
> 
> I think this discussion is gone of the deep end.
> 
> Forget about !x86 platforms.  They have their own way to do this sort of
> thing.  

The panic function in kernel/panic.c has the following options, which
appear to be arch independent, on panic:

- reboot 
- blink

None are paravirtual interfaces however.

> Think of this feature like a status LED on a motherboard.  These
> are very common and usually controlled by IO ports.
> 
> We're simply reserving a "status LED" for the guest to indicate that it
> has paniced.  Let's not over engineer this.

My concern is that you end up with state that is dependant on x86.

Subject: [PATCH v8 3/6] add a new runstate: RUN_STATE_GUEST_PANICKED

Having the ability to stop/restart the guest (and even introducing a 
new VM runstate) is more than a status LED analogy.

Can this new infrastructure be used by other architectures?

Do you consider allowing support for Windows as overengineering?

> Regards,
> 
> Anthony Liguori
> 
> >
> >> 
> >> > Well, we have more than a single serial port, even when leaving
> >> > virtio-serial aside...
> >> > 
> >> > Jan
> >> > 
> >> > -- 
> >> > Siemens AG, Corporate Technology, CT RTC ITP SDP-DE
> >> > Corporate Competence Center Embedded Linux
> >> > --
> >> > To unsubscribe from this list: send the line "unsubscribe kvm" in
> >> > the body of a message to majordomo@vger.kernel.org
> >> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* Re: [PATCH v2 3/4] vfio: vfio-pci device assignment driver
From: Alex Williamson @ 2012-08-14 19:09 UTC (permalink / raw)
  To: Jan Kiszka
  Cc: aik@ozlabs.ru, aliguori@us.ibm.com, qemu-devel@nongnu.org,
	kvm@vger.kernel.org
In-Reply-To: <502A8DAB.6010406@siemens.com>

On Tue, 2012-08-14 at 19:40 +0200, Jan Kiszka wrote:
> Just some comments, didn't look at all details.

Thanks!  I'll send out a v3 soon.  I think that the off-by-one in
strncat doesn't exist though, so would appreciate if you could double
check.  Individual comments below...

> On 2012-08-02 21:17, Alex Williamson wrote:
> > +
> > +static int vfio_msix_vector_use(PCIDevice *pdev,
> > +                                unsigned int vector, MSIMessage msg)
> > +{
> > +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > +    int ret, fd;
> > +
> > +    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d used\n", __func__,
> > +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > +            vdev->host.function, vector);
> > +
> > +    if (vdev->interrupt != INT_MSIX) {
> > +        vfio_disable_interrupts(vdev);
> > +    }
> > +
> > +    if (!vdev->msi_vectors) {
> > +        vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(MSIVector));
> > +    }
> > +
> > +    vdev->msi_vectors[vector].vdev = vdev;
> > +    vdev->msi_vectors[vector].vector = vector;
> > +    vdev->msi_vectors[vector].use = true;
> > +
> > +    msix_vector_use(pdev, vector);
> > +
> > +    if (event_notifier_init(&vdev->msi_vectors[vector].interrupt, 0)) {
> > +        error_report("vfio: Error: event_notifier_init failed\n");
> > +    }
> > +
> > +    fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
> > +
> > +    /*
> > +     * Attempt to enable route through KVM irqchip,
> > +     * default to userspace handling if unavailable.
> > +     */
> > +    vdev->msi_vectors[vector].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> > +    if (vdev->msi_vectors[vector].virq < 0 ||
> > +        kvm_irqchip_add_irqfd(kvm_state, fd,
> > +                              vdev->msi_vectors[vector].virq) < 0) {
> 
> If kvm_irqchip_add_irqfd fails, you have to drop the route and set virq
> to -1. Otherwise, you won't match with the release logic below.

Yep, good catch.

> > +        qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> > +                            &vdev->msi_vectors[vector]);
> > +    }
> > +
> > +    /*
> > +     * We don't want to have the host allocate all possible MSI vectors
> > +     * for a device if they're not in use, so we shutdown and incrementally
> > +     * increase them as needed.
> > +     */
> > +    if (vdev->nr_vectors < vector + 1) {
> > +        int i;
> > +
> > +        vfio_disable_irqindex(vdev, VFIO_PCI_MSIX_IRQ_INDEX);
> > +        vdev->nr_vectors = vector + 1;
> > +        ret = vfio_enable_vectors(vdev, true);
> > +        if (ret) {
> > +            error_report("vfio: failed to enable vectors, %d\n", ret);
> > +        }
> > +
> > +        /* We don't know if we've missed interrupts in the interim... */
> > +        for (i = 0; i < vdev->msix->entries; i++) {
> > +            if (vdev->msi_vectors[i].use) {
> > +                msix_notify(&vdev->pdev, i);
> > +            }
> > +        }
> 
> And it wasn't possible to provide an interface with VFIO that allows
> vector addition/removal on the fly? KVM has an aweful one in this
> regard, but that is legacy, VFIO is new. The above logic is a bit ugly IMHO.

Linux doesn't allow it.  VFIO has a VFIO_IRQ_INFO_NORESIZE attribute on
MSI and MSI-X interrupts to expose this restriction and can be unset if
Linux is updated to be more dynamic at some point.

> > +    } else {
> > +        struct vfio_irq_set_fd irq_set_fd = {
> > +            .irq_set = {
> > +                .argsz = sizeof(irq_set_fd),
> > +                .flags = VFIO_IRQ_SET_DATA_EVENTFD |
> > +                         VFIO_IRQ_SET_ACTION_TRIGGER,
> > +                .index = VFIO_PCI_MSIX_IRQ_INDEX,
> > +                .start = vector,
> > +                .count = 1,
> > +            },
> > +            .fd = fd,
> > +        };
> > +        ret = ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd);
> > +        if (ret) {
> > +            error_report("vfio: failed to modify vector, %d\n", ret);
> > +        }
> > +        msix_notify(&vdev->pdev, vector);
> 
> That injection should no longer be needed once we bounce and record in
> the PBA, right? Maybe add a comment for now.

Right.  Ok.

> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int vector)
> > +{
> > +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > +    struct vfio_irq_set_fd irq_set_fd = {
> > +        .irq_set = {
> > +            .argsz = sizeof(irq_set_fd),
> > +            .flags = VFIO_IRQ_SET_DATA_EVENTFD |
> > +                     VFIO_IRQ_SET_ACTION_TRIGGER,
> > +            .index = VFIO_PCI_MSIX_IRQ_INDEX,
> > +            .start = vector,
> > +            .count = 1,
> > +        },
> > +        .fd = -1,
> > +    };
> > +    int fd;
> > +
> > +    DPRINTF("%s(%04x:%02x:%02x.%x) vector %d released\n", __func__,
> > +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > +            vdev->host.function, vector);
> > +
> > +    /*
> > +     * XXX What's the right thing to do here?  This turns off the interrupt
> > +     * completely, but do we really just want to switch the interrupt to
> > +     * bouncing through userspace and let msix.c drop it?  Not sure.
> > +     */
> > +    msix_vector_unuse(pdev, vector);
> > +    ioctl(vdev->fd, VFIO_DEVICE_SET_IRQS, &irq_set_fd);
> > +
> > +    fd = event_notifier_get_fd(&vdev->msi_vectors[vector].interrupt);
> > +
> > +    if (vdev->msi_vectors[vector].virq < 0) {
> > +        qemu_set_fd_handler(fd, NULL, NULL, NULL);
> > +    } else {
> > +        kvm_irqchip_remove_irqfd(kvm_state, fd, vdev->msi_vectors[vector].virq);
> > +        kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[vector].virq);
> > +        vdev->msi_vectors[vector].virq = -1;
> > +    }
> > +
> > +    event_notifier_cleanup(&vdev->msi_vectors[vector].interrupt);
> > +    vdev->msi_vectors[vector].use = false;
> > +}
> > +
> > +/* XXX This should move to msi.c */
> 
> Nope. We rather need notifier support for MSI. I only have an outdated
> patch at hand.

Yes, MSI needs some of the same interface upgrades as MSI-X has.  This
is just a bootstrap for now.

> > +static MSIMessage msi_get_msg(PCIDevice *pdev, unsigned int vector)
> > +{
> > +    uint16_t flags = pci_get_word(pdev->config + pdev->msi_cap + PCI_MSI_FLAGS);
> > +    bool msi64bit = flags & PCI_MSI_FLAGS_64BIT;
> > +    MSIMessage msg;
> > +
> > +    if (msi64bit) {
> > +        msg.address = pci_get_quad(pdev->config +
> > +                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
> > +    } else {
> > +        msg.address = pci_get_long(pdev->config +
> > +                                   pdev->msi_cap + PCI_MSI_ADDRESS_LO);
> > +    }
> > +
> > +    msg.data = pci_get_word(pdev->config + pdev->msi_cap +
> > +                            (msi64bit ? PCI_MSI_DATA_64 : PCI_MSI_DATA_32));
> > +    msg.data += vector;
> > +
> > +    return msg;
> > +}
> > +
> > +/* So should this */
> > +static void msi_set_qsize(PCIDevice *pdev, uint8_t size)
> > +{
> > +    uint8_t *config = pdev->config + pdev->msi_cap;
> > +    uint16_t flags;
> > +
> > +    flags = pci_get_word(config + PCI_MSI_FLAGS);
> > +    flags = le16_to_cpu(flags);
> > +    flags &= ~PCI_MSI_FLAGS_QSIZE;
> > +    flags |= (size & 0x7) << 4;
> > +    flags = cpu_to_le16(flags);
> > +    pci_set_word(config + PCI_MSI_FLAGS, flags);
> 
> Hmm...
> 
> > +}
> > +
> > +static void vfio_enable_msi(VFIODevice *vdev)
> > +{
> > +    int ret, i;
> > +
> > +    vfio_disable_interrupts(vdev);
> > +
> > +    vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
> > +retry:
> > +    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(MSIVector));
> > +
> > +    for (i = 0; i < vdev->nr_vectors; i++) {
> > +        MSIMessage msg;
> > +        int fd;
> > +
> > +        vdev->msi_vectors[i].vdev = vdev;
> > +        vdev->msi_vectors[i].vector = i;
> > +        vdev->msi_vectors[i].use = true;
> > +
> > +        if (event_notifier_init(&vdev->msi_vectors[i].interrupt, 0)) {
> > +            error_report("vfio: Error: event_notifier_init failed\n");
> > +        }
> > +
> > +        fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
> > +
> > +        msg = msi_get_msg(&vdev->pdev, i);
> > +
> > +        /*
> > +         * Attempt to enable route through KVM irqchip,
> > +         * default to userspace handling if unavailable.
> > +         */
> > +        vdev->msi_vectors[i].virq = kvm_irqchip_add_msi_route(kvm_state, msg);
> > +        if (vdev->msi_vectors[i].virq < 0 ||
> > +            kvm_irqchip_add_irqfd(kvm_state, fd,
> > +                                  vdev->msi_vectors[i].virq) < 0) {
> > +            qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL,
> > +                                &vdev->msi_vectors[i]);
> > +        }
> > +    }
> > +
> > +    ret = vfio_enable_vectors(vdev, false);
> > +    if (ret) {
> > +        if (ret < 0) {
> > +            error_report("vfio: Error: Failed to setup MSI fds: %s\n",
> > +                         strerror(errno));
> > +        } else if (ret != vdev->nr_vectors) {
> > +            error_report("vfio: Error: Failed to enable %d "
> > +                         "MSI vectors, retry with %d\n", vdev->nr_vectors, ret);
> > +        }
> > +
> > +        for (i = 0; i < vdev->nr_vectors; i++) {
> > +            int fd = event_notifier_get_fd(&vdev->msi_vectors[i].interrupt);
> > +            if (vdev->msi_vectors[i].virq >= 0) {
> > +                kvm_irqchip_remove_irqfd(kvm_state, fd,
> > +                                         vdev->msi_vectors[i].virq);
> > +                kvm_irqchip_release_virq(kvm_state, vdev->msi_vectors[i].virq);
> > +                vdev->msi_vectors[i].virq = -1;
> > +            } else {
> > +                qemu_set_fd_handler(fd, NULL, NULL, NULL);
> > +            }
> > +            event_notifier_cleanup(&vdev->msi_vectors[i].interrupt);
> > +        }
> > +
> > +        g_free(vdev->msi_vectors);
> > +
> > +        if (ret > 0 && ret != vdev->nr_vectors) {
> > +            vdev->nr_vectors = ret;
> > +            goto retry;
> > +        }
> > +        vdev->nr_vectors = 0;
> > +
> > +        return;
> > +    }
> > +
> > +    msi_set_qsize(&vdev->pdev, vdev->nr_vectors);
> 
> Hmmm... Can we really patch qsize? While the guest is already running
> and possibly evaluated this field before? IOW: Does the spec allow it to
> be changed by the device and, if yes, in which states?

This is really wishful thinking on my part, I haven't proven that it
does anything or that anyone ever resamples it.  It's probably
reasonable to have VFIO report that only a single vector is available
(at least on and x86 host) so we could lie in qmask.  As you know, our
failure options are pretty limited for enabling either MSI or MSI-X, but
we also don't want to sit on a pile of vectors in the host.

> > +
> > +    DPRINTF("%s(%04x:%02x:%02x.%x) Enabled %d MSI vectors\n", __func__,
> > +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > +            vdev->host.function, vdev->nr_vectors);
> > +}
> > +
> 
> ...
> 
> > +
> > +static int vfio_setup_msi(VFIODevice *vdev, int pos)
> > +{
> > +    uint16_t ctrl;
> > +    bool msi_64bit, msi_maskbit;
> > +    int ret, entries;
> > +
> > +    if (!msi_supported) {
> 
> Not critical, but I would prefer to keep this variable in the context of
> the MSI core. msi_init will return ENOTSUP, and you could handle that
> gracefully.

Ok, I'll add a TODO to remove it.

> > +        return 0;
> > +    }
> > +
> > +    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
> > +              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
> > +        return -1;
> > +    }
> > +    ctrl = le16_to_cpu(ctrl);
> > +
> > +    msi_64bit = !!(ctrl & PCI_MSI_FLAGS_64BIT);
> > +    msi_maskbit = !!(ctrl & PCI_MSI_FLAGS_MASKBIT);
> > +    entries = 1 << ((ctrl & PCI_MSI_FLAGS_QMASK) >> 1);
> > +
> > +    DPRINTF("%04x:%02x:%02x.%x PCI MSI CAP @0x%x\n", vdev->host.domain,
> > +            vdev->host.bus, vdev->host.slot, vdev->host.function, pos);
> > +
> > +    ret = msi_init(&vdev->pdev, pos, entries, msi_64bit, msi_maskbit);
> > +    if (ret < 0) {
> > +        error_report("vfio: msi_init failed\n");
> > +        return ret;
> > +    }
> > +    vdev->msi_cap_size = 0xa + (msi_maskbit ? 0xa : 0) + (msi_64bit ? 0x4 : 0);
> > +
> > +    return 0;
> > +}
> > +
> > +/*
> > + * We don't have any control over how pci_add_capability() inserts
> > + * capabilities into the chain.
> 
> What control is missing precisely? Can pci_add_capability be improved to
> simplify the early setup? I don't see it (msix_init requires the
> parameters), but the comment suggests this somehow.

Capabilities are a linked list and pci_add_capability always inserts at
the head of the list.  msix_init and msi_init want to call
pci_add_capability as part of their setup, that means to get a
capability list to match as close as possible to hardware (same
capability offsets and ordering), we have to walk to the end of the
chain on the physical device and add each to the virtual config space,
calling msi_init and msix_init at the right points along the way.  This
leads to vfio_add_std_cap being a recursive function to handle this.

> >  In order to setup MSI-X we need a
> > + * MemoryRegion for the BAR.  In order to setup the BAR and not
> > + * attempt to mmap the MSI-X table area, which VFIO won't allow, we
> > + * need to first look for where the MSI-X table lives.  So we
> > + * unfortunately split MSI-X setup across two functions.
> > + */
> > +static int vfio_early_setup_msix(VFIODevice *vdev)
> > +{
> > +    uint8_t pos;
> > +    uint16_t ctrl;
> > +    uint32_t table, pba;
> > +
> > +    pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
> > +    if (!pos) {
> > +        return 0;
> > +    }
> > +
> > +    if (pread(vdev->fd, &ctrl, sizeof(ctrl),
> > +              vdev->config_offset + pos + PCI_CAP_FLAGS) != sizeof(ctrl)) {
> > +        return -1;
> > +    }
> > +
> > +    if (pread(vdev->fd, &table, sizeof(table),
> > +              vdev->config_offset + pos + PCI_MSIX_TABLE) != sizeof(table)) {
> > +        return -1;
> > +    }
> > +
> > +    if (pread(vdev->fd, &pba, sizeof(pba),
> > +              vdev->config_offset + pos + PCI_MSIX_PBA) != sizeof(pba)) {
> > +        return -1;
> > +    }
> > +
> > +    ctrl = le16_to_cpu(ctrl);
> > +    table = le32_to_cpu(table);
> > +    pba = le32_to_cpu(pba);
> > +
> > +    vdev->msix = g_malloc0(sizeof(*(vdev->msix)));
> > +    vdev->msix->table_bar = table & PCI_MSIX_FLAGS_BIRMASK;
> > +    vdev->msix->table_offset = table & ~PCI_MSIX_FLAGS_BIRMASK;
> > +    vdev->msix->pba_bar = pba & PCI_MSIX_FLAGS_BIRMASK;
> > +    vdev->msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
> > +    vdev->msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
> > +
> > +    DPRINTF("%04x:%02x:%02x.%x "
> > +            "PCI MSI-X CAP @0x%x, BAR %d, offset 0x%x, entries %d\n",
> > +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > +            vdev->host.function, pos, vdev->msix->table_bar,
> > +            vdev->msix->table_offset, vdev->msix->entries);
> > +
> > +    return 0;
> > +}
> > +
> > +static int vfio_setup_msix(VFIODevice *vdev, int pos)
> > +{
> > +    int ret;
> > +
> > +    if (!msi_supported) {
> 
> See above.

Yep, noted.

> > +        return 0;
> > +    }
> > +
> > +    ret = msix_init(&vdev->pdev, vdev->msix->entries,
> > +                    &vdev->bars[vdev->msix->table_bar].mem,
> > +                    vdev->msix->table_bar, vdev->msix->table_offset,
> > +                    &vdev->bars[vdev->msix->pba_bar].mem,
> > +                    vdev->msix->pba_bar, vdev->msix->pba_offset, pos);
> > +    if (ret < 0) {
> > +        error_report("vfio: msix_init failed\n");
> > +        return ret;
> > +    }
> > +
> > +    ret = msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
> > +                                    vfio_msix_vector_release);
> > +    if (ret) {
> > +        error_report("vfio: msix_set_vector_notifiers failed %d\n", ret);
> > +        msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
> > +                    &vdev->bars[vdev->msix->pba_bar].mem);
> > +        return ret;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +static void vfio_teardown_msi(VFIODevice *vdev)
> > +{
> > +    msi_uninit(&vdev->pdev);
> > +
> > +    if (vdev->msix) {
> > +        /* FIXME: Why can't unset just silently do nothing?? */
> 
> Yep, that would be better.

New movie title: Attack of the Unnecessary Asserts ;)

> > +        if (vdev->pdev.msix_vector_use_notifier &&
> > +            vdev->pdev.msix_vector_release_notifier) {
> > +            msix_unset_vector_notifiers(&vdev->pdev);
> > +        }
> > +
> > +        msix_uninit(&vdev->pdev, &vdev->bars[vdev->msix->table_bar].mem,
> > +                    &vdev->bars[vdev->msix->pba_bar].mem);
> > +    }
> > +}
> > +
> > +/*
> > + * Resource setup
> > + */
> > +static void vfio_unmap_bar(VFIODevice *vdev, int nr)
> > +{
> > +    VFIOBAR *bar = &vdev->bars[nr];
> > +
> > +    if (!bar->size) {
> > +        return;
> > +    }
> > +
> > +    memory_region_del_subregion(&bar->mem, &bar->mmap_mem);
> > +    munmap(bar->mmap, memory_region_size(&bar->mmap_mem));
> > +
> > +    if (vdev->msix && vdev->msix->table_bar == nr) {
> > +        memory_region_del_subregion(&bar->mem, &vdev->msix->mmap_mem);
> > +        munmap(vdev->msix->mmap, memory_region_size(&vdev->msix->mmap_mem));
> > +    }
> > +
> > +    memory_region_destroy(&bar->mem);
> > +}
> > +
> > +static int vfio_mmap_bar(VFIOBAR *bar, MemoryRegion *mem, MemoryRegion *submem,
> > +                         void **map, size_t size, off_t offset,
> > +                         const char *name)
> > +{
> > +    int ret = 0;
> > +
> > +    if (size && bar->flags & VFIO_REGION_INFO_FLAG_MMAP) {
> > +        int prot = 0;
> > +
> > +        if (bar->flags & VFIO_REGION_INFO_FLAG_READ) {
> > +            prot |= PROT_READ;
> > +        }
> > +
> > +        if (bar->flags & VFIO_REGION_INFO_FLAG_WRITE) {
> > +            prot |= PROT_WRITE;
> > +        }
> > +
> > +        *map = mmap(NULL, size, prot, MAP_SHARED,
> > +                    bar->fd, bar->fd_offset + offset);
> > +        if (*map == MAP_FAILED) {
> > +            *map = NULL;
> > +            ret = -errno;
> > +            goto empty_region;
> > +        }
> > +
> > +        memory_region_init_ram_ptr(submem, name, size, *map);
> > +    } else {
> > +empty_region:
> > +        /* Create a zero sized sub-region to make cleanup easy. */
> > +        memory_region_init(submem, name, 0);
> > +    }
> > +
> > +    memory_region_add_subregion(mem, offset, submem);
> > +
> > +    return ret;
> > +}
> > +
> > +static void vfio_map_bar(VFIODevice *vdev, int nr)
> > +{
> > +    VFIOBAR *bar = &vdev->bars[nr];
> > +    unsigned size = bar->size;
> > +    char name[64];
> > +    uint32_t pci_bar;
> > +    uint8_t type;
> > +    int ret;
> > +
> > +    /* Skip both unimplemented BARs and the upper half of 64bit BARS. */
> > +    if (!size) {
> > +        return;
> > +    }
> > +
> > +    snprintf(name, sizeof(name), "VFIO %04x:%02x:%02x.%x BAR %d",
> > +             vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > +             vdev->host.function, nr);
> > +
> > +    /* Determine what type of BAR this is for registration */
> > +    ret = pread(vdev->fd, &pci_bar, sizeof(pci_bar),
> > +                vdev->config_offset + PCI_BASE_ADDRESS_0 + (4 * nr));
> > +    if (ret != sizeof(pci_bar)) {
> > +        error_report("vfio: Failed to read BAR %d (%s)\n", nr, strerror(errno));
> > +        return;
> > +    }
> > +
> > +    pci_bar = le32_to_cpu(pci_bar);
> > +    type = pci_bar & (pci_bar & PCI_BASE_ADDRESS_SPACE_IO ?
> > +           ~PCI_BASE_ADDRESS_IO_MASK : ~PCI_BASE_ADDRESS_MEM_MASK);
> > +
> > +    /* A "slow" read/write mapping underlies all BARs */
> > +    memory_region_init_io(&bar->mem, &vfio_bar_ops, bar, name, size);
> > +    pci_register_bar(&vdev->pdev, nr, type, &bar->mem);
> > +
> > +    /*
> > +     * We can't mmap areas overlapping the MSIX vector table, so we
> > +     * potentially insert a direct-mapped subregion before and after it.
> > +     */
> > +    if (vdev->msix && vdev->msix->table_bar == nr) {
> > +        size = vdev->msix->table_offset & TARGET_PAGE_MASK;
> > +    }
> > +
> > +    strncat(name, " mmap", sizeof(name) - strlen(name) - 1);
> 
> This could generate an unterminated name if we actually have to cut the
> appended string. You could set name[sizeof(name)-1] = 0.

strncat adds the terminator, that's why we have the -1 so that there's
space for it.  strlen does not include the terminator.

> > +    if (vfio_mmap_bar(bar, &bar->mem,
> > +                      &bar->mmap_mem, &bar->mmap, size, 0, name)) {
> > +        error_report("%s unsupported. Performance may be slow\n", name);
> > +    }
> > +
> > +    if (vdev->msix && vdev->msix->table_bar == nr) {
> > +        unsigned start;
> > +
> > +        start = TARGET_PAGE_ALIGN(vdev->msix->table_offset +
> > +                                  (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
> > +
> > +        size = start < bar->size ? bar->size - start : 0;
> > +        strncat(name, " msix-hi", sizeof(name) - strlen(name) - 1);
> 
> Same here.
> 
> > +        /* MSIXInfo contains another MemoryRegion for this mapping */
> > +        if (vfio_mmap_bar(bar, &bar->mem, &vdev->msix->mmap_mem,
> > +                          &vdev->msix->mmap, size, start, name)) {
> > +            error_report("%s unsupported. Performance may be slow\n", name);
> > +        }
> > +    }
> > +
> > +    return;
> 
> Unneeded return.

Thanks.

> > +}
> > +
> 
> ...
> 
> > +
> > +static int vfio_initfn(struct PCIDevice *pdev)
> > +{
> > +    VFIODevice *pvdev, *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > +    VFIOGroup *group;
> > +    char path[PATH_MAX], iommu_group_path[PATH_MAX], *group_name;
> > +    ssize_t len;
> > +    struct stat st;
> > +    int groupid;
> > +    int ret;
> > +
> > +    /* Check that the host device exists */
> > +    snprintf(path, sizeof(path),
> > +             "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
> > +             vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > +             vdev->host.function);
> > +    if (stat(path, &st) < 0) {
> > +        error_report("vfio: error: no such host device: %s", path);
> > +        return -1;
> > +    }
> > +
> > +    strncat(path, "iommu_group", sizeof(path) - strlen(path) - 1);
> 
> See above for the termination problem.
> 
> > +
> > +    len = readlink(path, iommu_group_path, PATH_MAX);
> > +    if (len <= 0) {
> > +        error_report("vfio: error no iommu_group for device\n");
> > +        return -1;
> > +    }
> > +
> > +    iommu_group_path[len] = 0;
> > +    group_name = basename(iommu_group_path);
> > +
> > +    if (sscanf(group_name, "%d", &groupid) != 1) {
> > +        error_report("vfio: error reading %s: %s", path, strerror(errno));
> > +        return -1;
> > +    }
> > +
> > +    DPRINTF("%s(%04x:%02x:%02x.%x) group %d\n", __func__, vdev->host.domain,
> > +            vdev->host.bus, vdev->host.slot, vdev->host.function, groupid);
> > +
> > +    group = vfio_get_group(groupid);
> > +    if (!group) {
> > +        error_report("vfio: failed to get group %d", groupid);
> > +        return -1;
> > +    }
> > +
> > +    snprintf(path, sizeof(path), "%04x:%02x:%02x.%01x",
> > +            vdev->host.domain, vdev->host.bus, vdev->host.slot,
> > +            vdev->host.function);
> > +
> > +    QLIST_FOREACH(pvdev, &group->device_list, next) {
> > +        if (pvdev->host.domain == vdev->host.domain &&
> > +            pvdev->host.bus == vdev->host.bus &&
> > +            pvdev->host.slot == vdev->host.slot &&
> > +            pvdev->host.function == vdev->host.function) {
> > +
> > +            error_report("vfio: error: device %s is already attached\n", path);
> > +            vfio_put_group(group);
> > +            return -1;
> > +        }
> > +    }
> > +
> > +    ret = vfio_get_device(group, path, vdev);
> > +    if (ret) {
> > +        error_report("vfio: failed to get device %s", path);
> > +        vfio_put_group(group);
> > +        return -1;
> > +    }
> > +
> > +    /* Get a copy of config space */
> > +    assert(pci_config_size(&vdev->pdev) <= vdev->config_size);
> > +    ret = pread(vdev->fd, vdev->pdev.config,
> > +                pci_config_size(&vdev->pdev), vdev->config_offset);
> > +    if (ret < (int)pci_config_size(&vdev->pdev)) {
> > +        error_report("vfio: Failed to read device config space\n");
> > +        goto out_put;
> > +    }
> > +
> > +    /*
> > +     * Clear host resource mapping info.  If we choose not to register a
> > +     * BAR, such as might be the case with the option ROM, we can get
> > +     * confusing, unwritable, residual addresses from the host here.
> > +     */
> > +    memset(&vdev->pdev.config[PCI_BASE_ADDRESS_0], 0, 24);
> > +    memset(&vdev->pdev.config[PCI_ROM_ADDRESS], 0, 4);
> > +
> > +    vfio_load_rom(vdev);
> > +
> > +    if (vfio_early_setup_msix(vdev)) {
> > +        goto out_put;
> > +    }
> > +
> > +    vfio_map_bars(vdev);
> > +
> > +    if (vfio_add_capabilities(vdev)) {
> > +        goto out_teardown;
> > +    }
> > +
> > +    if (vfio_pci_read_config(&vdev->pdev, PCI_INTERRUPT_PIN, 1)) {
> > +        pci_device_set_intx_routing_notifier(&vdev->pdev, vfio_update_irq);
> > +    }
> > +
> > +    if (vfio_enable_intx(vdev)) {
> 
> Although vfio_enable_intx also check for PCI_INTERRUPT_PIN, I would move
> this under the test above - more consistent when reading the code.

Sure, that makes sense.

> > +        goto out_teardown;
> > +    }
> > +
> > +    return 0;
> > +
> > +out_teardown:
> > +    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> > +    vfio_teardown_msi(vdev);
> > +    vfio_unmap_bars(vdev);
> > +out_put:
> > +    vfio_put_device(vdev);
> > +    vfio_put_group(group);
> > +    return -1;
> > +}
> > +
> > +static void vfio_exitfn(struct PCIDevice *pdev)
> > +{
> > +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > +    VFIOGroup *group = vdev->group;
> > +
> > +    pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
> > +    vfio_disable_interrupts(vdev);
> > +    vfio_teardown_msi(vdev);
> > +    vfio_unmap_bars(vdev);
> > +    vfio_put_device(vdev);
> > +    vfio_put_group(group);
> > +}
> > +
> > +static void vfio_reset(DeviceState *dev)
> > +{
> > +    PCIDevice *pdev = DO_UPCAST(PCIDevice, qdev, dev);
> > +    VFIODevice *vdev = DO_UPCAST(VFIODevice, pdev, pdev);
> > +
> > +    if (!vdev->reset_works) {
> > +        return;
> > +    }
> > +
> > +    if (ioctl(vdev->fd, VFIO_DEVICE_RESET)) {
> > +        error_report("vfio: Error unable to reset physical device "
> > +                     "(%04x:%02x:%02x.%x): %s\n", vdev->host.domain,
> > +                     vdev->host.bus, vdev->host.slot, vdev->host.function,
> > +                     strerror(errno));
> > +    }
> > +}
> > +
> > +static Property vfio_pci_dev_properties[] = {
> > +    DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIODevice, host),
> > +    /*
> > +     * TODO - support passed fds... is this necessary?
> > +     * DEFINE_PROP_STRING("vfiofd", VFIODevice, vfiofd_name),
> > +     * DEFINE_PROP_STRING("vfiogroupfd, VFIODevice, vfiogroupfd_name),
> > +     */
> > +    DEFINE_PROP_END_OF_LIST(),
> > +};
> > +
> > +
> > +static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
> > +{
> > +    PCIDeviceClass *dc = PCI_DEVICE_CLASS(klass);
> > +
> > +    dc->parent_class.reset = vfio_reset;
> > +    dc->init = vfio_initfn;
> > +    dc->exit = vfio_exitfn;
> > +    dc->config_read = vfio_pci_read_config;
> > +    dc->config_write = vfio_pci_write_config;
> > +    dc->parent_class.props = vfio_pci_dev_properties;
> > +}
> > +
> > +static TypeInfo vfio_pci_dev_info = {
> > +    .name          = "vfio-pci",
> > +    .parent        = TYPE_PCI_DEVICE,
> > +    .instance_size = sizeof(VFIODevice),
> > +    .class_init    = vfio_pci_dev_class_init,
> > +};
> > +
> > +static void register_vfio_pci_dev_type(void)
> > +{
> > +    type_register_static(&vfio_pci_dev_info);
> > +}
> > +
> > +type_init(register_vfio_pci_dev_type)
> > diff --git a/hw/vfio_pci.h b/hw/vfio_pci.h
> > new file mode 100644
> > index 0000000..0a71bce
> > --- /dev/null
> > +++ b/hw/vfio_pci.h
> > @@ -0,0 +1,101 @@
> > +#ifndef HW_VFIO_PCI_H
> > +#define HW_VFIO_PCI_H
> > +
> > +#include "qemu-common.h"
> > +#include "qemu-queue.h"
> > +#include "pci.h"
> > +#include "event_notifier.h"
> > +
> > +typedef struct VFIOBAR {
> > +    off_t fd_offset; /* offset of BAR within device fd */
> > +    int fd; /* device fd, allows us to pass VFIOBAR as opaque data */
> > +    MemoryRegion mem; /* slow, read/write access */
> > +    MemoryRegion mmap_mem; /* direct mapped access */
> > +    void *mmap;
> > +    size_t size;
> > +    uint32_t flags; /* VFIO region flags (rd/wr/mmap) */
> > +    uint8_t nr; /* cache the BAR number for debug */
> > +} VFIOBAR;
> > +
> > +typedef struct INTx {
> > +    bool pending; /* interrupt pending */
> > +    bool kvm_accel; /* set when QEMU bypass through KVM enabled */
> > +    uint8_t pin; /* which pin to pull for qemu_set_irq */
> > +    EventNotifier interrupt; /* eventfd triggered on interrupt */
> > +    EventNotifier unmask; /* eventfd for unmask on QEMU bypass */
> > +    PCIINTxRoute route; /* routing info for QEMU bypass */
> > +} INTx;
> 
> Please add a VFIO prefix.

Ok

> > +
> > +struct VFIODevice;
> > +
> > +typedef struct MSIVector {
> > +    EventNotifier interrupt; /* eventfd triggered on interrupt */
> > +    struct VFIODevice *vdev; /* back pointer to device */
> > +    int vector; /* the vector number for this element */
> 
> Could also be calculated via vector - vector->vdev->msi_vectors. But I
> don't mind.

True, I'll add a note for now.

> > +    int virq; /* KVM irqchip route for QEMU bypass */
> > +    bool use;
> > +} MSIVector;
> 
> Also here. Just in case we ever decide to introduce a generic structure
> with this name.

Yep

> > +
> > +enum {
> > +    INT_NONE = 0,
> > +    INT_INTx = 1,
> > +    INT_MSI  = 2,
> > +    INT_MSIX = 3,
> > +};

And these.

> > +
> > +struct VFIOGroup;
> > +
> > +typedef struct VFIOContainer {
> > +    int fd; /* /dev/vfio/vfio, empowered by the attached groups */
> > +    struct {
> > +        /* enable abstraction to support various iommu backends */
> > +        union {
> > +            MemoryListener listener; /* Used by type1 iommu */
> > +        };
> > +        void (*release)(struct VFIOContainer *);
> > +    } iommu_data;
> > +    QLIST_HEAD(, VFIOGroup) group_list;
> > +    QLIST_ENTRY(VFIOContainer) next;
> > +} VFIOContainer;
> > +
> > +/* Cache of MSI-X setup plus extra mmap and memory region for split BAR map */
> > +typedef struct MSIXInfo {
> > +    uint8_t table_bar;
> > +    uint8_t pba_bar;
> > +    uint16_t entries;
> > +    uint32_t table_offset;
> > +    uint32_t pba_offset;
> > +    MemoryRegion mmap_mem;
> > +    void *mmap;
> > +} MSIXInfo;
> 
> Also a pretty generic name.

Yep s/MSIXInfo/VFIOMSIXInfo/g/

> > +
> > +typedef struct VFIODevice {
> > +    PCIDevice pdev;
> > +    int fd;
> > +    INTx intx;
> > +    unsigned int config_size;
> > +    off_t config_offset; /* Offset of config space region within device fd */
> > +    unsigned int rom_size;
> > +    off_t rom_offset; /* Offset of ROM region within device fd */
> > +    int msi_cap_size;
> > +    MSIVector *msi_vectors;
> > +    MSIXInfo *msix;
> > +    int nr_vectors; /* Number of MSI/MSIX vectors currently in use */
> > +    int interrupt; /* Current interrupt type */
> > +    VFIOBAR bars[PCI_NUM_REGIONS - 1]; /* No ROM */
> > +    PCIHostDeviceAddress host;
> > +    QLIST_ENTRY(VFIODevice) next;
> > +    struct VFIOGroup *group;
> > +    bool reset_works;
> > +} VFIODevice;
> > +
> > +typedef struct VFIOGroup {
> > +    int fd;
> > +    int groupid;
> > +    VFIOContainer *container;
> > +    QLIST_HEAD(, VFIODevice) device_list;
> > +    QLIST_ENTRY(VFIOGroup) next;
> > +    QLIST_ENTRY(VFIOGroup) container_next;
> > +} VFIOGroup;
> > +
> > +#endif /* HW_VFIO_PCI_H */
> > 
> 
> Why do all theses structs have to go into a header file? Will there be
> more users than vfio_pci.c?

Only to avoid cluttering vfio_pci.c.  Anthony suggested renaming this to
vfio_pci_int.h (internal), which I've done.  I wouldn't be opposed to
rolling this into vfio_pci.c if that's the preference.  Thanks for the
review!

Alex

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox