kvm.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h
  2011-02-18  8:53 [PATCH 0/4 v9] MSI-X MMIO support for KVM Sheng Yang
@ 2011-02-18  8:53 ` Sheng Yang
  0 siblings, 0 replies; 21+ messages in thread
From: Sheng Yang @ 2011-02-18  8:53 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: kvm, Michael S. Tsirkin, Sheng Yang

This allows it to be used by other structs in kvm_host.h.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 include/linux/kvm_host.h |   23 +++++++++++++++++++++++
 virt/kvm/iodev.h         |   25 +------------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b5021db..7d313e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+	int (*read)(struct kvm_io_device *this,
+		    gpa_t addr,
+		    int len,
+		    void *val);
+	int (*write)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     const void *val);
+	void (*destructor)(struct kvm_io_device *this);
+};
+
+struct kvm_io_device {
+	const struct kvm_io_device_ops *ops;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 12fd3ca..d1f5651 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,32 +17,9 @@
 #define __KVM_IODEV_H__
 
 #include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
 #include <asm/errno.h>
 
-struct kvm_io_device;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-	int (*read)(struct kvm_io_device *this,
-		    gpa_t addr,
-		    int len,
-		    void *val);
-	int (*write)(struct kvm_io_device *this,
-		     gpa_t addr,
-		     int len,
-		     const void *val);
-	void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-	const struct kvm_io_device_ops *ops;
-};
-
 static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 				     const struct kvm_io_device_ops *ops)
 {
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 0/4 v10] MSI-X MMIO support for KVM
@ 2011-02-24  9:51 Sheng Yang
  2011-02-24  9:51 ` [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
                   ` (3 more replies)
  0 siblings, 4 replies; 21+ messages in thread
From: Sheng Yang @ 2011-02-24  9:51 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Alex Williamson, Michael S. Tsirkin, kvm, Sheng Yang

Change from v8:
1. Fix one MSI-X routing update exit bug.
2. Update according to the comments of Alex and Michael.

Note that this patchset is still based on 2.6.37 because of a blocking bug
with assigned devices in current upstream.

Sheng Yang (4):
  KVM: Move struct kvm_io_device to kvm_host.h
  KVM: Add kvm_io_ext_data to IO handler
  KVM: Emulate MSI-X table in kernel
  KVM: Add documents for MSI-X MMIO API

 Documentation/kvm/api.txt       |   58 ++++++++
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/Makefile           |    2 +-
 arch/x86/kvm/i8254.c            |    6 +-
 arch/x86/kvm/i8259.c            |    3 +-
 arch/x86/kvm/lapic.c            |    3 +-
 arch/x86/kvm/mmu.c              |    2 +
 arch/x86/kvm/x86.c              |   51 +++++--
 include/linux/kvm.h             |   28 ++++
 include/linux/kvm_host.h        |   67 +++++++++-
 virt/kvm/assigned-dev.c         |   44 ++++++
 virt/kvm/coalesced_mmio.c       |    3 +-
 virt/kvm/eventfd.c              |    2 +-
 virt/kvm/ioapic.c               |    2 +-
 virt/kvm/iodev.h                |   31 +----
 virt/kvm/kvm_main.c             |   40 +++++-
 virt/kvm/msix_mmio.c            |  296 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h            |   25 ++++
 18 files changed, 612 insertions(+), 52 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h
  2011-02-24  9:51 [PATCH 0/4 v10] MSI-X MMIO support for KVM Sheng Yang
@ 2011-02-24  9:51 ` Sheng Yang
  2011-02-24  9:51 ` [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler Sheng Yang
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 21+ messages in thread
From: Sheng Yang @ 2011-02-24  9:51 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Alex Williamson, Michael S. Tsirkin, kvm, Sheng Yang

This allows it to be used by other structs in kvm_host.h.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 include/linux/kvm_host.h |   23 +++++++++++++++++++++++
 virt/kvm/iodev.h         |   25 +------------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b5021db..7d313e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+	int (*read)(struct kvm_io_device *this,
+		    gpa_t addr,
+		    int len,
+		    void *val);
+	int (*write)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     const void *val);
+	void (*destructor)(struct kvm_io_device *this);
+};
+
+struct kvm_io_device {
+	const struct kvm_io_device_ops *ops;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 12fd3ca..d1f5651 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,32 +17,9 @@
 #define __KVM_IODEV_H__
 
 #include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
 #include <asm/errno.h>
 
-struct kvm_io_device;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-	int (*read)(struct kvm_io_device *this,
-		    gpa_t addr,
-		    int len,
-		    void *val);
-	int (*write)(struct kvm_io_device *this,
-		     gpa_t addr,
-		     int len,
-		     const void *val);
-	void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-	const struct kvm_io_device_ops *ops;
-};
-
 static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 				     const struct kvm_io_device_ops *ops)
 {
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler
  2011-02-24  9:51 [PATCH 0/4 v10] MSI-X MMIO support for KVM Sheng Yang
  2011-02-24  9:51 ` [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
@ 2011-02-24  9:51 ` Sheng Yang
  2011-02-24 10:22   ` Michael S. Tsirkin
  2011-02-24  9:51 ` [PATCH 3/4] KVM: Emulate MSI-X table in kernel Sheng Yang
  2011-02-24  9:51 ` [PATCH 4/4] KVM: Add documents for MSI-X MMIO API Sheng Yang
  3 siblings, 1 reply; 21+ messages in thread
From: Sheng Yang @ 2011-02-24  9:51 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Alex Williamson, Michael S. Tsirkin, kvm, Sheng Yang

Add a new parameter to the IO write handler, so that information can be
passed from the IO handler back to the caller.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/kvm/i8254.c      |    6 ++++--
 arch/x86/kvm/i8259.c      |    3 ++-
 arch/x86/kvm/lapic.c      |    3 ++-
 arch/x86/kvm/x86.c        |   13 ++++++++-----
 include/linux/kvm_host.h  |   12 ++++++++++--
 virt/kvm/coalesced_mmio.c |    3 ++-
 virt/kvm/eventfd.c        |    2 +-
 virt/kvm/ioapic.c         |    2 +-
 virt/kvm/iodev.h          |    6 ++++--
 virt/kvm/kvm_main.c       |    4 ++--
 10 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index efad723..bd8f0c5 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -439,7 +439,8 @@ static inline int pit_in_range(gpa_t addr)
 }
 
 static int pit_ioport_write(struct kvm_io_device *this,
-			    gpa_t addr, int len, const void *data)
+			    gpa_t addr, int len, const void *data,
+			    struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_pit *pit = dev_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
@@ -585,7 +586,8 @@ static int pit_ioport_read(struct kvm_io_device *this,
 }
 
 static int speaker_ioport_write(struct kvm_io_device *this,
-				gpa_t addr, int len, const void *data)
+				gpa_t addr, int len, const void *data,
+				struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_pit *pit = speaker_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 3cece05..96b1070 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -480,7 +480,8 @@ static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
 }
 
 static int picdev_write(struct kvm_io_device *this,
-			 gpa_t addr, int len, const void *val)
+			 gpa_t addr, int len, const void *val,
+			 struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_pic *s = to_pic(this);
 	unsigned char data = *(unsigned char *)val;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 93cf9d0..f413e9c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -836,7 +836,8 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 }
 
 static int apic_mmio_write(struct kvm_io_device *this,
-			    gpa_t address, int len, const void *data)
+			    gpa_t address, int len, const void *data,
+			    struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_lapic *apic = to_lapic(this);
 	unsigned int offset = address - apic->base_address;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fa708c9..21b84e2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3571,13 +3571,14 @@ static void kvm_init_msr_list(void)
 }
 
 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
-			   const void *v)
+			   const void *v, struct kvm_io_ext_data *ext_data)
 {
 	if (vcpu->arch.apic &&
-	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
+	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v, ext_data))
 		return 0;
 
-	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
+	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS,
+				addr, len, v, ext_data);
 }
 
 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
@@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 					   struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
+	struct kvm_io_ext_data ext_data;
 
 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3825,7 +3827,7 @@ mmio:
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
 		return X86EMUL_CONTINUE;
 
 	vcpu->mmio_needed = 1;
@@ -3940,6 +3942,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 {
 	/* TODO: String I/O for in kernel device */
 	int r;
+	struct kvm_io_ext_data ext_data;
 
 	if (vcpu->arch.pio.in)
 		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
@@ -3947,7 +3950,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 	else
 		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
 				     vcpu->arch.pio.port, vcpu->arch.pio.size,
-				     pd);
+				     pd, &ext_data);
 	return r;
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7d313e0..6bb211d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -68,8 +68,15 @@ enum kvm_bus {
 	KVM_NR_BUSES
 };
 
+struct kvm_io_ext_data {
+	int type;
+	union {
+		char padding[256];
+	};
+};
+
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-		     int len, const void *val);
+		     int len, const void *val, struct kvm_io_ext_data *data);
 int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
 		    void *val);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -113,7 +120,8 @@ struct kvm_io_device_ops {
 	int (*write)(struct kvm_io_device *this,
 		     gpa_t addr,
 		     int len,
-		     const void *val);
+		     const void *val,
+		     struct kvm_io_ext_data *data);
 	void (*destructor)(struct kvm_io_device *this);
 };
 
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index fc84875..37b254c 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -59,7 +59,8 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
 }
 
 static int coalesced_mmio_write(struct kvm_io_device *this,
-				gpa_t addr, int len, const void *val)
+				gpa_t addr, int len, const void *val,
+				struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
 	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 2ca4535..8edd757 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -483,7 +483,7 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 /* MMIO/PIO writes trigger an event if the addr/val match */
 static int
 ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
-		const void *val)
+		const void *val, struct kvm_io_ext_data *ext_data)
 {
 	struct _ioeventfd *p = to_ioeventfd(this);
 
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 0b9df83..6a027ef 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -321,7 +321,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
 }
 
 static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
-			     const void *val)
+			     const void *val, struct kvm_io_ext_data *ext_data)
 {
 	struct kvm_ioapic *ioapic = to_ioapic(this);
 	u32 data;
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index d1f5651..340ab79 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -33,9 +33,11 @@ static inline int kvm_iodevice_read(struct kvm_io_device *dev,
 }
 
 static inline int kvm_iodevice_write(struct kvm_io_device *dev,
-				     gpa_t addr, int l, const void *v)
+				     gpa_t addr, int l, const void *v,
+				     struct kvm_io_ext_data *data)
 {
-	return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
+	return dev->ops->write ?
+		dev->ops->write(dev, addr, l, v, data) : -EOPNOTSUPP;
 }
 
 static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b1b6cbb..a61f90e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2221,14 +2221,14 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 
 /* kvm_io_bus_write - called under kvm->slots_lock */
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-		     int len, const void *val)
+		     int len, const void *val, struct kvm_io_ext_data *ext_data)
 {
 	int i;
 	struct kvm_io_bus *bus;
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
 	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
 			return 0;
 	return -EOPNOTSUPP;
 }
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-02-24  9:51 [PATCH 0/4 v10] MSI-X MMIO support for KVM Sheng Yang
  2011-02-24  9:51 ` [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
  2011-02-24  9:51 ` [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler Sheng Yang
@ 2011-02-24  9:51 ` Sheng Yang
  2011-02-24 10:45   ` Michael S. Tsirkin
  2011-02-24  9:51 ` [PATCH 4/4] KVM: Add documents for MSI-X MMIO API Sheng Yang
  3 siblings, 1 reply; 21+ messages in thread
From: Sheng Yang @ 2011-02-24  9:51 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Alex Williamson, Michael S. Tsirkin, kvm, Sheng Yang

With this, we can support mask bit operations for assigned devices.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/Makefile           |    2 +-
 arch/x86/kvm/mmu.c              |    2 +
 arch/x86/kvm/x86.c              |   40 ++++-
 include/linux/kvm.h             |   28 ++++
 include/linux/kvm_host.h        |   34 +++++
 virt/kvm/assigned-dev.c         |   44 ++++++
 virt/kvm/kvm_main.c             |   38 +++++-
 virt/kvm/msix_mmio.c            |  296 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h            |   25 ++++
 10 files changed, 497 insertions(+), 13 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa75f21..4a390a4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -635,6 +635,7 @@ enum emulation_result {
 	EMULATE_DONE,       /* no further processing */
 	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
 	EMULATE_FAIL,         /* can't emulate this instruction */
+	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
 };
 
 #define EMULTYPE_NO_DECODE	    (1 << 0)
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
-				assigned-dev.o)
+				assigned-dev.o msix_mmio.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cafbb4..912dca4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 	case EMULATE_DO_MMIO:
 		++vcpu->stat.mmio_exits;
 		/* fall through */
+	case EMULATE_USERSPACE_EXIT:
+		/* fall through */
 	case EMULATE_FAIL:
 		return 0;
 	default:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 21b84e2..87308eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
+	case KVM_CAP_MSIX_MMIO:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 {
 	gpa_t                 gpa;
 	struct kvm_io_ext_data ext_data;
+	int r;
 
 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3824,18 +3826,32 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 
 mmio:
 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
+	if (!r)
 		return X86EMUL_CONTINUE;
 
-	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
-	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
-	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
-	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
-	memcpy(vcpu->run->mmio.data, val, bytes);
+	if (r == -ENOTSYNC) {
+		vcpu->userspace_exit_needed = 1;
+		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
+		vcpu->run->msix_routing.dev_id =
+			ext_data.msix_routing.dev_id;
+		vcpu->run->msix_routing.type =
+			ext_data.msix_routing.type;
+		vcpu->run->msix_routing.entry_idx =
+			ext_data.msix_routing.entry_idx;
+		vcpu->run->msix_routing.flags =
+			ext_data.msix_routing.flags;
+	} else  {
+		vcpu->mmio_needed = 1;
+		vcpu->run->exit_reason = KVM_EXIT_MMIO;
+		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+		memcpy(vcpu->run->mmio.data, val, bytes);
+	}
 
 	return X86EMUL_CONTINUE;
 }
@@ -4469,6 +4485,8 @@ done:
 		r = EMULATE_DO_MMIO;
 	} else if (r == EMULATION_RESTART)
 		goto restart;
+	else if (vcpu->userspace_exit_needed)
+		r = EMULATE_USERSPACE_EXIT;
 	else
 		r = EMULATE_DONE;
 
@@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
-	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
+	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
+			vcpu->userspace_exit_needed) {
 		if (vcpu->mmio_needed) {
 			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
 			vcpu->mmio_read_completed = 1;
 			vcpu->mmio_needed = 0;
 		}
+		if (vcpu->userspace_exit_needed) {
+			vcpu->userspace_exit_needed = 0;
+			r = 0;
+			goto out;
+		}
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..4393e4e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -264,6 +265,13 @@ struct kvm_run {
 		struct {
 			__u64 gprs[32];
 		} osi;
+		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
+		struct {
+			__u32 dev_id;
+			__u16 type;
+			__u16 entry_idx;
+			__u64 flags;
+		} msix_routing;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +681,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;
+	__u16 max_entries_nr;
+	__u64 base_addr;
+	__u64 base_va;
+	__u64 flags;
+	__u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6bb211d..6aaf85e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -68,9 +68,16 @@ enum kvm_bus {
 	KVM_NR_BUSES
 };
 
+#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
 struct kvm_io_ext_data {
 	int type;
 	union {
+		struct {
+			u32 dev_id;
+			u16 type;
+			u16 entry_idx;
+			u64 flags;
+		} msix_routing;
 		char padding[256];
 	};
 };
@@ -168,6 +175,8 @@ struct kvm_vcpu {
 	} async_pf;
 #endif
 
+	int userspace_exit_needed;
+
 	struct kvm_vcpu_arch arch;
 };
 
@@ -241,6 +250,27 @@ struct kvm_memslots {
 					KVM_PRIVATE_MEM_SLOTS];
 };
 
+#define KVM_MSIX_MMIO_MAX    32
+
+struct kvm_msix_mmio {
+	u32 dev_id;
+	u16 type;
+	u16 max_entries_nr;
+	u64 flags;
+	gpa_t table_base_addr;
+	hva_t table_base_va;
+	gpa_t pba_base_addr;
+	hva_t pba_base_va;
+};
+
+struct kvm_msix_mmio_dev {
+	struct kvm *kvm;
+	struct kvm_io_device table_dev;
+	int mmio_nr;
+	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
+	struct mutex lock;
+};
+
 struct kvm {
 	spinlock_t mmu_lock;
 	raw_spinlock_t requests_lock;
@@ -289,6 +319,7 @@ struct kvm {
 	long mmu_notifier_count;
 #endif
 	long tlbs_dirty;
+	struct kvm_msix_mmio_dev msix_mmio_dev;
 };
 
 /* The guest did something we don't support. */
@@ -561,6 +592,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+			int assigned_dev_id, int entry, bool mask);
+
 /* For vcpu->arch.iommu_flags */
 #define KVM_IOMMU_CACHE_COHERENCY	0x1
 
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ae72ae6..d1598a6 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include "irq.h"
+#include "msix_mmio.h"
 
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
@@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
 }
 
+static void assigned_device_free_msix_mmio(struct kvm *kvm,
+				struct kvm_assigned_dev_kernel *adev)
+{
+	struct kvm_msix_mmio mmio;
+
+	mmio.dev_id = adev->assigned_dev_id;
+	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
+		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+	kvm_free_msix_mmio(kvm, &mmio);
+}
+
 static void kvm_free_assigned_device(struct kvm *kvm,
 				     struct kvm_assigned_dev_kernel
 				     *assigned_dev)
 {
 	kvm_free_assigned_irq(kvm, assigned_dev);
 
+	assigned_device_free_msix_mmio(kvm, assigned_dev);
+
 	__pci_reset_function(assigned_dev->dev);
 	pci_restore_state(assigned_dev->dev);
 
@@ -785,3 +799,33 @@ out:
 	return r;
 }
 
+/* The caller should hold kvm->lock */
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+				int assigned_dev_id, int entry, bool mask)
+{
+	int r = -EFAULT;
+	struct kvm_assigned_dev_kernel *adev;
+	int i;
+
+	if (!irqchip_in_kernel(kvm))
+		return r;
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev_id);
+	if (!adev)
+		goto out;
+
+	/* For non-MSIX enabled devices, entries_nr == 0 */
+	for (i = 0; i < adev->entries_nr; i++)
+		if (adev->host_msix_entries[i].entry == entry) {
+			if (mask)
+				disable_irq_nosync(
+					adev->host_msix_entries[i].vector);
+			else
+				enable_irq(adev->host_msix_entries[i].vector);
+			r = 0;
+			break;
+		}
+out:
+	return r;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a61f90e..f211e49 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -56,6 +56,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "msix_mmio.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	struct mm_struct *mm = kvm->mm;
 
 	kvm_arch_sync_events(kvm);
+	kvm_unregister_msix_mmio_dev(kvm);
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
@@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
 		mutex_unlock(&kvm->lock);
 		break;
 #endif
+	case KVM_REGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
+		break;
+	}
+	case KVM_UNREGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
@@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
 		return r;
 	}
 #endif
+	r = kvm_register_msix_mmio_dev(kvm);
+	if (r < 0) {
+		kvm_put_kvm(kvm);
+		return r;
+	}
+
 	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 	if (r < 0)
 		kvm_put_kvm(kvm);
@@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val, struct kvm_io_ext_data *ext_data)
 {
-	int i;
+	int i, r = -EOPNOTSUPP;
 	struct kvm_io_bus *bus;
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
+	for (i = 0; i < bus->dev_count; i++) {
+		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
+		if (r == -ENOTSYNC)
+			break;
+		else if (!r)
 			return 0;
-	return -EOPNOTSUPP;
+	}
+	return r;
 }
 
 /* kvm_io_bus_read - called under kvm->slots_lock */
diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
new file mode 100644
index 0000000..083b15b
--- /dev/null
+++ b/virt/kvm/msix_mmio.c
@@ -0,0 +1,296 @@
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+
+#include "msix_mmio.h"
+#include "iodev.h"
+
+static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
+				int entry, u32 flag)
+{
+	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		return kvm_assigned_device_update_msix_mask_bit(kvm,
+				mmio->dev_id, entry, flag);
+	return -EFAULT;
+}
+
+/* Caller must hold dev->lock */
+static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
+				gpa_t addr, int len)
+{
+	gpa_t start, end;
+	int i, r = -EINVAL;
+
+	for (i = 0; i < dev->mmio_nr; i++) {
+		start = dev->mmio[i].table_base_addr;
+		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
+			dev->mmio[i].max_entries_nr;
+		if (addr >= start && addr + len <= end) {
+			r = i;
+			break;
+		}
+	}
+
+	return r;
+}
+
+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+				void *val)
+{
+	/*TODO: Add big endian support */
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, ret = 0, entry, offset, r;
+
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if ((addr & 0x3) || (len != 4 && len != 8))
+		goto out;
+
+	offset = addr % PCI_MSIX_ENTRY_SIZE;
+	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
+		goto out;
+
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
+	if (r)
+		goto out;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return ret;
+}
+
+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
+				int len, const void *val,
+				struct kvm_io_ext_data *ext_data)
+{
+	/*TODO: Add big endian support */
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, entry, offset, ret = 0, r = 0;
+	gpa_t entry_base;
+	u32 old_ctrl, new_ctrl;
+	unsigned long __user *ctrl_pos;
+
+	mutex_lock(&mmio_dev->kvm->lock);
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if (!(len == 4 || len == 8) || addr & (len - 1))
+		goto out;
+
+	offset = addr % PCI_MSIX_ENTRY_SIZE;
+
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
+	ctrl_pos = (unsigned long __user *)(entry_base +
+			PCI_MSIX_ENTRY_VECTOR_CTRL);
+
+	if (get_user(old_ctrl, ctrl_pos))
+		goto out;
+
+	/* Don't allow writing to other fields when entry is unmasked */
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
+		goto out;
+
+	if (copy_to_user((void __user *)(entry_base + offset), val, len))
+		goto out;
+
+	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
+	ext_data->msix_routing.dev_id = mmio->dev_id;
+	ext_data->msix_routing.type = mmio->type;
+	ext_data->msix_routing.entry_idx = entry;
+	ext_data->msix_routing.flags = 0;
+
+	if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) {
+		ret = -ENOTSYNC;
+		goto out;
+	}
+
+	if (get_user(new_ctrl, ctrl_pos))
+		goto out;
+
+	if (old_ctrl == new_ctrl) {
+		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
+			ret = -ENOTSYNC;
+		goto out;
+	}
+	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
+			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
+				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
+	if (r)
+		ret = -ENOTSYNC;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	mutex_unlock(&mmio_dev->kvm->lock);
+	return ret;
+}
+
+static const struct kvm_io_device_ops msix_mmio_table_ops = {
+	.read     = msix_table_mmio_read,
+	.write    = msix_table_mmio_write,
+};
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
+	mutex_init(&kvm->msix_mmio_dev.lock);
+	kvm->msix_mmio_dev.kvm = kvm;
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	struct kvm_msix_mmio *mmio = NULL;
+	int r = 0, i;
+
+	mutex_lock(&mmio_dev->lock);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
+		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
+			mmio = &mmio_dev->mmio[i];
+			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
+				r = -EINVAL;
+				goto out;
+			}
+			break;
+		}
+	}
+	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	/* All reserved currently */
+	if (mmio_user->flags) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	/* Check alignment and accessibility */
+	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
+	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
+			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
+		r = -EINVAL;
+		goto out;
+	}
+	if (!mmio) {
+		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
+			r = -ENOSPC;
+			goto out;
+		}
+		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
+		mmio_dev->mmio_nr++;
+	}
+
+	mmio->max_entries_nr = mmio_user->max_entries_nr;
+	mmio->dev_id = mmio_user->dev_id;
+	mmio->flags = mmio_user->flags;
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+		mmio->table_base_addr = mmio_user->base_addr;
+		mmio->table_base_va = mmio_user->base_va;
+	}
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	int r = -EINVAL, i, j;
+
+	if (!mmio)
+		return 0;
+
+	mutex_lock(&mmio_dev->lock);
+	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
+		    mmio_dev->mmio[i].type == mmio->type) {
+			r = 0;
+			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
+				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
+			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
+			mmio_dev->mmio_nr--;
+			break;
+		}
+	}
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio mmio;
+
+	mmio.dev_id = mmio_user->dev_id;
+	mmio.type = mmio_user->type;
+
+	return kvm_free_msix_mmio(kvm, &mmio);
+}
+
diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
new file mode 100644
index 0000000..01b6587
--- /dev/null
+++ b/virt/kvm/msix_mmio.h
@@ -0,0 +1,25 @@
+#ifndef __KVM_MSIX_MMIO_H__
+#define __KVM_MSIX_MMIO_H__
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/pci.h>
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm);
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user);
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user);
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user);
+
+#endif
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 4/4] KVM: Add documents for MSI-X MMIO API
  2011-02-24  9:51 [PATCH 0/4 v10] MSI-X MMIO support for KVM Sheng Yang
                   ` (2 preceding siblings ...)
  2011-02-24  9:51 ` [PATCH 3/4] KVM: Emulate MSI-X table in kernel Sheng Yang
@ 2011-02-24  9:51 ` Sheng Yang
  3 siblings, 0 replies; 21+ messages in thread
From: Sheng Yang @ 2011-02-24  9:51 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Alex Williamson, Michael S. Tsirkin, kvm, Sheng Yang


Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 Documentation/kvm/api.txt |   58 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 58 insertions(+), 0 deletions(-)

diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt
index e1a9297..dd10c3b 100644
--- a/Documentation/kvm/api.txt
+++ b/Documentation/kvm/api.txt
@@ -1263,6 +1263,53 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+4.54 KVM_REGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API registers an MSI-X MMIO address of a guest device. After that, all
+MMIO operations on it are handled by the kernel. When necessary (e.g. the MSI
+data/address changed), KVM exits to userspace using
+KVM_EXIT_MSIX_ROUTING_UPDATE to indicate the MMIO modification and require
+userspace to update the IRQ routing table.
+
+NOTICE: Writing to the MSI-X MMIO page after it has been registered with this
+API may be dangerous for a userspace program. Writing while the VM is running
+may cause synchronization issues that keep the assigned device from working
+properly. Writing is allowed when the VM is not running and can be used as a
+save/restore mechanism.
+
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;		/* Device type and MMIO address type */
+	__u16 max_entries_nr;	/* Maximum entries supported */
+	__u64 base_addr;	/* Guest physical address of MMIO */
+	__u64 base_va;		/* Host virtual address of MMIO mapping */
+	__u64 flags;		/* Reserved for now */
+	__u64 reserved[4];
+};
+
+Current device type can be:
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+Current MMIO type can be:
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+4.55 KVM_UNREGISTER_MSIX_MMIO
+
+Capability: KVM_CAP_MSIX_MMIO
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msix_mmio_user (in)
+Returns: 0 on success, -1 on error
+
+This API unregisters the specific MSI-X MMIO region identified by the dev_id
+and type fields of struct kvm_msix_mmio_user.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
@@ -1445,6 +1492,17 @@ Userspace can now handle the hypercall and when it's done modify the gprs as
 necessary. Upon guest entry all guest GPRs will then be replaced by the values
 in this struct.
 
+		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
+		struct {
+			__u32 dev_id;
+			__u16 type;
+			__u16 entry_idx;
+			__u64 flags;
+		} msix_routing;
+
+KVM_EXIT_MSIX_ROUTING_UPDATE indicates that one MSI-X entry has been modified
+and userspace needs to update the corresponding routing table.
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler
  2011-02-24  9:51 ` [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler Sheng Yang
@ 2011-02-24 10:22   ` Michael S. Tsirkin
  2011-02-25  3:23     ` Sheng Yang
  0 siblings, 1 reply; 21+ messages in thread
From: Michael S. Tsirkin @ 2011-02-24 10:22 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Thu, Feb 24, 2011 at 05:51:03PM +0800, Sheng Yang wrote:
> Add a new parameter to IO writing handler, so that we can transfer information
> from IO handler to caller.
> 
> Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> ---
>  arch/x86/kvm/i8254.c      |    6 ++++--
>  arch/x86/kvm/i8259.c      |    3 ++-
>  arch/x86/kvm/lapic.c      |    3 ++-
>  arch/x86/kvm/x86.c        |   13 ++++++++-----
>  include/linux/kvm_host.h  |   12 ++++++++++--
>  virt/kvm/coalesced_mmio.c |    3 ++-
>  virt/kvm/eventfd.c        |    2 +-
>  virt/kvm/ioapic.c         |    2 +-
>  virt/kvm/iodev.h          |    6 ++++--
>  virt/kvm/kvm_main.c       |    4 ++--
>  10 files changed, 36 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
> index efad723..bd8f0c5 100644
> --- a/arch/x86/kvm/i8254.c
> +++ b/arch/x86/kvm/i8254.c
> @@ -439,7 +439,8 @@ static inline int pit_in_range(gpa_t addr)
>  }
>  
>  static int pit_ioport_write(struct kvm_io_device *this,
> -			    gpa_t addr, int len, const void *data)
> +			    gpa_t addr, int len, const void *data,
> +			    struct kvm_io_ext_data *ext_data)
>  {
>  	struct kvm_pit *pit = dev_to_pit(this);
>  	struct kvm_kpit_state *pit_state = &pit->pit_state;
> @@ -585,7 +586,8 @@ static int pit_ioport_read(struct kvm_io_device *this,
>  }
>  
>  static int speaker_ioport_write(struct kvm_io_device *this,
> -				gpa_t addr, int len, const void *data)
> +				gpa_t addr, int len, const void *data,
> +				struct kvm_io_ext_data *ext_data)
>  {
>  	struct kvm_pit *pit = speaker_to_pit(this);
>  	struct kvm_kpit_state *pit_state = &pit->pit_state;
> diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
> index 3cece05..96b1070 100644
> --- a/arch/x86/kvm/i8259.c
> +++ b/arch/x86/kvm/i8259.c
> @@ -480,7 +480,8 @@ static inline struct kvm_pic *to_pic(struct kvm_io_device *dev)
>  }
>  
>  static int picdev_write(struct kvm_io_device *this,
> -			 gpa_t addr, int len, const void *val)
> +			 gpa_t addr, int len, const void *val,
> +			 struct kvm_io_ext_data *ext_data)
>  {
>  	struct kvm_pic *s = to_pic(this);
>  	unsigned char data = *(unsigned char *)val;
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 93cf9d0..f413e9c 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -836,7 +836,8 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
>  }
>  
>  static int apic_mmio_write(struct kvm_io_device *this,
> -			    gpa_t address, int len, const void *data)
> +			    gpa_t address, int len, const void *data,
> +			    struct kvm_io_ext_data *ext_data)
>  {
>  	struct kvm_lapic *apic = to_lapic(this);
>  	unsigned int offset = address - apic->base_address;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index fa708c9..21b84e2 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3571,13 +3571,14 @@ static void kvm_init_msr_list(void)
>  }
>  
>  static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
> -			   const void *v)
> +			   const void *v, struct kvm_io_ext_data *ext_data)
>  {
>  	if (vcpu->arch.apic &&
> -	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
> +	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v, ext_data))
>  		return 0;
>  
> -	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
> +	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS,
> +				addr, len, v, ext_data);
>  }
>  
>  static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
> @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
>  					   struct kvm_vcpu *vcpu)
>  {
>  	gpa_t                 gpa;
> +	struct kvm_io_ext_data ext_data;
>  
>  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
>  
> @@ -3825,7 +3827,7 @@ mmio:
>  	/*
>  	 * Is this MMIO handled locally?
>  	 */
> -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> +	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
>  		return X86EMUL_CONTINUE;
>  
>  	vcpu->mmio_needed = 1;
> @@ -3940,6 +3942,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
>  {
>  	/* TODO: String I/O for in kernel device */
>  	int r;
> +	struct kvm_io_ext_data ext_data;
>  
>  	if (vcpu->arch.pio.in)
>  		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
> @@ -3947,7 +3950,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
>  	else
>  		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
>  				     vcpu->arch.pio.port, vcpu->arch.pio.size,
> -				     pd);
> +				     pd, &ext_data);
>  	return r;
>  }
>  
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 7d313e0..6bb211d 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -68,8 +68,15 @@ enum kvm_bus {
>  	KVM_NR_BUSES
>  };
>  
> +struct kvm_io_ext_data {
> +	int type;

What values does this get? Please add documentation in comments.

> +	union {
> +		char padding[256];
> +	};

So the structure size is 260 bytes?
What's the point of the padding?

> +};
> +
>  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> -		     int len, const void *val);
> +		     int len, const void *val, struct kvm_io_ext_data *data);
>  int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
>  		    void *val);
>  int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> @@ -113,7 +120,8 @@ struct kvm_io_device_ops {
>  	int (*write)(struct kvm_io_device *this,
>  		     gpa_t addr,
>  		     int len,
> -		     const void *val);
> +		     const void *val,
> +		     struct kvm_io_ext_data *data);
>  	void (*destructor)(struct kvm_io_device *this);
>  };
>  
> diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
> index fc84875..37b254c 100644
> --- a/virt/kvm/coalesced_mmio.c
> +++ b/virt/kvm/coalesced_mmio.c
> @@ -59,7 +59,8 @@ static int coalesced_mmio_in_range(struct kvm_coalesced_mmio_dev *dev,
>  }
>  
>  static int coalesced_mmio_write(struct kvm_io_device *this,
> -				gpa_t addr, int len, const void *val)
> +				gpa_t addr, int len, const void *val,
> +				struct kvm_io_ext_data *ext_data)
>  {
>  	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
>  	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index 2ca4535..8edd757 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -483,7 +483,7 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
>  /* MMIO/PIO writes trigger an event if the addr/val match */
>  static int
>  ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
> -		const void *val)
> +		const void *val, struct kvm_io_ext_data *ext_data)
>  {
>  	struct _ioeventfd *p = to_ioeventfd(this);
>  
> diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> index 0b9df83..6a027ef 100644
> --- a/virt/kvm/ioapic.c
> +++ b/virt/kvm/ioapic.c
> @@ -321,7 +321,7 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
>  }
>  
>  static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
> -			     const void *val)
> +			     const void *val, struct kvm_io_ext_data *ext_data)
>  {
>  	struct kvm_ioapic *ioapic = to_ioapic(this);
>  	u32 data;
> diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
> index d1f5651..340ab79 100644
> --- a/virt/kvm/iodev.h
> +++ b/virt/kvm/iodev.h
> @@ -33,9 +33,11 @@ static inline int kvm_iodevice_read(struct kvm_io_device *dev,
>  }
>  
>  static inline int kvm_iodevice_write(struct kvm_io_device *dev,
> -				     gpa_t addr, int l, const void *v)
> +				     gpa_t addr, int l, const void *v,
> +				     struct kvm_io_ext_data *data)
>  {
> -	return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
> +	return dev->ops->write ?
> +		dev->ops->write(dev, addr, l, v, data) : -EOPNOTSUPP;
>  }
>  
>  static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index b1b6cbb..a61f90e 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2221,14 +2221,14 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
>  
>  /* kvm_io_bus_write - called under kvm->slots_lock */
>  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> -		     int len, const void *val)
> +		     int len, const void *val, struct kvm_io_ext_data *ext_data)
>  {
>  	int i;
>  	struct kvm_io_bus *bus;
>  
>  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
>  	for (i = 0; i < bus->dev_count; i++)
> -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> +		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
>  			return 0;
>  	return -EOPNOTSUPP;
>  }
> -- 
> 1.7.0.1

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-02-24  9:51 ` [PATCH 3/4] KVM: Emulate MSI-X table in kernel Sheng Yang
@ 2011-02-24 10:45   ` Michael S. Tsirkin
  2011-02-25  6:28     ` Sheng Yang
                       ` (2 more replies)
  0 siblings, 3 replies; 21+ messages in thread
From: Michael S. Tsirkin @ 2011-02-24 10:45 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Thu, Feb 24, 2011 at 05:51:04PM +0800, Sheng Yang wrote:
> Then we can support mask bit operation of assigned devices now.
> 
> Signed-off-by: Sheng Yang <sheng@linux.intel.com>

Doesn't look like all comments got addressed.
E.g. gpa_t entry_base is still there, but in reality
you said it's a host virtual address, so it
should be void __user *.
And ENOTSYNC meaning 'MSIX' is pretty hacky.

> ---
>  arch/x86/include/asm/kvm_host.h |    1 +
>  arch/x86/kvm/Makefile           |    2 +-
>  arch/x86/kvm/mmu.c              |    2 +
>  arch/x86/kvm/x86.c              |   40 ++++-
>  include/linux/kvm.h             |   28 ++++
>  include/linux/kvm_host.h        |   34 +++++
>  virt/kvm/assigned-dev.c         |   44 ++++++
>  virt/kvm/kvm_main.c             |   38 +++++-
>  virt/kvm/msix_mmio.c            |  296 +++++++++++++++++++++++++++++++++++++++
>  virt/kvm/msix_mmio.h            |   25 ++++
>  10 files changed, 497 insertions(+), 13 deletions(-)
>  create mode 100644 virt/kvm/msix_mmio.c
>  create mode 100644 virt/kvm/msix_mmio.h
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index aa75f21..4a390a4 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -635,6 +635,7 @@ enum emulation_result {
>  	EMULATE_DONE,       /* no further processing */
>  	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
>  	EMULATE_FAIL,         /* can't emulate this instruction */
> +	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
>  };
>  
>  #define EMULTYPE_NO_DECODE	    (1 << 0)
> diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> index f15501f..3a0d851 100644
> --- a/arch/x86/kvm/Makefile
> +++ b/arch/x86/kvm/Makefile
> @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
>  
>  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
>  				coalesced_mmio.o irq_comm.o eventfd.o \
> -				assigned-dev.o)
> +				assigned-dev.o msix_mmio.o)
>  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
>  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
>  
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9cafbb4..912dca4 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
>  	case EMULATE_DO_MMIO:
>  		++vcpu->stat.mmio_exits;
>  		/* fall through */
> +	case EMULATE_USERSPACE_EXIT:
> +		/* fall through */
>  	case EMULATE_FAIL:
>  		return 0;
>  	default:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 21b84e2..87308eb 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
>  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
>  	case KVM_CAP_XSAVE:
>  	case KVM_CAP_ASYNC_PF:
> +	case KVM_CAP_MSIX_MMIO:
>  		r = 1;
>  		break;
>  	case KVM_CAP_COALESCED_MMIO:
> @@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
>  {
>  	gpa_t                 gpa;
>  	struct kvm_io_ext_data ext_data;
> +	int r;
>  
>  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
>  
> @@ -3824,18 +3826,32 @@ static int emulator_write_emulated_onepage(unsigned long addr,
>  
>  mmio:
>  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> +	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
>  	/*
>  	 * Is this MMIO handled locally?
>  	 */
> -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> +	if (!r)
>  		return X86EMUL_CONTINUE;
>  
> -	vcpu->mmio_needed = 1;
> -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> -	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> -	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> -	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> -	memcpy(vcpu->run->mmio.data, val, bytes);
> +	if (r == -ENOTSYNC) {
> +		vcpu->userspace_exit_needed = 1;
> +		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
> +		vcpu->run->msix_routing.dev_id =
> +			ext_data.msix_routing.dev_id;
> +		vcpu->run->msix_routing.type =
> +			ext_data.msix_routing.type;
> +		vcpu->run->msix_routing.entry_idx =
> +			ext_data.msix_routing.entry_idx;
> +		vcpu->run->msix_routing.flags =
> +			ext_data.msix_routing.flags;
> +	} else  {
> +		vcpu->mmio_needed = 1;
> +		vcpu->run->exit_reason = KVM_EXIT_MMIO;
> +		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> +		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> +		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> +		memcpy(vcpu->run->mmio.data, val, bytes);
> +	}
>  
>  	return X86EMUL_CONTINUE;
>  }
> @@ -4469,6 +4485,8 @@ done:
>  		r = EMULATE_DO_MMIO;
>  	} else if (r == EMULATION_RESTART)
>  		goto restart;
> +	else if (vcpu->userspace_exit_needed)
> +		r = EMULATE_USERSPACE_EXIT;
>  	else
>  		r = EMULATE_DONE;
>  
> @@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
>  		}
>  	}
>  
> -	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
> +	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
> +			vcpu->userspace_exit_needed) {
>  		if (vcpu->mmio_needed) {
>  			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
>  			vcpu->mmio_read_completed = 1;
>  			vcpu->mmio_needed = 0;
>  		}
> +		if (vcpu->userspace_exit_needed) {
> +			vcpu->userspace_exit_needed = 0;
> +			r = 0;
> +			goto out;
> +		}
>  		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
>  		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
>  		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index ea2dc1a..4393e4e 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -161,6 +161,7 @@ struct kvm_pit_config {
>  #define KVM_EXIT_NMI              16
>  #define KVM_EXIT_INTERNAL_ERROR   17
>  #define KVM_EXIT_OSI              18
> +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
>  
>  /* For KVM_EXIT_INTERNAL_ERROR */
>  #define KVM_INTERNAL_ERROR_EMULATION 1
> @@ -264,6 +265,13 @@ struct kvm_run {
>  		struct {
>  			__u64 gprs[32];
>  		} osi;
> +		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
> +		struct {
> +			__u32 dev_id;
> +			__u16 type;
> +			__u16 entry_idx;
> +			__u64 flags;
> +		} msix_routing;
>  		/* Fix the size of the union. */
>  		char padding[256];
>  	};
> @@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
>  #define KVM_CAP_PPC_GET_PVINFO 57
>  #define KVM_CAP_PPC_IRQ_LEVEL 58
>  #define KVM_CAP_ASYNC_PF 59
> +#define KVM_CAP_MSIX_MMIO 60
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  
> @@ -672,6 +681,9 @@ struct kvm_clock_data {
>  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
>  #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
>  #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> +/* Available with KVM_CAP_MSIX_MMIO */
> +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
> +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
>  /* Available with KVM_CAP_PIT_STATE2 */
>  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
>  #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
> @@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
>  	__u16 padding[3];
>  };
>  
> +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> +
> +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> +
> +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> +struct kvm_msix_mmio_user {
> +	__u32 dev_id;
> +	__u16 type;
> +	__u16 max_entries_nr;
> +	__u64 base_addr;
> +	__u64 base_va;
> +	__u64 flags;
> +	__u64 reserved[4];
> +};
> +
>  #endif /* __LINUX_KVM_H */
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 6bb211d..6aaf85e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -68,9 +68,16 @@ enum kvm_bus {
>  	KVM_NR_BUSES
>  };
>  
> +#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
>  struct kvm_io_ext_data {
>  	int type;
>  	union {
> +		struct {
> +			u32 dev_id;
> +			u16 type;
> +			u16 entry_idx;
> +			u64 flags;
> +		} msix_routing;
>  		char padding[256];
>  	};
>  };
> @@ -168,6 +175,8 @@ struct kvm_vcpu {
>  	} async_pf;
>  #endif
>  
> +	int userspace_exit_needed;
> +
>  	struct kvm_vcpu_arch arch;
>  };
>  
> @@ -241,6 +250,27 @@ struct kvm_memslots {
>  					KVM_PRIVATE_MEM_SLOTS];
>  };
>  
> +#define KVM_MSIX_MMIO_MAX    32
> +
> +struct kvm_msix_mmio {
> +	u32 dev_id;
> +	u16 type;
> +	u16 max_entries_nr;
> +	u64 flags;
> +	gpa_t table_base_addr;
> +	hva_t table_base_va;
> +	gpa_t pba_base_addr;
> +	hva_t pba_base_va;
> +};
> +
> +struct kvm_msix_mmio_dev {
> +	struct kvm *kvm;
> +	struct kvm_io_device table_dev;
> +	int mmio_nr;
> +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> +	struct mutex lock;
> +};
> +
>  struct kvm {
>  	spinlock_t mmu_lock;
>  	raw_spinlock_t requests_lock;
> @@ -289,6 +319,7 @@ struct kvm {
>  	long mmu_notifier_count;
>  #endif
>  	long tlbs_dirty;
> +	struct kvm_msix_mmio_dev msix_mmio_dev;
>  };
>  
>  /* The guest did something we don't support. */
> @@ -561,6 +592,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
>  int kvm_request_irq_source_id(struct kvm *kvm);
>  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
>  
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +			int assigned_dev_id, int entry, bool mask);
> +
>  /* For vcpu->arch.iommu_flags */
>  #define KVM_IOMMU_CACHE_COHERENCY	0x1
>  
> diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> index ae72ae6..d1598a6 100644
> --- a/virt/kvm/assigned-dev.c
> +++ b/virt/kvm/assigned-dev.c
> @@ -18,6 +18,7 @@
>  #include <linux/interrupt.h>
>  #include <linux/slab.h>
>  #include "irq.h"
> +#include "msix_mmio.h"
>  
>  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
>  						      int assigned_dev_id)
> @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
>  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
>  }
>  
> +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> +				struct kvm_assigned_dev_kernel *adev)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = adev->assigned_dev_id;
> +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +	kvm_free_msix_mmio(kvm, &mmio);
> +}
> +
>  static void kvm_free_assigned_device(struct kvm *kvm,
>  				     struct kvm_assigned_dev_kernel
>  				     *assigned_dev)
>  {
>  	kvm_free_assigned_irq(kvm, assigned_dev);
>  
> +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> +
>  	__pci_reset_function(assigned_dev->dev);
>  	pci_restore_state(assigned_dev->dev);
>  
> @@ -785,3 +799,33 @@ out:
>  	return r;
>  }
>  
> +/* The caller should hold kvm->lock */
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +				int assigned_dev_id, int entry, bool mask)
> +{
> +	int r = -EFAULT;
> +	struct kvm_assigned_dev_kernel *adev;
> +	int i;
> +
> +	if (!irqchip_in_kernel(kvm))
> +		return r;
> +
> +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      assigned_dev_id);
> +	if (!adev)
> +		goto out;
> +
> +	/* For non-MSIX enabled devices, entries_nr == 0 */
> +	for (i = 0; i < adev->entries_nr; i++)
> +		if (adev->host_msix_entries[i].entry == entry) {
> +			if (mask)
> +				disable_irq_nosync(
> +					adev->host_msix_entries[i].vector);
> +			else
> +				enable_irq(adev->host_msix_entries[i].vector);
> +			r = 0;
> +			break;
> +		}
> +out:
> +	return r;
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index a61f90e..f211e49 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -56,6 +56,7 @@
>  
>  #include "coalesced_mmio.h"
>  #include "async_pf.h"
> +#include "msix_mmio.h"
>  
>  #define CREATE_TRACE_POINTS
>  #include <trace/events/kvm.h>
> @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
>  	struct mm_struct *mm = kvm->mm;
>  
>  	kvm_arch_sync_events(kvm);
> +	kvm_unregister_msix_mmio_dev(kvm);
>  	spin_lock(&kvm_lock);
>  	list_del(&kvm->vm_list);
>  	spin_unlock(&kvm_lock);
> @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
>  		mutex_unlock(&kvm->lock);
>  		break;
>  #endif
> +	case KVM_REGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
> +	case KVM_UNREGISTER_MSIX_MMIO: {
> +		struct kvm_msix_mmio_user mmio_user;
> +
> +		r = -EFAULT;
> +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> +			goto out;
> +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> +		break;
> +	}
>  	default:
>  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
>  		if (r == -ENOTTY)
> @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
>  		return r;
>  	}
>  #endif
> +	r = kvm_register_msix_mmio_dev(kvm);
> +	if (r < 0) {
> +		kvm_put_kvm(kvm);
> +		return r;
> +	}
> +
>  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
>  	if (r < 0)
>  		kvm_put_kvm(kvm);
> @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
>  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>  		     int len, const void *val, struct kvm_io_ext_data *ext_data)
>  {
> -	int i;
> +	int i, r = -EOPNOTSUPP;
>  	struct kvm_io_bus *bus;
>  
>  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> -	for (i = 0; i < bus->dev_count; i++)
> -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
> +	for (i = 0; i < bus->dev_count; i++) {
> +		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
> +		if (r == -ENOTSYNC)
> +			break;
> +		else if (!r)
>  			return 0;
> -	return -EOPNOTSUPP;
> +	}
> +	return r;
>  }
>  
>  /* kvm_io_bus_read - called under kvm->slots_lock */
> diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> new file mode 100644
> index 0000000..083b15b
> --- /dev/null
> +++ b/virt/kvm/msix_mmio.c
> @@ -0,0 +1,296 @@
> +/*
> + * MSI-X MMIO emulation
> + *
> + * Copyright (c) 2010 Intel Corporation
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Author:
> + *   Sheng Yang <sheng.yang@intel.com>
> + */
> +
> +#include <linux/kvm_host.h>
> +#include <linux/kvm.h>
> +
> +#include "msix_mmio.h"
> +#include "iodev.h"
> +
> +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
> +				int entry, u32 flag)
> +{
> +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> +				mmio->dev_id, entry, flag);
> +	return -EFAULT;
> +}
> +
> +/* Caller must hold dev->lock */
> +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> +				gpa_t addr, int len)
> +{
> +	gpa_t start, end;
> +	int i, r = -EINVAL;
> +
> +	for (i = 0; i < dev->mmio_nr; i++) {
> +		start = dev->mmio[i].table_base_addr;
> +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> +			dev->mmio[i].max_entries_nr;
> +		if (addr >= start && addr + len <= end) {
> +			r = i;
> +			break;
> +		}
> +	}
> +
> +	return r;
> +}
> +
> +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> +				void *val)
> +{
> +	/*TODO: Add big endian support */
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, ret = 0, entry, offset, r;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr & 0x3) || (len != 4 && len != 8))
> +		goto out;

Check addr & (len - 1) here, as in the write handler below?

> +
> +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> +		goto out;

Then this test won't be needed: an 8-byte access that is aligned to its
own length can never start at PCI_MSIX_ENTRY_VECTOR_CTRL.

> +
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> +	if (r)
> +		goto out;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> +				int len, const void *val,
> +				struct kvm_io_ext_data *ext_data)
> +{
> +	/*TODO: Add big endian support */
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, entry, offset, ret = 0, r = 0;
> +	gpa_t entry_base;
> +	u32 old_ctrl, new_ctrl;
> +	unsigned long __user *ctrl_pos;

long? That's 8 bytes on 64-bit.
You really want
__le32 old_ctrl, new_ctrl;
__le32 __user *ctrl_pos;

> +
> +	mutex_lock(&mmio_dev->kvm->lock);
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx < 0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if (!(len == 4 || len == 8) || addr & (len - 1))

Nice hack. This would be even a bit nicer:
	if ((len != 4 && len != 8) || addr & (len - 1))


> +		goto out;
> +
> +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> +
> +	mmio = &mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> +	ctrl_pos = (unsigned long __user *)(entry_base +
> +			PCI_MSIX_ENTRY_VECTOR_CTRL);

So this is the issue: if you cast a pointer to unsigned long *,
the compiler can assume that the address is aligned.
To prevent problems, please add a check that table_base_va
is aligned.

> +
> +	if (get_user(old_ctrl, ctrl_pos))
> +		goto out;
> +
> +	/* Don't allow writing to other fields when entry is unmasked */
> +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> +		goto out;
> +
> +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> +		goto out;
> +
> +	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
> +	ext_data->msix_routing.dev_id = mmio->dev_id;
> +	ext_data->msix_routing.type = mmio->type;
> +	ext_data->msix_routing.entry_idx = entry;
> +	ext_data->msix_routing.flags = 0;
> +
> +	if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) {
> +		ret = -ENOTSYNC;
> +		goto out;
> +	}
> +
> +	if (get_user(new_ctrl, ctrl_pos))
> +		goto out;
> +
> +	if (old_ctrl == new_ctrl) {
> +		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
> +			ret = -ENOTSYNC;
> +		goto out;
> +	}
> +	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
> +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
> +				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> +	if (r)
> +		ret = -ENOTSYNC;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	mutex_unlock(&mmio_dev->kvm->lock);
> +	return ret;
> +}
> +
> +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> +	.read     = msix_table_mmio_read,
> +	.write    = msix_table_mmio_write,
> +};
> +
> +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> +	mutex_init(&kvm->msix_mmio_dev.lock);
> +	kvm->msix_mmio_dev.kvm = kvm;
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> +{
> +	int ret;
> +
> +	mutex_lock(&kvm->slots_lock);
> +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> +				      &kvm->msix_mmio_dev.table_dev);
> +	mutex_unlock(&kvm->slots_lock);
> +	return ret;
> +}
> +
> +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				    struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	struct kvm_msix_mmio *mmio = NULL;
> +	int r = 0, i;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> +			mmio = &mmio_dev->mmio[i];
> +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> +				r = -EINVAL;
> +				goto out;
> +			}
> +			break;
> +		}
> +	}
> +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	/* All reserved currently */
> +	if (mmio_user->flags) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +
> +	/* Check alignment and accessibility */
> +	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
> +	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,

You should also check that base_va and friends fit in a pointer
on 32-bit architectures. Same for the other va values.

> +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> +		r = -EINVAL;
> +		goto out;
> +	}
> +	if (!mmio) {
> +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> +			r = -ENOSPC;
> +			goto out;
> +		}
> +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> +		mmio_dev->mmio_nr++;
> +	}
> +
> +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> +	mmio->dev_id = mmio_user->dev_id;
> +	mmio->flags = mmio_user->flags;
> +
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +		mmio->table_base_addr = mmio_user->base_addr;
> +		mmio->table_base_va = mmio_user->base_va;
> +	}
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}
> +
> +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> +	int r = -EINVAL, i, j;
> +
> +	if (!mmio)
> +		return 0;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> +		    mmio_dev->mmio[i].type == mmio->type) {
> +			r = 0;
> +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> +			mmio_dev->mmio_nr--;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}
> +
> +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> +				      struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio mmio;
> +
> +	mmio.dev_id = mmio_user->dev_id;
> +	mmio.type = mmio_user->type;
> +
> +	return kvm_free_msix_mmio(kvm, &mmio);
> +}
> +
> diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> new file mode 100644
> index 0000000..01b6587
> --- /dev/null
> +++ b/virt/kvm/msix_mmio.h
> @@ -0,0 +1,25 @@
> +#ifndef __KVM_MSIX_MMIO_H__
> +#define __KVM_MSIX_MMIO_H__
> +/*
> + * MSI-X MMIO emulation
> + *
> + * Copyright (c) 2010 Intel Corporation
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Author:
> + *   Sheng Yang <sheng.yang@intel.com>
> + */
> +
> +#include <linux/pci.h>
> +
> +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				    struct kvm_msix_mmio_user *mmio_user);
> +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> +				      struct kvm_msix_mmio_user *mmio_user);
> +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user);
> +
> +#endif
> -- 
> 1.7.0.1

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler
  2011-02-24 10:22   ` Michael S. Tsirkin
@ 2011-02-25  3:23     ` Sheng Yang
  2011-02-25  8:12       ` Michael S. Tsirkin
  0 siblings, 1 reply; 21+ messages in thread
From: Sheng Yang @ 2011-02-25  3:23 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Thursday 24 February 2011 18:22:19 Michael S. Tsirkin wrote:
> On Thu, Feb 24, 2011 at 05:51:03PM +0800, Sheng Yang wrote:
> > Add a new parameter to IO writing handler, so that we can transfer
> > information from IO handler to caller.
> > 
> > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > ---
> > 
> >  arch/x86/kvm/i8254.c      |    6 ++++--
> >  arch/x86/kvm/i8259.c      |    3 ++-
> >  arch/x86/kvm/lapic.c      |    3 ++-
> >  arch/x86/kvm/x86.c        |   13 ++++++++-----
> >  include/linux/kvm_host.h  |   12 ++++++++++--
> >  virt/kvm/coalesced_mmio.c |    3 ++-
> >  virt/kvm/eventfd.c        |    2 +-
> >  virt/kvm/ioapic.c         |    2 +-
> >  virt/kvm/iodev.h          |    6 ++++--
> >  virt/kvm/kvm_main.c       |    4 ++--
> >  10 files changed, 36 insertions(+), 18 deletions(-)
> > 
> > diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
> > index efad723..bd8f0c5 100644
> > --- a/arch/x86/kvm/i8254.c
> > +++ b/arch/x86/kvm/i8254.c
> > @@ -439,7 +439,8 @@ static inline int pit_in_range(gpa_t addr)
> > 
> >  }
> >  
> >  static int pit_ioport_write(struct kvm_io_device *this,
> > 
> > -			    gpa_t addr, int len, const void *data)
> > +			    gpa_t addr, int len, const void *data,
> > +			    struct kvm_io_ext_data *ext_data)
> > 
> >  {
> >  
> >  	struct kvm_pit *pit = dev_to_pit(this);
> >  	struct kvm_kpit_state *pit_state = &pit->pit_state;
> > 
> > @@ -585,7 +586,8 @@ static int pit_ioport_read(struct kvm_io_device
> > *this,
> > 
> >  }
> >  
> >  static int speaker_ioport_write(struct kvm_io_device *this,
> > 
> > -				gpa_t addr, int len, const void *data)
> > +				gpa_t addr, int len, const void *data,
> > +				struct kvm_io_ext_data *ext_data)
> > 
> >  {
> >  
> >  	struct kvm_pit *pit = speaker_to_pit(this);
> >  	struct kvm_kpit_state *pit_state = &pit->pit_state;
> > 
> > diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
> > index 3cece05..96b1070 100644
> > --- a/arch/x86/kvm/i8259.c
> > +++ b/arch/x86/kvm/i8259.c
> > @@ -480,7 +480,8 @@ static inline struct kvm_pic *to_pic(struct
> > kvm_io_device *dev)
> > 
> >  }
> >  
> >  static int picdev_write(struct kvm_io_device *this,
> > 
> > -			 gpa_t addr, int len, const void *val)
> > +			 gpa_t addr, int len, const void *val,
> > +			 struct kvm_io_ext_data *ext_data)
> > 
> >  {
> >  
> >  	struct kvm_pic *s = to_pic(this);
> >  	unsigned char data = *(unsigned char *)val;
> > 
> > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > index 93cf9d0..f413e9c 100644
> > --- a/arch/x86/kvm/lapic.c
> > +++ b/arch/x86/kvm/lapic.c
> > @@ -836,7 +836,8 @@ static int apic_reg_write(struct kvm_lapic *apic, u32
> > reg, u32 val)
> > 
> >  }
> >  
> >  static int apic_mmio_write(struct kvm_io_device *this,
> > 
> > -			    gpa_t address, int len, const void *data)
> > +			    gpa_t address, int len, const void *data,
> > +			    struct kvm_io_ext_data *ext_data)
> > 
> >  {
> >  
> >  	struct kvm_lapic *apic = to_lapic(this);
> >  	unsigned int offset = address - apic->base_address;
> > 
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index fa708c9..21b84e2 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -3571,13 +3571,14 @@ static void kvm_init_msr_list(void)
> > 
> >  }
> >  
> >  static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
> > 
> > -			   const void *v)
> > +			   const void *v, struct kvm_io_ext_data *ext_data)
> > 
> >  {
> >  
> >  	if (vcpu->arch.apic &&
> > 
> > -	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
> > +	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v, ext_data))
> > 
> >  		return 0;
> > 
> > -	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
> > +	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS,
> > +				addr, len, v, ext_data);
> > 
> >  }
> >  
> >  static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len,
> >  void *v)
> > 
> > @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned
> > long addr,
> > 
> >  					   struct kvm_vcpu *vcpu)
> >  
> >  {
> >  
> >  	gpa_t                 gpa;
> > 
> > +	struct kvm_io_ext_data ext_data;
> > 
> >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > 
> > @@ -3825,7 +3827,7 @@ mmio:
> >  	/*
> >  	
> >  	 * Is this MMIO handled locally?
> >  	 */
> > 
> > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > +	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> > 
> >  		return X86EMUL_CONTINUE;
> >  	
> >  	vcpu->mmio_needed = 1;
> > 
> > @@ -3940,6 +3942,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void
> > *pd)
> > 
> >  {
> >  
> >  	/* TODO: String I/O for in kernel device */
> >  	int r;
> > 
> > +	struct kvm_io_ext_data ext_data;
> > 
> >  	if (vcpu->arch.pio.in)
> >  	
> >  		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
> > 
> > @@ -3947,7 +3950,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void
> > *pd)
> > 
> >  	else
> >  	
> >  		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
> >  		
> >  				     vcpu->arch.pio.port, vcpu->arch.pio.size,
> > 
> > -				     pd);
> > +				     pd, &ext_data);
> > 
> >  	return r;
> >  
> >  }
> > 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 7d313e0..6bb211d 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -68,8 +68,15 @@ enum kvm_bus {
> > 
> >  	KVM_NR_BUSES
> >  
> >  };
> > 
> > +struct kvm_io_ext_data {
> > +	int type;
> 
> What values does this get? Please add documentation in comments.

See the next patch.
> 
> > +	union {
> > +		char padding[256];
> > +	};
> 
> So the structure size is 260 bytes?
> What's the point of the padding?

Reserved space. It is also used in the next patch.

--
regards
Yang, Sheng

> 
> > +};
> > +
> > 
> >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > 
> > -		     int len, const void *val);
> > +		     int len, const void *val, struct kvm_io_ext_data *data);
> > 
> >  int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> >  int len,
> >  
> >  		    void *val);
> >  
> >  int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> > 
> > @@ -113,7 +120,8 @@ struct kvm_io_device_ops {
> > 
> >  	int (*write)(struct kvm_io_device *this,
> >  	
> >  		     gpa_t addr,
> >  		     int len,
> > 
> > -		     const void *val);
> > +		     const void *val,
> > +		     struct kvm_io_ext_data *data);
> > 
> >  	void (*destructor)(struct kvm_io_device *this);
> >  
> >  };
> > 
> > diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
> > index fc84875..37b254c 100644
> > --- a/virt/kvm/coalesced_mmio.c
> > +++ b/virt/kvm/coalesced_mmio.c
> > @@ -59,7 +59,8 @@ static int coalesced_mmio_in_range(struct
> > kvm_coalesced_mmio_dev *dev,
> > 
> >  }
> >  
> >  static int coalesced_mmio_write(struct kvm_io_device *this,
> > 
> > -				gpa_t addr, int len, const void *val)
> > +				gpa_t addr, int len, const void *val,
> > +				struct kvm_io_ext_data *ext_data)
> > 
> >  {
> >  
> >  	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
> >  	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
> > 
> > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > index 2ca4535..8edd757 100644
> > --- a/virt/kvm/eventfd.c
> > +++ b/virt/kvm/eventfd.c
> > @@ -483,7 +483,7 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr,
> > int len, const void *val)
> > 
> >  /* MMIO/PIO writes trigger an event if the addr/val match */
> >  static int
> >  ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
> > 
> > -		const void *val)
> > +		const void *val, struct kvm_io_ext_data *ext_data)
> > 
> >  {
> >  
> >  	struct _ioeventfd *p = to_ioeventfd(this);
> > 
> > diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> > index 0b9df83..6a027ef 100644
> > --- a/virt/kvm/ioapic.c
> > +++ b/virt/kvm/ioapic.c
> > @@ -321,7 +321,7 @@ static int ioapic_mmio_read(struct kvm_io_device
> > *this, gpa_t addr, int len,
> > 
> >  }
> >  
> >  static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int
> >  len,
> > 
> > -			     const void *val)
> > +			     const void *val, struct kvm_io_ext_data *ext_data)
> > 
> >  {
> >  
> >  	struct kvm_ioapic *ioapic = to_ioapic(this);
> >  	u32 data;
> > 
> > diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
> > index d1f5651..340ab79 100644
> > --- a/virt/kvm/iodev.h
> > +++ b/virt/kvm/iodev.h
> > @@ -33,9 +33,11 @@ static inline int kvm_iodevice_read(struct
> > kvm_io_device *dev,
> > 
> >  }
> >  
> >  static inline int kvm_iodevice_write(struct kvm_io_device *dev,
> > 
> > -				     gpa_t addr, int l, const void *v)
> > +				     gpa_t addr, int l, const void *v,
> > +				     struct kvm_io_ext_data *data)
> > 
> >  {
> > 
> > -	return dev->ops->write ? dev->ops->write(dev, addr, l, v) :
> > -EOPNOTSUPP; +	return dev->ops->write ?
> > +		dev->ops->write(dev, addr, l, v, data) : -EOPNOTSUPP;
> > 
> >  }
> >  
> >  static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
> > 
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index b1b6cbb..a61f90e 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -2221,14 +2221,14 @@ static void kvm_io_bus_destroy(struct kvm_io_bus
> > *bus)
> > 
> >  /* kvm_io_bus_write - called under kvm->slots_lock */
> >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > 
> > -		     int len, const void *val)
> > +		     int len, const void *val, struct kvm_io_ext_data *ext_data)
> > 
> >  {
> >  
> >  	int i;
> >  	struct kvm_io_bus *bus;
> >  	
> >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> >  	for (i = 0; i < bus->dev_count; i++)
> > 
> > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> > +		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
> > 
> >  			return 0;
> >  	
> >  	return -EOPNOTSUPP;
> >  
> >  }

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-02-24 10:45   ` Michael S. Tsirkin
@ 2011-02-25  6:28     ` Sheng Yang
  2011-02-25  8:29       ` Michael S. Tsirkin
  2011-02-25  6:50     ` Sheng Yang
  2011-02-25  6:50     ` [PATCH 3/4 v10 UPDATED] " Sheng Yang
  2 siblings, 1 reply; 21+ messages in thread
From: Sheng Yang @ 2011-02-25  6:28 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Thursday 24 February 2011 18:45:08 Michael S. Tsirkin wrote:
> On Thu, Feb 24, 2011 at 05:51:04PM +0800, Sheng Yang wrote:
> > Then we can support mask bit operation of assigned devices now.
> > 
> > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> 
> Doesn't look like all comments got addressed.
> E.g. gpa_t entry_base is still there and in reality
> you said it's a host virtual address so
> should be void __user *;

Will update it.

> And ENOTSYNC meaning 'MSIX' is pretty hacky.

I'd like to discuss that later. We may need to rework the MMIO handling side as
a whole to make it more straightforward, but I don't want to bundle that with
this patch.
> 
> > ---
> > 
> >  arch/x86/include/asm/kvm_host.h |    1 +
> >  arch/x86/kvm/Makefile           |    2 +-
> >  arch/x86/kvm/mmu.c              |    2 +
> >  arch/x86/kvm/x86.c              |   40 ++++-
> >  include/linux/kvm.h             |   28 ++++
> >  include/linux/kvm_host.h        |   34 +++++
> >  virt/kvm/assigned-dev.c         |   44 ++++++
> >  virt/kvm/kvm_main.c             |   38 +++++-
> >  virt/kvm/msix_mmio.c            |  296
> >  +++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h           
> >  |   25 ++++
> >  10 files changed, 497 insertions(+), 13 deletions(-)
> >  create mode 100644 virt/kvm/msix_mmio.c
> >  create mode 100644 virt/kvm/msix_mmio.h
> > 
> > diff --git a/arch/x86/include/asm/kvm_host.h
> > b/arch/x86/include/asm/kvm_host.h index aa75f21..4a390a4 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -635,6 +635,7 @@ enum emulation_result {
> > 
> >  	EMULATE_DONE,       /* no further processing */
> >  	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
> >  	EMULATE_FAIL,         /* can't emulate this instruction */
> > 
> > +	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
> > 
> >  };
> >  
> >  #define EMULTYPE_NO_DECODE	    (1 << 0)
> > 
> > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > index f15501f..3a0d851 100644
> > --- a/arch/x86/kvm/Makefile
> > +++ b/arch/x86/kvm/Makefile
> > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > 
> >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> >  
> >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > 
> > -				assigned-dev.o)
> > +				assigned-dev.o msix_mmio.o)
> > 
> >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> >  async_pf.o)
> > 
> > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > index 9cafbb4..912dca4 100644
> > --- a/arch/x86/kvm/mmu.c
> > +++ b/arch/x86/kvm/mmu.c
> > @@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t
> > cr2, u32 error_code,
> > 
> >  	case EMULATE_DO_MMIO:
> >  		++vcpu->stat.mmio_exits;
> >  		/* fall through */
> > 
> > +	case EMULATE_USERSPACE_EXIT:
> > +		/* fall through */
> > 
> >  	case EMULATE_FAIL:
> >  		return 0;
> >  	
> >  	default:
> > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > index 21b84e2..87308eb 100644
> > --- a/arch/x86/kvm/x86.c
> > +++ b/arch/x86/kvm/x86.c
> > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > 
> >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> >  	case KVM_CAP_XSAVE:
> > 
> >  	case KVM_CAP_ASYNC_PF:
> > +	case KVM_CAP_MSIX_MMIO:
> >  		r = 1;
> >  		break;
> >  	
> >  	case KVM_CAP_COALESCED_MMIO:
> > @@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned
> > long addr,
> > 
> >  {
> >  
> >  	gpa_t                 gpa;
> >  	struct kvm_io_ext_data ext_data;
> > 
> > +	int r;
> > 
> >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > 
> > @@ -3824,18 +3826,32 @@ static int
> > emulator_write_emulated_onepage(unsigned long addr,
> > 
> >  mmio:
> >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > 
> > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
> > 
> >  	/*
> >  	
> >  	 * Is this MMIO handled locally?
> >  	 */
> > 
> > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> > +	if (!r)
> > 
> >  		return X86EMUL_CONTINUE;
> > 
> > -	vcpu->mmio_needed = 1;
> > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > -	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > -	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > -	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > -	memcpy(vcpu->run->mmio.data, val, bytes);
> > +	if (r == -ENOTSYNC) {
> > +		vcpu->userspace_exit_needed = 1;
> > +		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
> > +		vcpu->run->msix_routing.dev_id =
> > +			ext_data.msix_routing.dev_id;
> > +		vcpu->run->msix_routing.type =
> > +			ext_data.msix_routing.type;
> > +		vcpu->run->msix_routing.entry_idx =
> > +			ext_data.msix_routing.entry_idx;
> > +		vcpu->run->msix_routing.flags =
> > +			ext_data.msix_routing.flags;
> > +	} else  {
> > +		vcpu->mmio_needed = 1;
> > +		vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > +		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > +		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > +		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > +		memcpy(vcpu->run->mmio.data, val, bytes);
> > +	}
> > 
> >  	return X86EMUL_CONTINUE;
> >  
> >  }
> > 
> > @@ -4469,6 +4485,8 @@ done:
> >  		r = EMULATE_DO_MMIO;
> >  	
> >  	} else if (r == EMULATION_RESTART)
> >  	
> >  		goto restart;
> > 
> > +	else if (vcpu->userspace_exit_needed)
> > +		r = EMULATE_USERSPACE_EXIT;
> > 
> >  	else
> >  	
> >  		r = EMULATE_DONE;
> > 
> > @@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu
> > *vcpu, struct kvm_run *kvm_run)
> > 
> >  		}
> >  	
> >  	}
> > 
> > -	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
> > +	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
> > +			vcpu->userspace_exit_needed) {
> > 
> >  		if (vcpu->mmio_needed) {
> >  		
> >  			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
> >  			vcpu->mmio_read_completed = 1;
> >  			vcpu->mmio_needed = 0;
> >  		
> >  		}
> > 
> > +		if (vcpu->userspace_exit_needed) {
> > +			vcpu->userspace_exit_needed = 0;
> > +			r = 0;
> > +			goto out;
> > +		}
> > 
> >  		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> >  		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> >  		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> > 
> > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > index ea2dc1a..4393e4e 100644
> > --- a/include/linux/kvm.h
> > +++ b/include/linux/kvm.h
> > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> > 
> >  #define KVM_EXIT_NMI              16
> >  #define KVM_EXIT_INTERNAL_ERROR   17
> >  #define KVM_EXIT_OSI              18
> > 
> > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> > 
> >  /* For KVM_EXIT_INTERNAL_ERROR */
> >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > 
> > @@ -264,6 +265,13 @@ struct kvm_run {
> > 
> >  		struct {
> >  		
> >  			__u64 gprs[32];
> >  		
> >  		} osi;
> > 
> > +		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
> > +		struct {
> > +			__u32 dev_id;
> > +			__u16 type;
> > +			__u16 entry_idx;
> > +			__u64 flags;
> > +		} msix_routing;
> > 
> >  		/* Fix the size of the union. */
> >  		char padding[256];
> >  	
> >  	};
> > 
> > @@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
> > 
> >  #define KVM_CAP_PPC_GET_PVINFO 57
> >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> >  #define KVM_CAP_ASYNC_PF 59
> > 
> > +#define KVM_CAP_MSIX_MMIO 60
> > 
> >  #ifdef KVM_CAP_IRQ_ROUTING
> > 
> > @@ -672,6 +681,9 @@ struct kvm_clock_data {
> > 
> >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
> >  #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
> >  #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > 
> > +/* Available with KVM_CAP_MSIX_MMIO */
> > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
> > +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
> > 
> >  /* Available with KVM_CAP_PIT_STATE2 */
> >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
> >  #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
> > 
> > @@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
> > 
> >  	__u16 padding[3];
> >  
> >  };
> > 
> > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > +
> > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > +
> > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > +struct kvm_msix_mmio_user {
> > +	__u32 dev_id;
> > +	__u16 type;
> > +	__u16 max_entries_nr;
> > +	__u64 base_addr;
> > +	__u64 base_va;
> > +	__u64 flags;
> > +	__u64 reserved[4];
> > +};
> > +
> > 
> >  #endif /* __LINUX_KVM_H */
> > 
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 6bb211d..6aaf85e 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -68,9 +68,16 @@ enum kvm_bus {
> > 
> >  	KVM_NR_BUSES
> >  
> >  };
> > 
> > +#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
> > 
> >  struct kvm_io_ext_data {
> >  
> >  	int type;
> >  	union {
> > 
> > +		struct {
> > +			u32 dev_id;
> > +			u16 type;
> > +			u16 entry_idx;
> > +			u64 flags;
> > +		} msix_routing;
> > 
> >  		char padding[256];
> >  	
> >  	};
> >  
> >  };
> > 
> > @@ -168,6 +175,8 @@ struct kvm_vcpu {
> > 
> >  	} async_pf;
> >  
> >  #endif
> > 
> > +	int userspace_exit_needed;
> > +
> > 
> >  	struct kvm_vcpu_arch arch;
> >  
> >  };
> > 
> > @@ -241,6 +250,27 @@ struct kvm_memslots {
> > 
> >  					KVM_PRIVATE_MEM_SLOTS];
> >  
> >  };
> > 
> > +#define KVM_MSIX_MMIO_MAX    32
> > +
> > +struct kvm_msix_mmio {
> > +	u32 dev_id;
> > +	u16 type;
> > +	u16 max_entries_nr;
> > +	u64 flags;
> > +	gpa_t table_base_addr;
> > +	hva_t table_base_va;
> > +	gpa_t pba_base_addr;
> > +	hva_t pba_base_va;
> > +};
> > +
> > +struct kvm_msix_mmio_dev {
> > +	struct kvm *kvm;
> > +	struct kvm_io_device table_dev;
> > +	int mmio_nr;
> > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > +	struct mutex lock;
> > +};
> > +
> > 
> >  struct kvm {
> >  
> >  	spinlock_t mmu_lock;
> >  	raw_spinlock_t requests_lock;
> > 
> > @@ -289,6 +319,7 @@ struct kvm {
> > 
> >  	long mmu_notifier_count;
> >  
> >  #endif
> >  
> >  	long tlbs_dirty;
> > 
> > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> > 
> >  };
> >  
> >  /* The guest did something we don't support. */
> > 
> > @@ -561,6 +592,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
> > 
> >  int kvm_request_irq_source_id(struct kvm *kvm);
> >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> > 
> > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > +			int assigned_dev_id, int entry, bool mask);
> > +
> > 
> >  /* For vcpu->arch.iommu_flags */
> >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > 
> > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > index ae72ae6..d1598a6 100644
> > --- a/virt/kvm/assigned-dev.c
> > +++ b/virt/kvm/assigned-dev.c
> > @@ -18,6 +18,7 @@
> > 
> >  #include <linux/interrupt.h>
> >  #include <linux/slab.h>
> >  #include "irq.h"
> > 
> > +#include "msix_mmio.h"
> > 
> >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
> >  list_head *head,
> >  
> >  						      int assigned_dev_id)
> > 
> > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
> > 
> >  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
> >  
> >  }
> > 
> > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > +				struct kvm_assigned_dev_kernel *adev)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = adev->assigned_dev_id;
> > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +	kvm_free_msix_mmio(kvm, &mmio);
> > +}
> > +
> > 
> >  static void kvm_free_assigned_device(struct kvm *kvm,
> >  
> >  				     struct kvm_assigned_dev_kernel
> >  				     *assigned_dev)
> >  
> >  {
> >  
> >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > 
> > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > +
> > 
> >  	__pci_reset_function(assigned_dev->dev);
> >  	pci_restore_state(assigned_dev->dev);
> > 
> > @@ -785,3 +799,33 @@ out:
> >  	return r;
> >  
> >  }
> > 
> > +/* The caller should hold kvm->lock */
> > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > +				int assigned_dev_id, int entry, bool mask)
> > +{
> > +	int r = -EFAULT;
> > +	struct kvm_assigned_dev_kernel *adev;
> > +	int i;
> > +
> > +	if (!irqchip_in_kernel(kvm))
> > +		return r;
> > +
> > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > +				      assigned_dev_id);
> > +	if (!adev)
> > +		goto out;
> > +
> > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > +	for (i = 0; i < adev->entries_nr; i++)
> > +		if (adev->host_msix_entries[i].entry == entry) {
> > +			if (mask)
> > +				disable_irq_nosync(
> > +					adev->host_msix_entries[i].vector);
> > +			else
> > +				enable_irq(adev->host_msix_entries[i].vector);
> > +			r = 0;
> > +			break;
> > +		}
> > +out:
> > +	return r;
> > +}
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index a61f90e..f211e49 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -56,6 +56,7 @@
> > 
> >  #include "coalesced_mmio.h"
> >  #include "async_pf.h"
> > 
> > +#include "msix_mmio.h"
> > 
> >  #define CREATE_TRACE_POINTS
> >  #include <trace/events/kvm.h>
> > 
> > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > 
> >  	struct mm_struct *mm = kvm->mm;
> >  	
> >  	kvm_arch_sync_events(kvm);
> > 
> > +	kvm_unregister_msix_mmio_dev(kvm);
> > 
> >  	spin_lock(&kvm_lock);
> >  	list_del(&kvm->vm_list);
> >  	spin_unlock(&kvm_lock);
> > 
> > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > 
> >  		mutex_unlock(&kvm->lock);
> >  		break;
> >  
> >  #endif
> > 
> > +	case KVM_REGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > +		struct kvm_msix_mmio_user mmio_user;
> > +
> > +		r = -EFAULT;
> > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > +			goto out;
> > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > +		break;
> > +	}
> > 
> >  	default:
> >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> >  		if (r == -ENOTTY)
> > 
> > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > 
> >  		return r;
> >  	
> >  	}
> >  
> >  #endif
> > 
> > +	r = kvm_register_msix_mmio_dev(kvm);
> > +	if (r < 0) {
> > +		kvm_put_kvm(kvm);
> > +		return r;
> > +	}
> > +
> > 
> >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> >  	if (r < 0)
> >  	
> >  		kvm_put_kvm(kvm);
> > 
> > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus
> > *bus)
> > 
> >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> >  
> >  		     int len, const void *val, struct kvm_io_ext_data *ext_data)
> >  
> >  {
> > 
> > -	int i;
> > +	int i, r = -EOPNOTSUPP;
> > 
> >  	struct kvm_io_bus *bus;
> >  	
> >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > 
> > -	for (i = 0; i < bus->dev_count; i++)
> > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
> > +	for (i = 0; i < bus->dev_count; i++) {
> > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
> > +		if (r == -ENOTSYNC)
> > +			break;
> > +		else if (!r)
> > 
> >  			return 0;
> > 
> > -	return -EOPNOTSUPP;
> > +	}
> > +	return r;
> > 
> >  }
> >  
> >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > 
> > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > new file mode 100644
> > index 0000000..083b15b
> > --- /dev/null
> > +++ b/virt/kvm/msix_mmio.c
> > @@ -0,0 +1,296 @@
> > +/*
> > + * MSI-X MMIO emulation
> > + *
> > + * Copyright (c) 2010 Intel Corporation
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Author:
> > + *   Sheng Yang <sheng.yang@intel.com>
> > + */
> > +
> > +#include <linux/kvm_host.h>
> > +#include <linux/kvm.h>
> > +
> > +#include "msix_mmio.h"
> > +#include "iodev.h"
> > +
> > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
> > +				int entry, u32 flag)
> > +{
> > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > +				mmio->dev_id, entry, flag);
> > +	return -EFAULT;
> > +}
> > +
> > +/* Caller must hold dev->lock */
> > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > +				gpa_t addr, int len)
> > +{
> > +	gpa_t start, end;
> > +	int i, r = -EINVAL;
> > +
> > +	for (i = 0; i < dev->mmio_nr; i++) {
> > +		start = dev->mmio[i].table_base_addr;
> > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > +			dev->mmio[i].max_entries_nr;
> > +		if (addr >= start && addr + len <= end) {
> > +			r = i;
> > +			break;
> > +		}
> > +	}
> > +
> > +	return r;
> > +}
> > +
> > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> > +				void *val)
> > +{
> > +	/*TODO: Add big endian support */
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, ret = 0, entry, offset, r;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > +		goto out;
> 
> addr & len as below?
> 
> > +
> > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > +		goto out;
> 
> then this test won't be needed.
> 
> > +
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > +	if (r)
> > +		goto out;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return ret;
> > +}
> > +
> > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > +				int len, const void *val,
> > +				struct kvm_io_ext_data *ext_data)
> > +{
> > +	/*TODO: Add big endian support */
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, entry, offset, ret = 0, r = 0;
> > +	gpa_t entry_base;
> > +	u32 old_ctrl, new_ctrl;
> > +	unsigned long __user *ctrl_pos;
> 
> long? It's 8 bytes on 64 bit.
> You really want
> __le32 old_ctrl, new_ctrl;
> __le32 __user *ctrl_pos;

Using __le32 here may give the wrong impression that we support big endian, which
is not true. So I want to use u32 here, and add a TODO above.
> 
> > +
> > +	mutex_lock(&mmio_dev->kvm->lock);
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx < 0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if (!(len == 4 || len == 8) || addr & (len - 1))
> 
> Nice hack. Even a bit nicer

Thanks Alex for this line. :)

> 	if ((len != 4 && len != 8) || addr & (len - 1))

I think it's a matter of personal style. These two look the same to me...

> 
> > +		goto out;
> > +
> > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > +
> > +	mmio = &mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > +	ctrl_pos = (unsigned long __user *)(entry_base +
> > +			PCI_MSIX_ENTRY_VECTOR_CTRL);
> 
> So this is the issue: if you cast a type to unsigned long *
> compiler can assume that the address is aligned.
> To prevent problems please add a check that table_base_va
> is aligned.

The alignment is already checked when the mmio is registered.
> 
> > +
> > +	if (get_user(old_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	/* Don't allow writing to other fields when entry is unmasked */
> > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > +		goto out;
> > +
> > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > +		goto out;
> > +
> > +	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
> > +	ext_data->msix_routing.dev_id = mmio->dev_id;
> > +	ext_data->msix_routing.type = mmio->type;
> > +	ext_data->msix_routing.entry_idx = entry;
> > +	ext_data->msix_routing.flags = 0;
> > +
> > +	if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) {
> > +		ret = -ENOTSYNC;
> > +		goto out;
> > +	}
> > +
> > +	if (get_user(new_ctrl, ctrl_pos))
> > +		goto out;
> > +
> > +	if (old_ctrl == new_ctrl) {
> > +		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
> > +			ret = -ENOTSYNC;
> > +		goto out;
> > +	}
> > +	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
> > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
> > +				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> > +	if (r)
> > +		ret = -ENOTSYNC;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	mutex_unlock(&mmio_dev->kvm->lock);
> > +	return ret;
> > +}
> > +
> > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > +	.read     = msix_table_mmio_read,
> > +	.write    = msix_table_mmio_write,
> > +};
> > +
> > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > +	kvm->msix_mmio_dev.kvm = kvm;
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > +{
> > +	int ret;
> > +
> > +	mutex_lock(&kvm->slots_lock);
> > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > +				      &kvm->msix_mmio_dev.table_dev);
> > +	mutex_unlock(&kvm->slots_lock);
> > +	return ret;
> > +}
> > +
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	struct kvm_msix_mmio *mmio = NULL;
> > +	int r = 0, i;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > +			mmio = &mmio_dev->mmio[i];
> > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > +				r = -EINVAL;
> > +				goto out;
> > +			}
> > +			break;
> > +		}
> > +	}
> > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	/* All reserved currently */
> > +	if (mmio_user->flags) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +
> > +	/* Check alignment and accessibility */
> > +	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
> > +	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
> 
> You also should check that base_va and friends fit in a pointer
> for 32 bit architectures. Same for other va values.

OK

--
regards
Yang, Sheng

> 
> > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > +		r = -EINVAL;
> > +		goto out;
> > +	}
> > +	if (!mmio) {
> > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > +			r = -ENOSPC;
> > +			goto out;
> > +		}
> > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > +		mmio_dev->mmio_nr++;
> > +	}
> > +
> > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > +	mmio->dev_id = mmio_user->dev_id;
> > +	mmio->flags = mmio_user->flags;
> > +
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +		mmio->table_base_addr = mmio_user->base_addr;
> > +		mmio->table_base_va = mmio_user->base_va;
> > +	}
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> > +
> > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > +	int r = -EINVAL, i, j;
> > +
> > +	if (!mmio)
> > +		return 0;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > +		    mmio_dev->mmio[i].type == mmio->type) {
> > +			r = 0;
> > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > +			mmio_dev->mmio_nr--;
> > +			break;
> > +		}
> > +	}
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> > +
> > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > +				      struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio mmio;
> > +
> > +	mmio.dev_id = mmio_user->dev_id;
> > +	mmio.type = mmio_user->type;
> > +
> > +	return kvm_free_msix_mmio(kvm, &mmio);
> > +}
> > +
> > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > new file mode 100644
> > index 0000000..01b6587
> > --- /dev/null
> > +++ b/virt/kvm/msix_mmio.h
> > @@ -0,0 +1,25 @@
> > +#ifndef __KVM_MSIX_MMIO_H__
> > +#define __KVM_MSIX_MMIO_H__
> > +/*
> > + * MSI-X MMIO emulation
> > + *
> > + * Copyright (c) 2010 Intel Corporation
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > + * the COPYING file in the top-level directory.
> > + *
> > + * Author:
> > + *   Sheng Yang <sheng.yang@intel.com>
> > + */
> > +
> > +#include <linux/pci.h>
> > +
> > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user);
> > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > +				      struct kvm_msix_mmio_user *mmio_user);
> > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user);
> > +
> > +#endif

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-02-24 10:45   ` Michael S. Tsirkin
  2011-02-25  6:28     ` Sheng Yang
@ 2011-02-25  6:50     ` Sheng Yang
  2011-02-25  6:50     ` [PATCH 3/4 v10 UPDATED] " Sheng Yang
  2 siblings, 0 replies; 21+ messages in thread
From: Sheng Yang @ 2011-02-25  6:50 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Michael S. Tsirkin, Alex Williamson, kvm, Sheng Yang

With this, we can support mask bit operations on assigned devices.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/Makefile           |    2 +-
 arch/x86/kvm/mmu.c              |    2 +
 arch/x86/kvm/x86.c              |   40 ++++-
 include/linux/kvm.h             |   28 ++++
 include/linux/kvm_host.h        |   34 +++++
 virt/kvm/assigned-dev.c         |   44 ++++++
 virt/kvm/kvm_main.c             |   38 +++++-
 virt/kvm/msix_mmio.c            |  302 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h            |   25 ++++
 10 files changed, 503 insertions(+), 13 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa75f21..4a390a4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -635,6 +635,7 @@ enum emulation_result {
 	EMULATE_DONE,       /* no further processing */
 	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
 	EMULATE_FAIL,         /* can't emulate this instruction */
+	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
 };
 
 #define EMULTYPE_NO_DECODE	    (1 << 0)
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
-				assigned-dev.o)
+				assigned-dev.o msix_mmio.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cafbb4..912dca4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 	case EMULATE_DO_MMIO:
 		++vcpu->stat.mmio_exits;
 		/* fall through */
+	case EMULATE_USERSPACE_EXIT:
+		/* fall through */
 	case EMULATE_FAIL:
 		return 0;
 	default:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 21b84e2..87308eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
+	case KVM_CAP_MSIX_MMIO:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 {
 	gpa_t                 gpa;
 	struct kvm_io_ext_data ext_data;
+	int r;
 
 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3824,18 +3826,32 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 
 mmio:
 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
+	if (!r)
 		return X86EMUL_CONTINUE;
 
-	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
-	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
-	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
-	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
-	memcpy(vcpu->run->mmio.data, val, bytes);
+	if (r == -ENOTSYNC) {
+		vcpu->userspace_exit_needed = 1;
+		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
+		vcpu->run->msix_routing.dev_id =
+			ext_data.msix_routing.dev_id;
+		vcpu->run->msix_routing.type =
+			ext_data.msix_routing.type;
+		vcpu->run->msix_routing.entry_idx =
+			ext_data.msix_routing.entry_idx;
+		vcpu->run->msix_routing.flags =
+			ext_data.msix_routing.flags;
+	} else  {
+		vcpu->mmio_needed = 1;
+		vcpu->run->exit_reason = KVM_EXIT_MMIO;
+		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+		memcpy(vcpu->run->mmio.data, val, bytes);
+	}
 
 	return X86EMUL_CONTINUE;
 }
@@ -4469,6 +4485,8 @@ done:
 		r = EMULATE_DO_MMIO;
 	} else if (r == EMULATION_RESTART)
 		goto restart;
+	else if (vcpu->userspace_exit_needed)
+		r = EMULATE_USERSPACE_EXIT;
 	else
 		r = EMULATE_DONE;
 
@@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
-	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
+	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
+			vcpu->userspace_exit_needed) {
 		if (vcpu->mmio_needed) {
 			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
 			vcpu->mmio_read_completed = 1;
 			vcpu->mmio_needed = 0;
 		}
+		if (vcpu->userspace_exit_needed) {
+			vcpu->userspace_exit_needed = 0;
+			r = 0;
+			goto out;
+		}
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..4393e4e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -264,6 +265,13 @@ struct kvm_run {
 		struct {
 			__u64 gprs[32];
 		} osi;
+		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
+		struct {
+			__u32 dev_id;
+			__u16 type;
+			__u16 entry_idx;
+			__u64 flags;
+		} msix_routing;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +681,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;
+	__u16 max_entries_nr;
+	__u64 base_addr;
+	__u64 base_va;
+	__u64 flags;
+	__u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6bb211d..6aaf85e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -68,9 +68,16 @@ enum kvm_bus {
 	KVM_NR_BUSES
 };
 
+#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
 struct kvm_io_ext_data {
 	int type;
 	union {
+		struct {
+			u32 dev_id;
+			u16 type;
+			u16 entry_idx;
+			u64 flags;
+		} msix_routing;
 		char padding[256];
 	};
 };
@@ -168,6 +175,8 @@ struct kvm_vcpu {
 	} async_pf;
 #endif
 
+	int userspace_exit_needed;
+
 	struct kvm_vcpu_arch arch;
 };
 
@@ -241,6 +250,27 @@ struct kvm_memslots {
 					KVM_PRIVATE_MEM_SLOTS];
 };
 
+#define KVM_MSIX_MMIO_MAX    32
+
+struct kvm_msix_mmio {
+	u32 dev_id;
+	u16 type;
+	u16 max_entries_nr;
+	u64 flags;
+	gpa_t table_base_addr;
+	hva_t table_base_va;
+	gpa_t pba_base_addr;
+	hva_t pba_base_va;
+};
+
+struct kvm_msix_mmio_dev {
+	struct kvm *kvm;
+	struct kvm_io_device table_dev;
+	int mmio_nr;
+	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
+	struct mutex lock;
+};
+
 struct kvm {
 	spinlock_t mmu_lock;
 	raw_spinlock_t requests_lock;
@@ -289,6 +319,7 @@ struct kvm {
 	long mmu_notifier_count;
 #endif
 	long tlbs_dirty;
+	struct kvm_msix_mmio_dev msix_mmio_dev;
 };
 
 /* The guest did something we don't support. */
@@ -561,6 +592,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+			int assigned_dev_id, int entry, bool mask);
+
 /* For vcpu->arch.iommu_flags */
 #define KVM_IOMMU_CACHE_COHERENCY	0x1
 
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ae72ae6..d1598a6 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include "irq.h"
+#include "msix_mmio.h"
 
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
@@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
 }
 
+static void assigned_device_free_msix_mmio(struct kvm *kvm,
+				struct kvm_assigned_dev_kernel *adev)
+{
+	struct kvm_msix_mmio mmio;
+
+	mmio.dev_id = adev->assigned_dev_id;
+	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
+		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+	kvm_free_msix_mmio(kvm, &mmio);
+}
+
 static void kvm_free_assigned_device(struct kvm *kvm,
 				     struct kvm_assigned_dev_kernel
 				     *assigned_dev)
 {
 	kvm_free_assigned_irq(kvm, assigned_dev);
 
+	assigned_device_free_msix_mmio(kvm, assigned_dev);
+
 	__pci_reset_function(assigned_dev->dev);
 	pci_restore_state(assigned_dev->dev);
 
@@ -785,3 +799,33 @@ out:
 	return r;
 }
 
+/* The caller should hold kvm->lock */
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+				int assigned_dev_id, int entry, bool mask)
+{
+	int r = -EFAULT;
+	struct kvm_assigned_dev_kernel *adev;
+	int i;
+
+	if (!irqchip_in_kernel(kvm))
+		return r;
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev_id);
+	if (!adev)
+		goto out;
+
+	/* For non-MSIX enabled devices, entries_nr == 0 */
+	for (i = 0; i < adev->entries_nr; i++)
+		if (adev->host_msix_entries[i].entry == entry) {
+			if (mask)
+				disable_irq_nosync(
+					adev->host_msix_entries[i].vector);
+			else
+				enable_irq(adev->host_msix_entries[i].vector);
+			r = 0;
+			break;
+		}
+out:
+	return r;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a61f90e..f211e49 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -56,6 +56,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "msix_mmio.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	struct mm_struct *mm = kvm->mm;
 
 	kvm_arch_sync_events(kvm);
+	kvm_unregister_msix_mmio_dev(kvm);
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
@@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
 		mutex_unlock(&kvm->lock);
 		break;
 #endif
+	case KVM_REGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
+		break;
+	}
+	case KVM_UNREGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
@@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
 		return r;
 	}
 #endif
+	r = kvm_register_msix_mmio_dev(kvm);
+	if (r < 0) {
+		kvm_put_kvm(kvm);
+		return r;
+	}
+
 	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 	if (r < 0)
 		kvm_put_kvm(kvm);
@@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val, struct kvm_io_ext_data *ext_data)
 {
-	int i;
+	int i, r = -EOPNOTSUPP;
 	struct kvm_io_bus *bus;
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
+	for (i = 0; i < bus->dev_count; i++) {
+		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
+		if (r == -ENOTSYNC)
+			break;
+		else if (!r)
 			return 0;
-	return -EOPNOTSUPP;
+	}
+	return r;
 }
 
 /* kvm_io_bus_read - called under kvm->slots_lock */
diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
new file mode 100644
index 0000000..4a9cf15
--- /dev/null
+++ b/virt/kvm/msix_mmio.c
@@ -0,0 +1,302 @@
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+
+#include "msix_mmio.h"
+#include "iodev.h"
+
+static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
+				int entry, u32 flag)
+{
+	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		return kvm_assigned_device_update_msix_mask_bit(kvm,
+				mmio->dev_id, entry, flag);
+	return -EFAULT;	/* no other device type is handled */
+}
+
+/* Find the table containing the access; caller must hold dev->lock */
+static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
+				gpa_t addr, int len)
+{
+	gpa_t start, end;
+	int i, r = -EINVAL;	/* -EINVAL: no table contains the access */
+
+	for (i = 0; i < dev->mmio_nr; i++) {
+		start = dev->mmio[i].table_base_addr;
+		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
+			dev->mmio[i].max_entries_nr;	/* exclusive end */
+		if (addr >= start && addr + len <= end) {
+			r = i;	/* first table that fits wins */
+			break;
+		}
+	}
+
+	return r;
+}
+
+/* Guest read from an MSI-X table, served from the userspace copy at base_va */
+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+				void *val)
+{
+	/* TODO: Add big endian support */
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, ret = 0, entry, offset;
+
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;	/* not ours: let the next device try */
+		goto out;
+	}
+	if (!(len == 4 || len == 8) || addr & (len - 1))
+		goto out;	/* NOTE(review): returns 0, *val unwritten */
+	offset = addr % PCI_MSIX_ENTRY_SIZE;
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	/* A faulting copy must not report success with *val unwritten */
+	if (copy_from_user(val, (void __user *)(mmio->table_base_va +
+			entry * PCI_MSIX_ENTRY_SIZE + offset), len))
+		ret = -EOPNOTSUPP;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return ret;
+}
+
+/* Guest write to an MSI-X table; -ENOTSYNC requests a userspace exit */
+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
+				int len, const void *val,
+				struct kvm_io_ext_data *ext_data)
+{
+	/* TODO: Add big endian support */
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, entry, offset, ret = 0, r = 0;
+	void __user *entry_base;
+	u32 __user *ctrl_pos;
+	u32 old_ctrl, new_ctrl;
+
+	mutex_lock(&mmio_dev->kvm->lock);
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;	/* not ours: let the next device try */
+		goto out;
+	}
+	if (!(len == 4 || len == 8) || addr & (len - 1))
+		goto out;	/* silently dropped, reported as handled */
+
+	offset = addr % PCI_MSIX_ENTRY_SIZE;
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	entry_base = (void __user *)(mmio->table_base_va +
+			entry * PCI_MSIX_ENTRY_SIZE);
+	ctrl_pos = entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL;
+
+	if (get_user(old_ctrl, ctrl_pos))
+		goto out;
+
+	/* Don't allow writing to other fields when entry is unmasked */
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
+		goto out;
+
+	if (copy_to_user((void __user *)(entry_base + offset), val, len))
+		goto out;
+
+	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
+	ext_data->msix_routing.dev_id = mmio->dev_id;
+	ext_data->msix_routing.type = mmio->type;
+	ext_data->msix_routing.entry_idx = entry;
+	ext_data->msix_routing.flags = 0;
+
+	/* Ends at or before the ctrl word (incl. 4-byte data writes): sync */
+	if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) {
+		ret = -ENOTSYNC;
+		goto out;
+	}
+
+	if (get_user(new_ctrl, ctrl_pos))
+		goto out;
+
+	if (old_ctrl == new_ctrl) {
+		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
+			ret = -ENOTSYNC;	/* data changed, ctrl did not */
+		goto out;
+	}
+	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
+			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
+				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
+	if (r)
+		ret = -ENOTSYNC;	/* mask update failed in the kernel */
+out:
+	mutex_unlock(&mmio_dev->lock);
+	mutex_unlock(&mmio_dev->kvm->lock);
+	return ret;
+}
+
+static const struct kvm_io_device_ops msix_mmio_table_ops = {
+	.read     = msix_table_mmio_read,
+	.write    = msix_table_mmio_write,	/* .destructor left unset */
+};
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;	/* result of kvm_io_bus_register_dev() */
+
+	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
+	mutex_init(&kvm->msix_mmio_dev.lock);
+	kvm->msix_mmio_dev.kvm = kvm;	/* back-pointer for the handlers */
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;	/* result of kvm_io_bus_unregister_dev() */
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	struct kvm_msix_mmio *mmio = NULL;	/* existing slot, if any */
+	int r = 0, i;
+
+	mutex_lock(&mmio_dev->lock);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
+		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
+			mmio = &mmio_dev->mmio[i];
+			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
+				r = -EINVAL;
+				goto out;
+			}
+			break;	/* re-registration reuses this slot */
+		}
+	}
+	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	/* All flag bits are currently reserved: must be zero */
+	if (mmio_user->flags) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		r = -EINVAL;
+		goto out;
+	}
+
+#ifndef CONFIG_64BIT
+	if (mmio_user->base_va >= 0xffffffff ||
+	    mmio_user->base_addr >= 0xffffffff) {
+		r = -EINVAL;
+		goto out;
+	}
+#endif /* !CONFIG_64BIT */
+	/* base_va must be entry-aligned and writable for the whole table.
+	 * NOTE(review): base_addr alignment is not checked - confirm. */
+	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
+	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
+			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
+		r = -EINVAL;
+		goto out;
+	}
+	if (!mmio) {
+		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
+			r = -ENOSPC;
+			goto out;
+		}
+		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
+		mmio_dev->mmio_nr++;	/* append a new slot */
+	}
+
+	mmio->max_entries_nr = mmio_user->max_entries_nr;
+	mmio->dev_id = mmio_user->dev_id;
+	mmio->flags = mmio_user->flags;
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+		mmio->table_base_addr = mmio_user->base_addr;
+		mmio->table_base_va = mmio_user->base_va;
+	}
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+/* Remove the entry matching mmio's dev_id and type; -EINVAL if none matches */
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	int r = -EINVAL, i, j;
+
+	if (!mmio)
+		return 0;	/* nothing to free */
+	mutex_lock(&mmio_dev->lock);
+	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
+		    mmio_dev->mmio[i].type == mmio->type) {
+			r = 0;
+			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
+				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
+			mmio_dev->mmio_nr--;	/* now indexes the stale tail */
+			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
+			break;
+		}
+	}
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio mmio;	/* key: dev_id + type only */
+
+	mmio.dev_id = mmio_user->dev_id;
+	mmio.type = mmio_user->type;
+
+	return kvm_free_msix_mmio(kvm, &mmio);
+}
+
diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
new file mode 100644
index 0000000..01b6587
--- /dev/null
+++ b/virt/kvm/msix_mmio.h
@@ -0,0 +1,25 @@
+#ifndef __KVM_MSIX_MMIO_H__
+#define __KVM_MSIX_MMIO_H__
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/pci.h>
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm);
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user);
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user);
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user);
+
+#endif
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH 3/4 v10 UPDATED] KVM: Emulate MSI-X table in kernel
  2011-02-24 10:45   ` Michael S. Tsirkin
  2011-02-25  6:28     ` Sheng Yang
  2011-02-25  6:50     ` Sheng Yang
@ 2011-02-25  6:50     ` Sheng Yang
  2 siblings, 0 replies; 21+ messages in thread
From: Sheng Yang @ 2011-02-25  6:50 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Michael S. Tsirkin, Alex Williamson, kvm, Sheng Yang

With this, we can now support mask-bit operations for assigned devices.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/include/asm/kvm_host.h |    1 +
 arch/x86/kvm/Makefile           |    2 +-
 arch/x86/kvm/mmu.c              |    2 +
 arch/x86/kvm/x86.c              |   40 ++++-
 include/linux/kvm.h             |   28 ++++
 include/linux/kvm_host.h        |   34 +++++
 virt/kvm/assigned-dev.c         |   44 ++++++
 virt/kvm/kvm_main.c             |   38 +++++-
 virt/kvm/msix_mmio.c            |  302 +++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h            |   25 ++++
 10 files changed, 503 insertions(+), 13 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa75f21..4a390a4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -635,6 +635,7 @@ enum emulation_result {
 	EMULATE_DONE,       /* no further processing */
 	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
 	EMULATE_FAIL,         /* can't emulate this instruction */
+	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
 };
 
 #define EMULTYPE_NO_DECODE	    (1 << 0)
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
-				assigned-dev.o)
+				assigned-dev.o msix_mmio.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9cafbb4..912dca4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
 	case EMULATE_DO_MMIO:
 		++vcpu->stat.mmio_exits;
 		/* fall through */
+	case EMULATE_USERSPACE_EXIT:
+		/* fall through */
 	case EMULATE_FAIL:
 		return 0;
 	default:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 21b84e2..87308eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
+	case KVM_CAP_MSIX_MMIO:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 {
 	gpa_t                 gpa;
 	struct kvm_io_ext_data ext_data;
+	int r;
 
 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3824,18 +3826,32 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 
 mmio:
 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
+	if (!r)
 		return X86EMUL_CONTINUE;
 
-	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
-	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
-	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
-	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
-	memcpy(vcpu->run->mmio.data, val, bytes);
+	if (r == -ENOTSYNC) {
+		vcpu->userspace_exit_needed = 1;
+		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
+		vcpu->run->msix_routing.dev_id =
+			ext_data.msix_routing.dev_id;
+		vcpu->run->msix_routing.type =
+			ext_data.msix_routing.type;
+		vcpu->run->msix_routing.entry_idx =
+			ext_data.msix_routing.entry_idx;
+		vcpu->run->msix_routing.flags =
+			ext_data.msix_routing.flags;
+	} else  {
+		vcpu->mmio_needed = 1;
+		vcpu->run->exit_reason = KVM_EXIT_MMIO;
+		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+		memcpy(vcpu->run->mmio.data, val, bytes);
+	}
 
 	return X86EMUL_CONTINUE;
 }
@@ -4469,6 +4485,8 @@ done:
 		r = EMULATE_DO_MMIO;
 	} else if (r == EMULATION_RESTART)
 		goto restart;
+	else if (vcpu->userspace_exit_needed)
+		r = EMULATE_USERSPACE_EXIT;
 	else
 		r = EMULATE_DONE;
 
@@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
-	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
+	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
+			vcpu->userspace_exit_needed) {
 		if (vcpu->mmio_needed) {
 			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
 			vcpu->mmio_read_completed = 1;
 			vcpu->mmio_needed = 0;
 		}
+		if (vcpu->userspace_exit_needed) {
+			vcpu->userspace_exit_needed = 0;
+			r = 0;
+			goto out;
+		}
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..4393e4e 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -264,6 +265,13 @@ struct kvm_run {
 		struct {
 			__u64 gprs[32];
 		} osi;
+		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
+		struct {
+			__u32 dev_id;
+			__u16 type;
+			__u16 entry_idx;
+			__u64 flags;
+		} msix_routing;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +681,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
+struct kvm_msix_mmio_user {
+	__u32 dev_id;		/* used as the assigned device id */
+	__u16 type;		/* KVM_MSIX_MMIO_TYPE_* device | base bits */
+	__u16 max_entries_nr;	/* at most KVM_MAX_MSIX_PER_DEV */
+	__u64 base_addr;	/* guest physical address of the table */
+	__u64 base_va;		/* userspace address backing the table */
+	__u64 flags;		/* reserved, must be 0 */
+	__u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 6bb211d..6aaf85e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -68,9 +68,16 @@ enum kvm_bus {
 	KVM_NR_BUSES
 };
 
+#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
 struct kvm_io_ext_data {
 	int type;
 	union {
+		struct {
+			u32 dev_id;
+			u16 type;
+			u16 entry_idx;
+			u64 flags;
+		} msix_routing;
 		char padding[256];
 	};
 };
@@ -168,6 +175,8 @@ struct kvm_vcpu {
 	} async_pf;
 #endif
 
+	int userspace_exit_needed;
+
 	struct kvm_vcpu_arch arch;
 };
 
@@ -241,6 +250,27 @@ struct kvm_memslots {
 					KVM_PRIVATE_MEM_SLOTS];
 };
 
+#define KVM_MSIX_MMIO_MAX    32
+
+struct kvm_msix_mmio {
+	u32 dev_id;
+	u16 type;		/* KVM_MSIX_MMIO_TYPE_* bits */
+	u16 max_entries_nr;	/* table size in entries */
+	u64 flags;
+	gpa_t table_base_addr;	/* guest physical base of the table */
+	hva_t table_base_va;	/* userspace address backing the table */
+	gpa_t pba_base_addr;	/* not referenced by this patch */
+	hva_t pba_base_va;	/* not referenced by this patch */
+};
+
+struct kvm_msix_mmio_dev {
+	struct kvm *kvm;	/* owning VM, set at registration */
+	struct kvm_io_device table_dev;	/* registered on KVM_MMIO_BUS */
+	int mmio_nr;		/* number of used slots in mmio[] */
+	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
+	struct mutex lock;	/* taken around mmio[] / mmio_nr accesses */
+};
+
 struct kvm {
 	spinlock_t mmu_lock;
 	raw_spinlock_t requests_lock;
@@ -289,6 +319,7 @@ struct kvm {
 	long mmu_notifier_count;
 #endif
 	long tlbs_dirty;
+	struct kvm_msix_mmio_dev msix_mmio_dev;
 };
 
 /* The guest did something we don't support. */
@@ -561,6 +592,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+			int assigned_dev_id, int entry, bool mask);
+
 /* For vcpu->arch.iommu_flags */
 #define KVM_IOMMU_CACHE_COHERENCY	0x1
 
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ae72ae6..d1598a6 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/slab.h>
 #include "irq.h"
+#include "msix_mmio.h"
 
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
@@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
 }
 
+static void assigned_device_free_msix_mmio(struct kvm *kvm,
+				struct kvm_assigned_dev_kernel *adev)
+{
+	struct kvm_msix_mmio mmio;	/* key: dev_id + type only */
+
+	mmio.dev_id = adev->assigned_dev_id;
+	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
+		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+	kvm_free_msix_mmio(kvm, &mmio);	/* result is ignored */
+}
+
 static void kvm_free_assigned_device(struct kvm *kvm,
 				     struct kvm_assigned_dev_kernel
 				     *assigned_dev)
 {
 	kvm_free_assigned_irq(kvm, assigned_dev);
 
+	assigned_device_free_msix_mmio(kvm, assigned_dev);
+
 	__pci_reset_function(assigned_dev->dev);
 	pci_restore_state(assigned_dev->dev);
 
@@ -785,3 +799,33 @@ out:
 	return r;
 }
 
+/* Disable/enable the host IRQ of one MSI-X entry; caller should hold kvm->lock */
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+				int assigned_dev_id, int entry, bool mask)
+{
+	int r = -EFAULT;	/* kept unless a matching entry is toggled */
+	struct kvm_assigned_dev_kernel *adev;
+	int i;
+
+	if (!irqchip_in_kernel(kvm))
+		return r;
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev_id);
+	if (!adev)
+		goto out;
+
+	/* For non-MSIX enabled devices, entries_nr == 0: the loop is skipped */
+	for (i = 0; i < adev->entries_nr; i++)
+		if (adev->host_msix_entries[i].entry == entry) {
+			if (mask)
+				disable_irq_nosync(
+					adev->host_msix_entries[i].vector);
+			else
+				enable_irq(adev->host_msix_entries[i].vector);
+			r = 0;
+			break;
+		}
+out:
+	return r;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a61f90e..f211e49 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -56,6 +56,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "msix_mmio.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 	struct mm_struct *mm = kvm->mm;
 
 	kvm_arch_sync_events(kvm);
+	kvm_unregister_msix_mmio_dev(kvm);
 	spin_lock(&kvm_lock);
 	list_del(&kvm->vm_list);
 	spin_unlock(&kvm_lock);
@@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
 		mutex_unlock(&kvm->lock);
 		break;
 #endif
+	case KVM_REGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
+		break;
+	}
+	case KVM_UNREGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
@@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
 		return r;
 	}
 #endif
+	r = kvm_register_msix_mmio_dev(kvm);
+	if (r < 0) {
+		kvm_put_kvm(kvm);
+		return r;
+	}
+
 	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 	if (r < 0)
 		kvm_put_kvm(kvm);
@@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val, struct kvm_io_ext_data *ext_data)
 {
-	int i;
+	int i, r = -EOPNOTSUPP;
 	struct kvm_io_bus *bus;
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
+	for (i = 0; i < bus->dev_count; i++) {
+		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
+		if (r == -ENOTSYNC)
+			break;
+		else if (!r)
 			return 0;
-	return -EOPNOTSUPP;
+	}
+	return r;
 }
 
 /* kvm_io_bus_read - called under kvm->slots_lock */
diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
new file mode 100644
index 0000000..4a9cf15
--- /dev/null
+++ b/virt/kvm/msix_mmio.c
@@ -0,0 +1,302 @@
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+
+#include "msix_mmio.h"
+#include "iodev.h"
+
+static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
+				int entry, u32 flag)
+{
+	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		return kvm_assigned_device_update_msix_mask_bit(kvm,
+				mmio->dev_id, entry, flag);
+	return -EFAULT;	/* no other device type is handled */
+}
+
+/* Find the table containing the access; caller must hold dev->lock */
+static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
+				gpa_t addr, int len)
+{
+	gpa_t start, end;
+	int i, r = -EINVAL;	/* -EINVAL: no table contains the access */
+
+	for (i = 0; i < dev->mmio_nr; i++) {
+		start = dev->mmio[i].table_base_addr;
+		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
+			dev->mmio[i].max_entries_nr;	/* exclusive end */
+		if (addr >= start && addr + len <= end) {
+			r = i;	/* first table that fits wins */
+			break;
+		}
+	}
+
+	return r;
+}
+
+/* Guest read from an MSI-X table, served from the userspace copy at base_va */
+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+				void *val)
+{
+	/* TODO: Add big endian support */
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, ret = 0, entry, offset;
+
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;	/* not ours: let the next device try */
+		goto out;
+	}
+	if (!(len == 4 || len == 8) || addr & (len - 1))
+		goto out;	/* NOTE(review): returns 0, *val unwritten */
+	offset = addr % PCI_MSIX_ENTRY_SIZE;
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	/* A faulting copy must not report success with *val unwritten */
+	if (copy_from_user(val, (void __user *)(mmio->table_base_va +
+			entry * PCI_MSIX_ENTRY_SIZE + offset), len))
+		ret = -EOPNOTSUPP;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return ret;
+}
+
+/* Guest write to an MSI-X table; -ENOTSYNC requests a userspace exit */
+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
+				int len, const void *val,
+				struct kvm_io_ext_data *ext_data)
+{
+	/* TODO: Add big endian support */
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, entry, offset, ret = 0, r = 0;
+	void __user *entry_base;
+	u32 __user *ctrl_pos;
+	u32 old_ctrl, new_ctrl;
+
+	mutex_lock(&mmio_dev->kvm->lock);
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;	/* not ours: let the next device try */
+		goto out;
+	}
+	if (!(len == 4 || len == 8) || addr & (len - 1))
+		goto out;	/* silently dropped, reported as handled */
+
+	offset = addr % PCI_MSIX_ENTRY_SIZE;
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	entry_base = (void __user *)(mmio->table_base_va +
+			entry * PCI_MSIX_ENTRY_SIZE);
+	ctrl_pos = entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL;
+
+	if (get_user(old_ctrl, ctrl_pos))
+		goto out;
+
+	/* Don't allow writing to other fields when entry is unmasked */
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
+		goto out;
+
+	if (copy_to_user((void __user *)(entry_base + offset), val, len))
+		goto out;
+
+	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
+	ext_data->msix_routing.dev_id = mmio->dev_id;
+	ext_data->msix_routing.type = mmio->type;
+	ext_data->msix_routing.entry_idx = entry;
+	ext_data->msix_routing.flags = 0;
+
+	/* Ends at or before the ctrl word (incl. 4-byte data writes): sync */
+	if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) {
+		ret = -ENOTSYNC;
+		goto out;
+	}
+
+	if (get_user(new_ctrl, ctrl_pos))
+		goto out;
+
+	if (old_ctrl == new_ctrl) {
+		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
+			ret = -ENOTSYNC;	/* data changed, ctrl did not */
+		goto out;
+	}
+	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
+			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
+				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
+	if (r)
+		ret = -ENOTSYNC;	/* mask update failed in the kernel */
+out:
+	mutex_unlock(&mmio_dev->lock);
+	mutex_unlock(&mmio_dev->kvm->lock);
+	return ret;
+}
+
+static const struct kvm_io_device_ops msix_mmio_table_ops = {
+	.read     = msix_table_mmio_read,
+	.write    = msix_table_mmio_write,	/* .destructor left unset */
+};
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;	/* result of kvm_io_bus_register_dev() */
+
+	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
+	mutex_init(&kvm->msix_mmio_dev.lock);
+	kvm->msix_mmio_dev.kvm = kvm;	/* back-pointer for the handlers */
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;	/* result of kvm_io_bus_unregister_dev() */
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	struct kvm_msix_mmio *mmio = NULL;	/* existing slot, if any */
+	int r = 0, i;
+
+	mutex_lock(&mmio_dev->lock);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
+		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
+			mmio = &mmio_dev->mmio[i];
+			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
+				r = -EINVAL;
+				goto out;
+			}
+			break;	/* re-registration reuses this slot */
+		}
+	}
+	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	/* All flag bits are currently reserved: must be zero */
+	if (mmio_user->flags) {
+		r = -EINVAL;
+		goto out;
+	}
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
+		r = -EINVAL;
+		goto out;
+	}
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		r = -EINVAL;
+		goto out;
+	}
+
+#ifndef CONFIG_64BIT
+	if (mmio_user->base_va >= 0xffffffff ||
+	    mmio_user->base_addr >= 0xffffffff) {
+		r = -EINVAL;
+		goto out;
+	}
+#endif /* !CONFIG_64BIT */
+	/* base_va must be entry-aligned and writable for the whole table.
+	 * NOTE(review): base_addr alignment is not checked - confirm. */
+	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
+	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
+			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
+		r = -EINVAL;
+		goto out;
+	}
+	if (!mmio) {
+		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
+			r = -ENOSPC;
+			goto out;
+		}
+		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
+		mmio_dev->mmio_nr++;	/* append a new slot */
+	}
+
+	mmio->max_entries_nr = mmio_user->max_entries_nr;
+	mmio->dev_id = mmio_user->dev_id;
+	mmio->flags = mmio_user->flags;
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+		mmio->table_base_addr = mmio_user->base_addr;
+		mmio->table_base_va = mmio_user->base_va;
+	}
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+/* Remove the entry matching mmio's dev_id and type; -EINVAL if none matches */
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	int r = -EINVAL, i, j;
+
+	if (!mmio)
+		return 0;	/* nothing to free */
+	mutex_lock(&mmio_dev->lock);
+	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
+		    mmio_dev->mmio[i].type == mmio->type) {
+			r = 0;
+			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
+				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
+			mmio_dev->mmio_nr--;	/* now indexes the stale tail */
+			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
+			break;
+		}
+	}
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio mmio;	/* key: dev_id + type only */
+
+	mmio.dev_id = mmio_user->dev_id;
+	mmio.type = mmio_user->type;
+
+	return kvm_free_msix_mmio(kvm, &mmio);
+}
+
diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
new file mode 100644
index 0000000..01b6587
--- /dev/null
+++ b/virt/kvm/msix_mmio.h
@@ -0,0 +1,41 @@
+#ifndef __KVM_MSIX_MMIO_H__
+#define __KVM_MSIX_MMIO_H__
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/pci.h>
+
+/*
+ * Forward declarations, so the prototypes below do not depend on which
+ * headers the including file happened to pull in first.
+ */
+struct kvm;
+struct kvm_msix_mmio;
+struct kvm_msix_mmio_user;
+
+/* Called from VM creation and VM destruction respectively. */
+int kvm_register_msix_mmio_dev(struct kvm *kvm);
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
+
+/* Back the KVM_REGISTER_MSIX_MMIO / KVM_UNREGISTER_MSIX_MMIO vm ioctls. */
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user);
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio_user);
+
+/*
+ * Remove the entry whose dev_id and type match @mmio.  @mmio is the
+ * in-kernel struct kvm_msix_mmio, not the userspace request struct.
+ */
+int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio);
+
+#endif
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler
  2011-02-25  3:23     ` Sheng Yang
@ 2011-02-25  8:12       ` Michael S. Tsirkin
  2011-02-28  5:13         ` Sheng Yang
  0 siblings, 1 reply; 21+ messages in thread
From: Michael S. Tsirkin @ 2011-02-25  8:12 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Fri, Feb 25, 2011 at 11:23:30AM +0800, Sheng Yang wrote:
> On Thursday 24 February 2011 18:22:19 Michael S. Tsirkin wrote:
> > On Thu, Feb 24, 2011 at 05:51:03PM +0800, Sheng Yang wrote:
> > > Add a new parameter to IO writing handler, so that we can transfer
> > > information from IO handler to caller.
> > > 
> > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > ---
> > > 
> > >  arch/x86/kvm/i8254.c      |    6 ++++--
> > >  arch/x86/kvm/i8259.c      |    3 ++-
> > >  arch/x86/kvm/lapic.c      |    3 ++-
> > >  arch/x86/kvm/x86.c        |   13 ++++++++-----
> > >  include/linux/kvm_host.h  |   12 ++++++++++--
> > >  virt/kvm/coalesced_mmio.c |    3 ++-
> > >  virt/kvm/eventfd.c        |    2 +-
> > >  virt/kvm/ioapic.c         |    2 +-
> > >  virt/kvm/iodev.h          |    6 ++++--
> > >  virt/kvm/kvm_main.c       |    4 ++--
> > >  10 files changed, 36 insertions(+), 18 deletions(-)
> > > 
> > > diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
> > > index efad723..bd8f0c5 100644
> > > --- a/arch/x86/kvm/i8254.c
> > > +++ b/arch/x86/kvm/i8254.c
> > > @@ -439,7 +439,8 @@ static inline int pit_in_range(gpa_t addr)
> > > 
> > >  }
> > >  
> > >  static int pit_ioport_write(struct kvm_io_device *this,
> > > 
> > > -			    gpa_t addr, int len, const void *data)
> > > +			    gpa_t addr, int len, const void *data,
> > > +			    struct kvm_io_ext_data *ext_data)
> > > 
> > >  {
> > >  
> > >  	struct kvm_pit *pit = dev_to_pit(this);
> > >  	struct kvm_kpit_state *pit_state = &pit->pit_state;
> > > 
> > > @@ -585,7 +586,8 @@ static int pit_ioport_read(struct kvm_io_device
> > > *this,
> > > 
> > >  }
> > >  
> > >  static int speaker_ioport_write(struct kvm_io_device *this,
> > > 
> > > -				gpa_t addr, int len, const void *data)
> > > +				gpa_t addr, int len, const void *data,
> > > +				struct kvm_io_ext_data *ext_data)
> > > 
> > >  {
> > >  
> > >  	struct kvm_pit *pit = speaker_to_pit(this);
> > >  	struct kvm_kpit_state *pit_state = &pit->pit_state;
> > > 
> > > diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
> > > index 3cece05..96b1070 100644
> > > --- a/arch/x86/kvm/i8259.c
> > > +++ b/arch/x86/kvm/i8259.c
> > > @@ -480,7 +480,8 @@ static inline struct kvm_pic *to_pic(struct
> > > kvm_io_device *dev)
> > > 
> > >  }
> > >  
> > >  static int picdev_write(struct kvm_io_device *this,
> > > 
> > > -			 gpa_t addr, int len, const void *val)
> > > +			 gpa_t addr, int len, const void *val,
> > > +			 struct kvm_io_ext_data *ext_data)
> > > 
> > >  {
> > >  
> > >  	struct kvm_pic *s = to_pic(this);
> > >  	unsigned char data = *(unsigned char *)val;
> > > 
> > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > index 93cf9d0..f413e9c 100644
> > > --- a/arch/x86/kvm/lapic.c
> > > +++ b/arch/x86/kvm/lapic.c
> > > @@ -836,7 +836,8 @@ static int apic_reg_write(struct kvm_lapic *apic, u32
> > > reg, u32 val)
> > > 
> > >  }
> > >  
> > >  static int apic_mmio_write(struct kvm_io_device *this,
> > > 
> > > -			    gpa_t address, int len, const void *data)
> > > +			    gpa_t address, int len, const void *data,
> > > +			    struct kvm_io_ext_data *ext_data)
> > > 
> > >  {
> > >  
> > >  	struct kvm_lapic *apic = to_lapic(this);
> > >  	unsigned int offset = address - apic->base_address;
> > > 
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index fa708c9..21b84e2 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -3571,13 +3571,14 @@ static void kvm_init_msr_list(void)
> > > 
> > >  }
> > >  
> > >  static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
> > > 
> > > -			   const void *v)
> > > +			   const void *v, struct kvm_io_ext_data *ext_data)
> > > 
> > >  {
> > >  
> > >  	if (vcpu->arch.apic &&
> > > 
> > > -	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
> > > +	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v, ext_data))
> > > 
> > >  		return 0;
> > > 
> > > -	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
> > > +	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS,
> > > +				addr, len, v, ext_data);
> > > 
> > >  }
> > >  
> > >  static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len,
> > >  void *v)
> > > 
> > > @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned
> > > long addr,
> > > 
> > >  					   struct kvm_vcpu *vcpu)
> > >  
> > >  {
> > >  
> > >  	gpa_t                 gpa;
> > > 
> > > +	struct kvm_io_ext_data ext_data;
> > > 
> > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > 
> > > @@ -3825,7 +3827,7 @@ mmio:
> > >  	/*
> > >  	
> > >  	 * Is this MMIO handled locally?
> > >  	 */
> > > 
> > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > +	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> > > 
> > >  		return X86EMUL_CONTINUE;
> > >  	
> > >  	vcpu->mmio_needed = 1;
> > > 
> > > @@ -3940,6 +3942,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void
> > > *pd)
> > > 
> > >  {
> > >  
> > >  	/* TODO: String I/O for in kernel device */
> > >  	int r;
> > > 
> > > +	struct kvm_io_ext_data ext_data;
> > > 
> > >  	if (vcpu->arch.pio.in)
> > >  	
> > >  		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
> > > 
> > > @@ -3947,7 +3950,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void
> > > *pd)
> > > 
> > >  	else
> > >  	
> > >  		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
> > >  		
> > >  				     vcpu->arch.pio.port, vcpu->arch.pio.size,
> > > 
> > > -				     pd);
> > > +				     pd, &ext_data);
> > > 
> > >  	return r;
> > >  
> > >  }
> > > 
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 7d313e0..6bb211d 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -68,8 +68,15 @@ enum kvm_bus {
> > > 
> > >  	KVM_NR_BUSES
> > >  
> > >  };
> > > 
> > > +struct kvm_io_ext_data {
> > > +	int type;
> > 
> > What values does this get? Please add documentation in comments.
> 
> See the next patch.
> > 
> > > +	union {
> > > +		char padding[256];
> > > +	};
> > 
> > So the structure size is 260 bytes?
> > What's the point of the padding?
> 
> Reserved spaces. Also used in the next patch.

I was unable to find anything related to padding in the next patch.
This is an internal API, isn't it? So why reserve space? Further,
256 bytes is quite a lot to reserve. Also,
making the total size a power of 2 would seem to make more sense if we
do need to reserve any space.

> --
> regards
> Yang, Sheng
> 
> > 
> > > +};
> > > +
> > > 
> > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > > 
> > > -		     int len, const void *val);
> > > +		     int len, const void *val, struct kvm_io_ext_data *data);
> > > 
> > >  int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > >  int len,
> > >  
> > >  		    void *val);
> > >  
> > >  int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> > > 
> > > @@ -113,7 +120,8 @@ struct kvm_io_device_ops {
> > > 
> > >  	int (*write)(struct kvm_io_device *this,
> > >  	
> > >  		     gpa_t addr,
> > >  		     int len,
> > > 
> > > -		     const void *val);
> > > +		     const void *val,
> > > +		     struct kvm_io_ext_data *data);
> > > 
> > >  	void (*destructor)(struct kvm_io_device *this);
> > >  
> > >  };
> > > 
> > > diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
> > > index fc84875..37b254c 100644
> > > --- a/virt/kvm/coalesced_mmio.c
> > > +++ b/virt/kvm/coalesced_mmio.c
> > > @@ -59,7 +59,8 @@ static int coalesced_mmio_in_range(struct
> > > kvm_coalesced_mmio_dev *dev,
> > > 
> > >  }
> > >  
> > >  static int coalesced_mmio_write(struct kvm_io_device *this,
> > > 
> > > -				gpa_t addr, int len, const void *val)
> > > +				gpa_t addr, int len, const void *val,
> > > +				struct kvm_io_ext_data *ext_data)
> > > 
> > >  {
> > >  
> > >  	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
> > >  	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
> > > 
> > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > index 2ca4535..8edd757 100644
> > > --- a/virt/kvm/eventfd.c
> > > +++ b/virt/kvm/eventfd.c
> > > @@ -483,7 +483,7 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr,
> > > int len, const void *val)
> > > 
> > >  /* MMIO/PIO writes trigger an event if the addr/val match */
> > >  static int
> > >  ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
> > > 
> > > -		const void *val)
> > > +		const void *val, struct kvm_io_ext_data *ext_data)
> > > 
> > >  {
> > >  
> > >  	struct _ioeventfd *p = to_ioeventfd(this);
> > > 
> > > diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> > > index 0b9df83..6a027ef 100644
> > > --- a/virt/kvm/ioapic.c
> > > +++ b/virt/kvm/ioapic.c
> > > @@ -321,7 +321,7 @@ static int ioapic_mmio_read(struct kvm_io_device
> > > *this, gpa_t addr, int len,
> > > 
> > >  }
> > >  
> > >  static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int
> > >  len,
> > > 
> > > -			     const void *val)
> > > +			     const void *val, struct kvm_io_ext_data *ext_data)
> > > 
> > >  {
> > >  
> > >  	struct kvm_ioapic *ioapic = to_ioapic(this);
> > >  	u32 data;
> > > 
> > > diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
> > > index d1f5651..340ab79 100644
> > > --- a/virt/kvm/iodev.h
> > > +++ b/virt/kvm/iodev.h
> > > @@ -33,9 +33,11 @@ static inline int kvm_iodevice_read(struct
> > > kvm_io_device *dev,
> > > 
> > >  }
> > >  
> > >  static inline int kvm_iodevice_write(struct kvm_io_device *dev,
> > > 
> > > -				     gpa_t addr, int l, const void *v)
> > > +				     gpa_t addr, int l, const void *v,
> > > +				     struct kvm_io_ext_data *data)
> > > 
> > >  {
> > > 
> > > > -	return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
> > > > +	return dev->ops->write ?
> > > > +		dev->ops->write(dev, addr, l, v, data) : -EOPNOTSUPP;
> > > 
> > >  }
> > >  
> > >  static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
> > > 
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index b1b6cbb..a61f90e 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -2221,14 +2221,14 @@ static void kvm_io_bus_destroy(struct kvm_io_bus
> > > *bus)
> > > 
> > >  /* kvm_io_bus_write - called under kvm->slots_lock */
> > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > > 
> > > -		     int len, const void *val)
> > > +		     int len, const void *val, struct kvm_io_ext_data *ext_data)
> > > 
> > >  {
> > >  
> > >  	int i;
> > >  	struct kvm_io_bus *bus;
> > >  	
> > >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > >  	for (i = 0; i < bus->dev_count; i++)
> > > 
> > > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> > > +		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
> > > 
> > >  			return 0;
> > >  	
> > >  	return -EOPNOTSUPP;
> > >  
> > >  }

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-02-25  6:28     ` Sheng Yang
@ 2011-02-25  8:29       ` Michael S. Tsirkin
  2011-02-28  5:18         ` Sheng Yang
  2011-03-01 20:18         ` Marcelo Tosatti
  0 siblings, 2 replies; 21+ messages in thread
From: Michael S. Tsirkin @ 2011-02-25  8:29 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Fri, Feb 25, 2011 at 02:28:02PM +0800, Sheng Yang wrote:
> On Thursday 24 February 2011 18:45:08 Michael S. Tsirkin wrote:
> > On Thu, Feb 24, 2011 at 05:51:04PM +0800, Sheng Yang wrote:
> > > Then we can support mask bit operation of assigned devices now.
> > > 
> > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > 
> > Doesn't look like all comments got addressed.
> > E.g. gpa_t entry_base is still there and in reality
> > you said it's a host virtual address so
> > should be void __user *;
> 
> Would update it.
> 
> > And ENOTSYNC meaning 'MSIX' is pretty hacky.
> 
> I'd like to discuss it later. We may need some work on all MMIO handling side to 
> make it more straightforward. But I don't want to bundle it with this one... 

It's not PCI related so I'll defer to Avi/Marcelo on this.
Are you guys happy with the ENOTSYNC meaning 'MSIX'
and userspace_exit_needed hacks in this code?


> > 
> > > ---
> > > 
> > >  arch/x86/include/asm/kvm_host.h |    1 +
> > >  arch/x86/kvm/Makefile           |    2 +-
> > >  arch/x86/kvm/mmu.c              |    2 +
> > >  arch/x86/kvm/x86.c              |   40 ++++-
> > >  include/linux/kvm.h             |   28 ++++
> > >  include/linux/kvm_host.h        |   34 +++++
> > >  virt/kvm/assigned-dev.c         |   44 ++++++
> > >  virt/kvm/kvm_main.c             |   38 +++++-
> > > >  virt/kvm/msix_mmio.c            |  296 +++++++++++++++++++++++++++++++++++++++
> > > >  virt/kvm/msix_mmio.h            |   25 ++++
> > >  10 files changed, 497 insertions(+), 13 deletions(-)
> > >  create mode 100644 virt/kvm/msix_mmio.c
> > >  create mode 100644 virt/kvm/msix_mmio.h
> > > 
> > > diff --git a/arch/x86/include/asm/kvm_host.h
> > > b/arch/x86/include/asm/kvm_host.h index aa75f21..4a390a4 100644
> > > --- a/arch/x86/include/asm/kvm_host.h
> > > +++ b/arch/x86/include/asm/kvm_host.h
> > > @@ -635,6 +635,7 @@ enum emulation_result {
> > > 
> > >  	EMULATE_DONE,       /* no further processing */
> > >  	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
> > >  	EMULATE_FAIL,         /* can't emulate this instruction */
> > > 
> > > +	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
> > > 
> > >  };
> > >  
> > >  #define EMULTYPE_NO_DECODE	    (1 << 0)
> > > 
> > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > index f15501f..3a0d851 100644
> > > --- a/arch/x86/kvm/Makefile
> > > +++ b/arch/x86/kvm/Makefile
> > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > 
> > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
> > >  
> > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > 
> > > -				assigned-dev.o)
> > > +				assigned-dev.o msix_mmio.o)
> > > 
> > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> > >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> > >  async_pf.o)
> > > 
> > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > > index 9cafbb4..912dca4 100644
> > > --- a/arch/x86/kvm/mmu.c
> > > +++ b/arch/x86/kvm/mmu.c
> > > @@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t
> > > cr2, u32 error_code,
> > > 
> > >  	case EMULATE_DO_MMIO:
> > >  		++vcpu->stat.mmio_exits;
> > >  		/* fall through */
> > > 
> > > +	case EMULATE_USERSPACE_EXIT:
> > > +		/* fall through */
> > > 
> > >  	case EMULATE_FAIL:
> > >  		return 0;
> > >  	
> > >  	default:
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index 21b84e2..87308eb 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > 
> > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > >  	case KVM_CAP_XSAVE:
> > > 
> > >  	case KVM_CAP_ASYNC_PF:
> > > +	case KVM_CAP_MSIX_MMIO:
> > >  		r = 1;
> > >  		break;
> > >  	
> > >  	case KVM_CAP_COALESCED_MMIO:
> > > @@ -3809,6 +3810,7 @@ static int emulator_write_emulated_onepage(unsigned
> > > long addr,
> > > 
> > >  {
> > >  
> > >  	gpa_t                 gpa;
> > >  	struct kvm_io_ext_data ext_data;
> > > 
> > > +	int r;
> > > 
> > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > 
> > > @@ -3824,18 +3826,32 @@ static int
> > > emulator_write_emulated_onepage(unsigned long addr,
> > > 
> > >  mmio:
> > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > 
> > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
> > > 
> > >  	/*
> > >  	
> > >  	 * Is this MMIO handled locally?
> > >  	 */
> > > 
> > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> > > +	if (!r)
> > > 
> > >  		return X86EMUL_CONTINUE;
> > > 
> > > -	vcpu->mmio_needed = 1;
> > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > -	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > > -	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > > -	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > -	memcpy(vcpu->run->mmio.data, val, bytes);
> > > +	if (r == -ENOTSYNC) {
> > > +		vcpu->userspace_exit_needed = 1;
> > > +		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
> > > +		vcpu->run->msix_routing.dev_id =
> > > +			ext_data.msix_routing.dev_id;
> > > +		vcpu->run->msix_routing.type =
> > > +			ext_data.msix_routing.type;
> > > +		vcpu->run->msix_routing.entry_idx =
> > > +			ext_data.msix_routing.entry_idx;
> > > +		vcpu->run->msix_routing.flags =
> > > +			ext_data.msix_routing.flags;
> > > +	} else  {
> > > +		vcpu->mmio_needed = 1;
> > > +		vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > +		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > > +		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > > +		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > +		memcpy(vcpu->run->mmio.data, val, bytes);
> > > +	}
> > > 
> > >  	return X86EMUL_CONTINUE;
> > >  
> > >  }
> > > 
> > > @@ -4469,6 +4485,8 @@ done:
> > >  		r = EMULATE_DO_MMIO;
> > >  	
> > >  	} else if (r == EMULATION_RESTART)
> > >  	
> > >  		goto restart;
> > > 
> > > +	else if (vcpu->userspace_exit_needed)
> > > +		r = EMULATE_USERSPACE_EXIT;
> > > 
> > >  	else
> > >  	
> > >  		r = EMULATE_DONE;
> > > 
> > > @@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu
> > > *vcpu, struct kvm_run *kvm_run)
> > > 
> > >  		}
> > >  	
> > >  	}
> > > 
> > > -	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
> > > +	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
> > > +			vcpu->userspace_exit_needed) {
> > > 
> > >  		if (vcpu->mmio_needed) {
> > >  		
> > >  			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
> > >  			vcpu->mmio_read_completed = 1;
> > >  			vcpu->mmio_needed = 0;
> > >  		
> > >  		}
> > > 
> > > +		if (vcpu->userspace_exit_needed) {
> > > +			vcpu->userspace_exit_needed = 0;
> > > +			r = 0;
> > > +			goto out;
> > > +		}
> > > 
> > >  		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> > >  		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> > >  		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> > > 
> > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > index ea2dc1a..4393e4e 100644
> > > --- a/include/linux/kvm.h
> > > +++ b/include/linux/kvm.h
> > > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> > > 
> > >  #define KVM_EXIT_NMI              16
> > >  #define KVM_EXIT_INTERNAL_ERROR   17
> > >  #define KVM_EXIT_OSI              18
> > > 
> > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> > > 
> > >  /* For KVM_EXIT_INTERNAL_ERROR */
> > >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > > 
> > > @@ -264,6 +265,13 @@ struct kvm_run {
> > > 
> > >  		struct {
> > >  		
> > >  			__u64 gprs[32];
> > >  		
> > >  		} osi;
> > > 
> > > +		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
> > > +		struct {
> > > +			__u32 dev_id;
> > > +			__u16 type;
> > > +			__u16 entry_idx;
> > > +			__u64 flags;
> > > +		} msix_routing;
> > > 
> > >  		/* Fix the size of the union. */
> > >  		char padding[256];
> > >  	
> > >  	};
> > > 
> > > @@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
> > > 
> > >  #define KVM_CAP_PPC_GET_PVINFO 57
> > >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> > >  #define KVM_CAP_ASYNC_PF 59
> > > 
> > > +#define KVM_CAP_MSIX_MMIO 60
> > > 
> > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > 
> > > @@ -672,6 +681,9 @@ struct kvm_clock_data {
> > > 
> > >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct
> > >  kvm_xen_hvm_config) #define KVM_SET_CLOCK             _IOW(KVMIO, 
> > >  0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK            
> > >  _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > > 
> > > +/* Available with KVM_CAP_MSIX_MMIO */
> > > > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
> > > > +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
> > > 
> > >  /* Available with KVM_CAP_PIT_STATE2 */
> > >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct
> > >  kvm_pit_state2) #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0,
> > >  struct kvm_pit_state2)
> > > 
> > > @@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
> > > 
> > >  	__u16 padding[3];
> > >  
> > >  };
> > > 
> > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > > +
> > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > > +
> > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > +struct kvm_msix_mmio_user {
> > > +	__u32 dev_id;
> > > +	__u16 type;
> > > +	__u16 max_entries_nr;
> > > +	__u64 base_addr;
> > > +	__u64 base_va;
> > > +	__u64 flags;
> > > +	__u64 reserved[4];
> > > +};
> > > +
> > > 
> > >  #endif /* __LINUX_KVM_H */
> > > 
> > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > index 6bb211d..6aaf85e 100644
> > > --- a/include/linux/kvm_host.h
> > > +++ b/include/linux/kvm_host.h
> > > @@ -68,9 +68,16 @@ enum kvm_bus {
> > > 
> > >  	KVM_NR_BUSES
> > >  
> > >  };
> > > 
> > > +#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
> > > 
> > >  struct kvm_io_ext_data {
> > >  
> > >  	int type;
> > >  	union {
> > > 
> > > +		struct {
> > > +			u32 dev_id;
> > > +			u16 type;
> > > +			u16 entry_idx;
> > > +			u64 flags;


Please, document the structure fields.

> > > +		} msix_routing;
> > > 
> > >  		char padding[256];

Note that there's another 2 bytes of padding before
the union as struct will be 8 byte aligned.
So if the purpose of the padding was to ensure
some fixed structure size, it failed in that.

> > >  	
> > >  	};
> > >  
> > >  };
> > > 
> > > @@ -168,6 +175,8 @@ struct kvm_vcpu {
> > > 
> > >  	} async_pf;
> > >  
> > >  #endif
> > > 
> > > +	int userspace_exit_needed;
> > > +
> > > 
> > >  	struct kvm_vcpu_arch arch;
> > >  
> > >  };
> > > 
> > > @@ -241,6 +250,27 @@ struct kvm_memslots {
> > > 
> > >  					KVM_PRIVATE_MEM_SLOTS];
> > >  
> > >  };
> > > 
> > > +#define KVM_MSIX_MMIO_MAX    32
> > > +
> > > +struct kvm_msix_mmio {
> > > +	u32 dev_id;
> > > +	u16 type;
> > > +	u16 max_entries_nr;
> > > +	u64 flags;
> > > +	gpa_t table_base_addr;
> > > +	hva_t table_base_va;
> > > +	gpa_t pba_base_addr;
> > > +	hva_t pba_base_va;
> > > +};
> > > +
> > > +struct kvm_msix_mmio_dev {
> > > +	struct kvm *kvm;
> > > +	struct kvm_io_device table_dev;
> > > +	int mmio_nr;
> > > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > > +	struct mutex lock;
> > > +};
> > > +
> > > 
> > >  struct kvm {
> > >  
> > >  	spinlock_t mmu_lock;
> > >  	raw_spinlock_t requests_lock;
> > > 
> > > @@ -289,6 +319,7 @@ struct kvm {
> > > 
> > >  	long mmu_notifier_count;
> > >  
> > >  #endif
> > >  
> > >  	long tlbs_dirty;
> > > 
> > > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> > > 
> > >  };
> > >  
> > >  /* The guest did something we don't support. */
> > > 
> > > @@ -561,6 +592,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
> > > 
> > >  int kvm_request_irq_source_id(struct kvm *kvm);
> > >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> > > 
> > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > +			int assigned_dev_id, int entry, bool mask);
> > > +
> > > 
> > >  /* For vcpu->arch.iommu_flags */
> > >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > > 
> > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > > index ae72ae6..d1598a6 100644
> > > --- a/virt/kvm/assigned-dev.c
> > > +++ b/virt/kvm/assigned-dev.c
> > > @@ -18,6 +18,7 @@
> > > 
> > >  #include <linux/interrupt.h>
> > >  #include <linux/slab.h>
> > >  #include "irq.h"
> > > 
> > > +#include "msix_mmio.h"
> > > 
> > >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
> > >  list_head *head,
> > >  
> > >  						      int assigned_dev_id)
> > > 
> > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
> > > 
> > >  	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
> > >  
> > >  }
> > > 
> > > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > > +				struct kvm_assigned_dev_kernel *adev)
> > > +{
> > > +	struct kvm_msix_mmio mmio;
> > > +
> > > +	mmio.dev_id = adev->assigned_dev_id;
> > > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > +	kvm_free_msix_mmio(kvm, &mmio);
> > > +}
> > > +
> > > 
> > >  static void kvm_free_assigned_device(struct kvm *kvm,
> > >  
> > >  				     struct kvm_assigned_dev_kernel
> > >  				     *assigned_dev)
> > >  
> > >  {
> > >  
> > >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > > 
> > > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > > +
> > > 
> > >  	__pci_reset_function(assigned_dev->dev);
> > >  	pci_restore_state(assigned_dev->dev);
> > > 
> > > @@ -785,3 +799,33 @@ out:
> > >  	return r;
> > >  
> > >  }
> > > 
> > > +/* The caller should hold kvm->lock */
> > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > +				int assigned_dev_id, int entry, bool mask)
> > > +{
> > > +	int r = -EFAULT;
> > > +	struct kvm_assigned_dev_kernel *adev;
> > > +	int i;
> > > +
> > > +	if (!irqchip_in_kernel(kvm))
> > > +		return r;
> > > +
> > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > +				      assigned_dev_id);
> > > +	if (!adev)
> > > +		goto out;
> > > +
> > > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > > +	for (i = 0; i < adev->entries_nr; i++)
> > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > +			if (mask)
> > > +				disable_irq_nosync(
> > > +					adev->host_msix_entries[i].vector);
> > > +			else
> > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > +			r = 0;
> > > +			break;
> > > +		}
> > > +out:
> > > +	return r;
> > > +}
> > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > index a61f90e..f211e49 100644
> > > --- a/virt/kvm/kvm_main.c
> > > +++ b/virt/kvm/kvm_main.c
> > > @@ -56,6 +56,7 @@
> > > 
> > >  #include "coalesced_mmio.h"
> > >  #include "async_pf.h"
> > > 
> > > +#include "msix_mmio.h"
> > > 
> > >  #define CREATE_TRACE_POINTS
> > >  #include <trace/events/kvm.h>
> > > 
> > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > > 
> > >  	struct mm_struct *mm = kvm->mm;
> > >  	
> > >  	kvm_arch_sync_events(kvm);
> > > 
> > > +	kvm_unregister_msix_mmio_dev(kvm);
> > > 
> > >  	spin_lock(&kvm_lock);
> > >  	list_del(&kvm->vm_list);
> > >  	spin_unlock(&kvm_lock);
> > > 
> > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > > 
> > >  		mutex_unlock(&kvm->lock);
> > >  		break;
> > >  
> > >  #endif
> > > 
> > > +	case KVM_REGISTER_MSIX_MMIO: {
> > > +		struct kvm_msix_mmio_user mmio_user;
> > > +
> > > +		r = -EFAULT;
> > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > +			goto out;
> > > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > > +		break;
> > > +	}
> > > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > > +		struct kvm_msix_mmio_user mmio_user;
> > > +
> > > +		r = -EFAULT;
> > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > +			goto out;
> > > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > > +		break;
> > > +	}
> > > 
> > >  	default:
> > >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> > >  		if (r == -ENOTTY)
> > > 
> > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > 
> > >  		return r;
> > >  	
> > >  	}
> > >  
> > >  #endif
> > > 
> > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > +	if (r < 0) {
> > > +		kvm_put_kvm(kvm);
> > > +		return r;
> > > +	}
> > > +
> > > 
> > >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> > >  	if (r < 0)
> > >  	
> > >  		kvm_put_kvm(kvm);
> > > 
> > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus
> > > *bus)
> > > 
> > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
> > >  
> > >  		     int len, const void *val, struct kvm_io_ext_data *ext_data)
> > >  
> > >  {
> > > 
> > > -	int i;
> > > +	int i, r = -EOPNOTSUPP;
> > > 
> > >  	struct kvm_io_bus *bus;
> > >  	
> > >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > > 
> > > -	for (i = 0; i < bus->dev_count; i++)
> > > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
> > > +	for (i = 0; i < bus->dev_count; i++) {
> > > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
> > > +		if (r == -ENOTSYNC)
> > > +			break;
> > > +		else if (!r)
> > > 
> > >  			return 0;
> > > 
> > > -	return -EOPNOTSUPP;
> > > +	}
> > > +	return r;
> > > 
> > >  }
> > >  
> > >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > > 
> > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > > new file mode 100644
> > > index 0000000..083b15b
> > > --- /dev/null
> > > +++ b/virt/kvm/msix_mmio.c
> > > @@ -0,0 +1,296 @@
> > > +/*
> > > + * MSI-X MMIO emulation
> > > + *
> > > + * Copyright (c) 2010 Intel Corporation
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > + * the COPYING file in the top-level directory.
> > > + *
> > > + * Author:
> > > + *   Sheng Yang <sheng.yang@intel.com>
> > > + */
> > > +
> > > +#include <linux/kvm_host.h>
> > > +#include <linux/kvm.h>
> > > +
> > > +#include "msix_mmio.h"
> > > +#include "iodev.h"
> > > +
> > > > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
> > > > +				int entry, u32 flag)
> > > +{
> > > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > > +				mmio->dev_id, entry, flag);
> > > +	return -EFAULT;
> > > +}
> > > +
> > > +/* Caller must hold dev->lock */
> > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > > +				gpa_t addr, int len)
> > > +{
> > > +	gpa_t start, end;
> > > +	int i, r = -EINVAL;
> > > +
> > > +	for (i = 0; i < dev->mmio_nr; i++) {
> > > +		start = dev->mmio[i].table_base_addr;
> > > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > > +			dev->mmio[i].max_entries_nr;
> > > +		if (addr >= start && addr + len <= end) {
> > > +			r = i;
> > > +			break;
> > > +		}
> > > +	}
> > > +
> > > +	return r;
> > > +}
> > > +
> > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> > > > +				void *val)
> > > +{
> > > +	/*TODO: Add big endian support */
> > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > +	struct kvm_msix_mmio *mmio;
> > > +	int idx, ret = 0, entry, offset, r;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > +	if (idx < 0) {
> > > +		ret = -EOPNOTSUPP;
> > > +		goto out;
> > > +	}
> > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > +		goto out;
> > 
> > addr & len as below?
> > 
> > > +
> > > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > +		goto out;
> > 
> > then this test won't be needed.
> > 
> > > +
> > > +	mmio = &mmio_dev->mmio[idx];
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > +	if (r)
> > > +		goto out;
> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return ret;
> > > +}
> > > +
> > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > > +				int len, const void *val,
> > > +				struct kvm_io_ext_data *ext_data)
> > > +{
> > > +	/*TODO: Add big endian support */
> > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > +	struct kvm_msix_mmio *mmio;
> > > +	int idx, entry, offset, ret = 0, r = 0;
> > > +	gpa_t entry_base;
> > > +	u32 old_ctrl, new_ctrl;
> > > +	unsigned long __user *ctrl_pos;
> > 
> > long? It's 8 bytes on 64 bit.
> > You really want
> > __le32 old_ctrl, new_ctrl;
> > __le32 __user *ctrl_pos;
> 
> __le32 here may give the wrong idea that we support big endian, which is not true. So
> I want to use u32 here, and add a TODO above.

Users don't go looking for __le32 in code.  If you want to make sure
the code is built on Intel only, just check the architecture.

But understanding endianness and making the code correct
upfront is not hard. I really don't understand why
you keep hardcoding LE assumptions in there.
__le32 is simply better: it lets us know that the value
is related to PCI as opposed to a random integer value.

> > 
> > > +
> > > +	mutex_lock(&mmio_dev->kvm->lock);
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > +	if (idx < 0) {
> > > +		ret = -EOPNOTSUPP;
> > > +		goto out;
> > > +	}
> > > +	if (!(len == 4 || len == 8) || addr & (len - 1))
> > 
> > Nice hack. Even a bit nicer
> 
> Thanks Alex for this line. :)
> 
> > 	if ((len != 4 && len != 8) || addr & (len - 1))
> 
> I think it's a personal style difference. These two look the same to me...

One character fewer :) And you use != above. Be consistent or, better,
add an inline helper to avoid duplication.

> > 
> > > +		goto out;
> > > +
> > > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > > +
> > > +	mmio = &mmio_dev->mmio[idx];
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > +	ctrl_pos = (unsigned long __user *)(entry_base +
> > > +			PCI_MSIX_ENTRY_VECTOR_CTRL);
> > 
> > So this is the issue: if you cast a type to unsigned long *
> > compiler can assume that the address is aligned.
> > To prevent problems please add a check that table_base_va
> > is aligned.
> 
> That is already checked when the mmio is registered.

Right, missed it the first time around.

> > 
> > > +
> > > +	if (get_user(old_ctrl, ctrl_pos))
> > > +		goto out;
> > > +
> > > +	/* Don't allow writing to other fields when entry is unmasked */
> > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > +		goto out;
> > > +
> > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > > +		goto out;
> > > +
> > > +	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
> > > +	ext_data->msix_routing.dev_id = mmio->dev_id;
> > > +	ext_data->msix_routing.type = mmio->type;
> > > +	ext_data->msix_routing.entry_idx = entry;
> > > +	ext_data->msix_routing.flags = 0;
> > > +
> > > +	if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) {
> > > +		ret = -ENOTSYNC;
> > > +		goto out;
> > > +	}
> > > +
> > > +	if (get_user(new_ctrl, ctrl_pos))
> > > +		goto out;
> > > +
> > > +	if (old_ctrl == new_ctrl) {
> > > +		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
> > > +			ret = -ENOTSYNC;
> > > +		goto out;
> > > +	}
> > > +	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
> > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
> > > +				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> > > +	if (r)
> > > +		ret = -ENOTSYNC;
> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	mutex_unlock(&mmio_dev->kvm->lock);
> > > +	return ret;
> > > +}
> > > +
> > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > +	.read     = msix_table_mmio_read,
> > > +	.write    = msix_table_mmio_write,
> > > +};
> > > +
> > > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > > +{
> > > +	int ret;
> > > +
> > > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
> > > +	mutex_init(&kvm->msix_mmio_dev.lock);
> > > +	kvm->msix_mmio_dev.kvm = kvm;
> > > +	mutex_lock(&kvm->slots_lock);
> > > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > > +				      &kvm->msix_mmio_dev.table_dev);
> > > +	mutex_unlock(&kvm->slots_lock);
> > > +	return ret;
> > > +}
> > > +
> > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > > +{
> > > +	int ret;
> > > +
> > > +	mutex_lock(&kvm->slots_lock);
> > > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > > +				      &kvm->msix_mmio_dev.table_dev);
> > > +	mutex_unlock(&kvm->slots_lock);
> > > +	return ret;
> > > +}
> > > +
> > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > +	struct kvm_msix_mmio *mmio = NULL;
> > > +	int r = 0, i;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > +			mmio = &mmio_dev->mmio[i];
> > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > > +				r = -EINVAL;
> > > +				goto out;
> > > +			}
> > > +			break;
> > > +		}
> > > +	}
> > > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +	/* All reserved currently */
> > > +	if (mmio_user->flags) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +
> > > +	/* Check alignment and accessibility */
> > > +	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
> > > +	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
> > 
> > You also should check that base_va and friends fit in a pointer
> > for 32 bit architectures. Same for other va values.
> 
> OK
> 
> --
> regards
> Yang, Sheng
> 
> > 
> > > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > > +		r = -EINVAL;
> > > +		goto out;
> > > +	}
> > > +	if (!mmio) {
> > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > +			r = -ENOSPC;
> > > +			goto out;
> > > +		}
> > > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > > +		mmio_dev->mmio_nr++;
> > > +	}
> > > +
> > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > +	mmio->dev_id = mmio_user->dev_id;
> > > +	mmio->flags = mmio_user->flags;
> > > +
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > +		mmio->table_base_va = mmio_user->base_va;
> > > +	}
> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return r;
> > > +}
> > > +
> > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > +	int r = -EINVAL, i, j;
> > > +
> > > +	if (!mmio)
> > > +		return 0;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > > +		    mmio_dev->mmio[i].type == mmio->type) {
> > > +			r = 0;
> > > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > > +			mmio_dev->mmio_nr--;
> > > +			break;
> > > +		}
> > > +	}
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return r;
> > > +}
> > > +
> > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > +{
> > > +	struct kvm_msix_mmio mmio;
> > > +
> > > +	mmio.dev_id = mmio_user->dev_id;
> > > +	mmio.type = mmio_user->type;
> > > +
> > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > +}
> > > +
> > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > > new file mode 100644
> > > index 0000000..01b6587
> > > --- /dev/null
> > > +++ b/virt/kvm/msix_mmio.h
> > > @@ -0,0 +1,25 @@
> > > +#ifndef __KVM_MSIX_MMIO_H__
> > > +#define __KVM_MSIX_MMIO_H__
> > > +/*
> > > + * MSI-X MMIO emulation
> > > + *
> > > + * Copyright (c) 2010 Intel Corporation
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > + * the COPYING file in the top-level directory.
> > > + *
> > > + * Author:
> > > + *   Sheng Yang <sheng.yang@intel.com>
> > > + */
> > > +
> > > +#include <linux/pci.h>
> > > +
> > > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > +				    struct kvm_msix_mmio_user *mmio_user);
> > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > +				      struct kvm_msix_mmio_user *mmio_user);
> > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio
> > > *mmio_user); +
> > > +#endif

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler
  2011-02-25  8:12       ` Michael S. Tsirkin
@ 2011-02-28  5:13         ` Sheng Yang
  0 siblings, 0 replies; 21+ messages in thread
From: Sheng Yang @ 2011-02-28  5:13 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Friday 25 February 2011 16:12:30 Michael S. Tsirkin wrote:
> On Fri, Feb 25, 2011 at 11:23:30AM +0800, Sheng Yang wrote:
> > On Thursday 24 February 2011 18:22:19 Michael S. Tsirkin wrote:
> > > On Thu, Feb 24, 2011 at 05:51:03PM +0800, Sheng Yang wrote:
> > > > Add a new parameter to IO writing handler, so that we can transfer
> > > > information from IO handler to caller.
> > > > 
> > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > > ---
> > > > 
> > > >  arch/x86/kvm/i8254.c      |    6 ++++--
> > > >  arch/x86/kvm/i8259.c      |    3 ++-
> > > >  arch/x86/kvm/lapic.c      |    3 ++-
> > > >  arch/x86/kvm/x86.c        |   13 ++++++++-----
> > > >  include/linux/kvm_host.h  |   12 ++++++++++--
> > > >  virt/kvm/coalesced_mmio.c |    3 ++-
> > > >  virt/kvm/eventfd.c        |    2 +-
> > > >  virt/kvm/ioapic.c         |    2 +-
> > > >  virt/kvm/iodev.h          |    6 ++++--
> > > >  virt/kvm/kvm_main.c       |    4 ++--
> > > >  10 files changed, 36 insertions(+), 18 deletions(-)
> > > > 
> > > > diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
> > > > index efad723..bd8f0c5 100644
> > > > --- a/arch/x86/kvm/i8254.c
> > > > +++ b/arch/x86/kvm/i8254.c
> > > > @@ -439,7 +439,8 @@ static inline int pit_in_range(gpa_t addr)
> > > > 
> > > >  }
> > > >  
> > > >  static int pit_ioport_write(struct kvm_io_device *this,
> > > > 
> > > > -			    gpa_t addr, int len, const void *data)
> > > > +			    gpa_t addr, int len, const void *data,
> > > > +			    struct kvm_io_ext_data *ext_data)
> > > > 
> > > >  {
> > > >  
> > > >  	struct kvm_pit *pit = dev_to_pit(this);
> > > >  	struct kvm_kpit_state *pit_state = &pit->pit_state;
> > > > 
> > > > @@ -585,7 +586,8 @@ static int pit_ioport_read(struct kvm_io_device
> > > > *this,
> > > > 
> > > >  }
> > > >  
> > > >  static int speaker_ioport_write(struct kvm_io_device *this,
> > > > 
> > > > -				gpa_t addr, int len, const void *data)
> > > > +				gpa_t addr, int len, const void *data,
> > > > +				struct kvm_io_ext_data *ext_data)
> > > > 
> > > >  {
> > > >  
> > > >  	struct kvm_pit *pit = speaker_to_pit(this);
> > > >  	struct kvm_kpit_state *pit_state = &pit->pit_state;
> > > > 
> > > > diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
> > > > index 3cece05..96b1070 100644
> > > > --- a/arch/x86/kvm/i8259.c
> > > > +++ b/arch/x86/kvm/i8259.c
> > > > @@ -480,7 +480,8 @@ static inline struct kvm_pic *to_pic(struct
> > > > kvm_io_device *dev)
> > > > 
> > > >  }
> > > >  
> > > >  static int picdev_write(struct kvm_io_device *this,
> > > > 
> > > > -			 gpa_t addr, int len, const void *val)
> > > > +			 gpa_t addr, int len, const void *val,
> > > > +			 struct kvm_io_ext_data *ext_data)
> > > > 
> > > >  {
> > > >  
> > > >  	struct kvm_pic *s = to_pic(this);
> > > >  	unsigned char data = *(unsigned char *)val;
> > > > 
> > > > diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> > > > index 93cf9d0..f413e9c 100644
> > > > --- a/arch/x86/kvm/lapic.c
> > > > +++ b/arch/x86/kvm/lapic.c
> > > > @@ -836,7 +836,8 @@ static int apic_reg_write(struct kvm_lapic *apic,
> > > > u32 reg, u32 val)
> > > > 
> > > >  }
> > > >  
> > > >  static int apic_mmio_write(struct kvm_io_device *this,
> > > > 
> > > > -			    gpa_t address, int len, const void *data)
> > > > +			    gpa_t address, int len, const void *data,
> > > > +			    struct kvm_io_ext_data *ext_data)
> > > > 
> > > >  {
> > > >  
> > > >  	struct kvm_lapic *apic = to_lapic(this);
> > > >  	unsigned int offset = address - apic->base_address;
> > > > 
> > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > index fa708c9..21b84e2 100644
> > > > --- a/arch/x86/kvm/x86.c
> > > > +++ b/arch/x86/kvm/x86.c
> > > > @@ -3571,13 +3571,14 @@ static void kvm_init_msr_list(void)
> > > > 
> > > >  }
> > > >  
> > > >  static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int
> > > >  len,
> > > > 
> > > > -			   const void *v)
> > > > +			   const void *v, struct kvm_io_ext_data *ext_data)
> > > > 
> > > >  {
> > > >  
> > > >  	if (vcpu->arch.apic &&
> > > > 
> > > > -	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
> > > > +	    !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v,
> > > > ext_data))
> > > > 
> > > >  		return 0;
> > > > 
> > > > -	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
> > > > +	return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS,
> > > > +				addr, len, v, ext_data);
> > > > 
> > > >  }
> > > >  
> > > >  static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int
> > > >  len, void *v)
> > > > 
> > > > @@ -3807,6 +3808,7 @@ static int
> > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > 
> > > >  					   struct kvm_vcpu *vcpu)
> > > >  
> > > >  {
> > > >  
> > > >  	gpa_t                 gpa;
> > > > 
> > > > +	struct kvm_io_ext_data ext_data;
> > > > 
> > > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > > 
> > > > @@ -3825,7 +3827,7 @@ mmio:
> > > >  	/*
> > > >  	
> > > >  	 * Is this MMIO handled locally?
> > > >  	 */
> > > > 
> > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > +	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> > > > 
> > > >  		return X86EMUL_CONTINUE;
> > > >  	
> > > >  	vcpu->mmio_needed = 1;
> > > > 
> > > > @@ -3940,6 +3942,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu,
> > > > void *pd)
> > > > 
> > > >  {
> > > >  
> > > >  	/* TODO: String I/O for in kernel device */
> > > >  	int r;
> > > > 
> > > > +	struct kvm_io_ext_data ext_data;
> > > > 
> > > >  	if (vcpu->arch.pio.in)
> > > >  	
> > > >  		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu-
>arch.pio.port,
> > > > 
> > > > @@ -3947,7 +3950,7 @@ static int kernel_pio(struct kvm_vcpu *vcpu,
> > > > void *pd)
> > > > 
> > > >  	else
> > > >  	
> > > >  		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
> > > >  		
> > > >  				     vcpu->arch.pio.port, vcpu->arch.pio.size,
> > > > 
> > > > -				     pd);
> > > > +				     pd, &ext_data);
> > > > 
> > > >  	return r;
> > > >  
> > > >  }
> > > > 
> > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > index 7d313e0..6bb211d 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -68,8 +68,15 @@ enum kvm_bus {
> > > > 
> > > >  	KVM_NR_BUSES
> > > >  
> > > >  };
> > > > 
> > > > +struct kvm_io_ext_data {
> > > > +	int type;
> > > 
> > > What values does this get? Please add documentation in comments.
> > 
> > See the next patch.
> > 
> > > > +	union {
> > > > +		char padding[256];
> > > > +	};
> > > 
> > > So the structure size is 260 bytes?
> > > What's the point of the padding?
> > 
> > Reserved spaces. Also used in the next patch.
> 
> I was unable to find anything related to padding in the next patch.
> This is an internal API, isn't it? So why reserve space? Further,
> 256 bytes is quite a lot to reserve. Also,
> making the total size a power of 2 would seem to make more sense if we
> do need to reserve any space.

Yes, you're right. Using the union alone should be fine. I will update it.

--
regards
Yang, Sheng

> 
> > --
> > regards
> > Yang, Sheng
> > 
> > > > +};
> > > > +
> > > > 
> > > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t
> > > >  addr,
> > > > 
> > > > -		     int len, const void *val);
> > > > +		     int len, const void *val, struct kvm_io_ext_data *data);
> > > > 
> > > >  int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t
> > > >  addr, int len,
> > > >  
> > > >  		    void *val);
> > > >  
> > > >  int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
> > > > 
> > > > @@ -113,7 +120,8 @@ struct kvm_io_device_ops {
> > > > 
> > > >  	int (*write)(struct kvm_io_device *this,
> > > >  	
> > > >  		     gpa_t addr,
> > > >  		     int len,
> > > > 
> > > > -		     const void *val);
> > > > +		     const void *val,
> > > > +		     struct kvm_io_ext_data *data);
> > > > 
> > > >  	void (*destructor)(struct kvm_io_device *this);
> > > >  
> > > >  };
> > > > 
> > > > diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
> > > > index fc84875..37b254c 100644
> > > > --- a/virt/kvm/coalesced_mmio.c
> > > > +++ b/virt/kvm/coalesced_mmio.c
> > > > @@ -59,7 +59,8 @@ static int coalesced_mmio_in_range(struct
> > > > kvm_coalesced_mmio_dev *dev,
> > > > 
> > > >  }
> > > >  
> > > >  static int coalesced_mmio_write(struct kvm_io_device *this,
> > > > 
> > > > -				gpa_t addr, int len, const void *val)
> > > > +				gpa_t addr, int len, const void *val,
> > > > +				struct kvm_io_ext_data *ext_data)
> > > > 
> > > >  {
> > > >  
> > > >  	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
> > > >  	struct kvm_coalesced_mmio_ring *ring =
> > > >  	dev->kvm->coalesced_mmio_ring;
> > > > 
> > > > diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> > > > index 2ca4535..8edd757 100644
> > > > --- a/virt/kvm/eventfd.c
> > > > +++ b/virt/kvm/eventfd.c
> > > > @@ -483,7 +483,7 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t
> > > > addr, int len, const void *val)
> > > > 
> > > >  /* MMIO/PIO writes trigger an event if the addr/val match */
> > > >  static int
> > > >  ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
> > > > 
> > > > -		const void *val)
> > > > +		const void *val, struct kvm_io_ext_data *ext_data)
> > > > 
> > > >  {
> > > >  
> > > >  	struct _ioeventfd *p = to_ioeventfd(this);
> > > > 
> > > > diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
> > > > index 0b9df83..6a027ef 100644
> > > > --- a/virt/kvm/ioapic.c
> > > > +++ b/virt/kvm/ioapic.c
> > > > @@ -321,7 +321,7 @@ static int ioapic_mmio_read(struct kvm_io_device
> > > > *this, gpa_t addr, int len,
> > > > 
> > > >  }
> > > >  
> > > >  static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > > >  int len,
> > > > 
> > > > -			     const void *val)
> > > > +			     const void *val, struct kvm_io_ext_data *ext_data)
> > > > 
> > > >  {
> > > >  
> > > >  	struct kvm_ioapic *ioapic = to_ioapic(this);
> > > >  	u32 data;
> > > > 
> > > > diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
> > > > index d1f5651..340ab79 100644
> > > > --- a/virt/kvm/iodev.h
> > > > +++ b/virt/kvm/iodev.h
> > > > @@ -33,9 +33,11 @@ static inline int kvm_iodevice_read(struct
> > > > kvm_io_device *dev,
> > > > 
> > > >  }
> > > >  
> > > >  static inline int kvm_iodevice_write(struct kvm_io_device *dev,
> > > > 
> > > > -				     gpa_t addr, int l, const void *v)
> > > > +				     gpa_t addr, int l, const void *v,
> > > > +				     struct kvm_io_ext_data *data)
> > > > 
> > > >  {
> > > > 
> > > > -	return dev->ops->write ? dev->ops->write(dev, addr, l, v) :
> > > > -EOPNOTSUPP; +	return dev->ops->write ?
> > > > +		dev->ops->write(dev, addr, l, v, data) : -EOPNOTSUPP;
> > > > 
> > > >  }
> > > >  
> > > >  static inline void kvm_iodevice_destructor(struct kvm_io_device
> > > >  *dev)
> > > > 
> > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > index b1b6cbb..a61f90e 100644
> > > > --- a/virt/kvm/kvm_main.c
> > > > +++ b/virt/kvm/kvm_main.c
> > > > @@ -2221,14 +2221,14 @@ static void kvm_io_bus_destroy(struct
> > > > kvm_io_bus *bus)
> > > > 
> > > >  /* kvm_io_bus_write - called under kvm->slots_lock */
> > > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t
> > > >  addr,
> > > > 
> > > > -		     int len, const void *val)
> > > > +		     int len, const void *val, struct kvm_io_ext_data *ext_data)
> > > > 
> > > >  {
> > > >  
> > > >  	int i;
> > > >  	struct kvm_io_bus *bus;
> > > >  	
> > > >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > > >  	for (i = 0; i < bus->dev_count; i++)
> > > > 
> > > > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
> > > > +		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data))
> > > > 
> > > >  			return 0;
> > > >  	
> > > >  	return -EOPNOTSUPP;
> > > >  
> > > >  }

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-02-25  8:29       ` Michael S. Tsirkin
@ 2011-02-28  5:18         ` Sheng Yang
  2011-03-01 20:18         ` Marcelo Tosatti
  1 sibling, 0 replies; 21+ messages in thread
From: Sheng Yang @ 2011-02-28  5:18 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, Alex Williamson, kvm

On Friday 25 February 2011 16:29:38 Michael S. Tsirkin wrote:
> On Fri, Feb 25, 2011 at 02:28:02PM +0800, Sheng Yang wrote:
> > On Thursday 24 February 2011 18:45:08 Michael S. Tsirkin wrote:
> > > On Thu, Feb 24, 2011 at 05:51:04PM +0800, Sheng Yang wrote:
> > > > Then we can support mask bit operation of assigned devices now.
> > > > 
> > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > 
> > > Doesn't look like all comments got addressed.
> > > E.g. gpa_t entry_base is still there and in reality
> > > you said it's a host virtual address so
> > > should be void __user *;
> > 
> > Would update it.
> > 
> > > And ENOTSYNC meaning 'MSIX' is pretty hacky.
> > 
> > I'd like to discuss it later. We may need some work on all MMIO handling
> > side to make it more straightforward. But I don't want to bundle it with
> > this one...
> 
> It's not PCI related so I'll defer to Avi/Marcelo on this.
> Are you guys happy with the ENOTSYNC meaning 'MSIX'
> and userspace_exit_needed hacks in this code?
> 
> > > > ---
> > > > 
> > > >  arch/x86/include/asm/kvm_host.h |    1 +
> > > >  arch/x86/kvm/Makefile           |    2 +-
> > > >  arch/x86/kvm/mmu.c              |    2 +
> > > >  arch/x86/kvm/x86.c              |   40 ++++-
> > > >  include/linux/kvm.h             |   28 ++++
> > > >  include/linux/kvm_host.h        |   34 +++++
> > > >  virt/kvm/assigned-dev.c         |   44 ++++++
> > > >  virt/kvm/kvm_main.c             |   38 +++++-
> > > >  virt/kvm/msix_mmio.c            |  296
> > > >  +++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h
> > > >  
> > > >  |   25 ++++
> > > >  
> > > >  10 files changed, 497 insertions(+), 13 deletions(-)
> > > >  create mode 100644 virt/kvm/msix_mmio.c
> > > >  create mode 100644 virt/kvm/msix_mmio.h
> > > > 
> > > > diff --git a/arch/x86/include/asm/kvm_host.h
> > > > b/arch/x86/include/asm/kvm_host.h index aa75f21..4a390a4 100644
> > > > --- a/arch/x86/include/asm/kvm_host.h
> > > > +++ b/arch/x86/include/asm/kvm_host.h
> > > > @@ -635,6 +635,7 @@ enum emulation_result {
> > > > 
> > > >  	EMULATE_DONE,       /* no further processing */
> > > >  	EMULATE_DO_MMIO,      /* kvm_run filled with mmio request */
> > > >  	EMULATE_FAIL,         /* can't emulate this instruction */
> > > > 
> > > > +	EMULATE_USERSPACE_EXIT, /* we need exit to userspace */
> > > > 
> > > >  };
> > > >  
> > > >  #define EMULTYPE_NO_DECODE	    (1 << 0)
> > > > 
> > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
> > > > index f15501f..3a0d851 100644
> > > > --- a/arch/x86/kvm/Makefile
> > > > +++ b/arch/x86/kvm/Makefile
> > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
> > > > 
> > > >  kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o 
\
> > > >  
> > > >  				coalesced_mmio.o irq_comm.o eventfd.o \
> > > > 
> > > > -				assigned-dev.o)
> > > > +				assigned-dev.o msix_mmio.o)
> > > > 
> > > >  kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
> > > >  kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/,
> > > >  async_pf.o)
> > > > 
> > > > diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> > > > index 9cafbb4..912dca4 100644
> > > > --- a/arch/x86/kvm/mmu.c
> > > > +++ b/arch/x86/kvm/mmu.c
> > > > @@ -3358,6 +3358,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu,
> > > > gva_t cr2, u32 error_code,
> > > > 
> > > >  	case EMULATE_DO_MMIO:
> > > >  		++vcpu->stat.mmio_exits;
> > > >  		/* fall through */
> > > > 
> > > > +	case EMULATE_USERSPACE_EXIT:
> > > > +		/* fall through */
> > > > 
> > > >  	case EMULATE_FAIL:
> > > >  		return 0;
> > > >  	
> > > >  	default:
> > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > > index 21b84e2..87308eb 100644
> > > > --- a/arch/x86/kvm/x86.c
> > > > +++ b/arch/x86/kvm/x86.c
> > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> > > > 
> > > >  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
> > > >  	case KVM_CAP_XSAVE:
> > > > 
> > > >  	case KVM_CAP_ASYNC_PF:
> > > > +	case KVM_CAP_MSIX_MMIO:
> > > >  		r = 1;
> > > >  		break;
> > > >  	
> > > >  	case KVM_CAP_COALESCED_MMIO:
> > > > @@ -3809,6 +3810,7 @@ static int
> > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > 
> > > >  {
> > > >  
> > > >  	gpa_t                 gpa;
> > > >  	struct kvm_io_ext_data ext_data;
> > > > 
> > > > +	int r;
> > > > 
> > > >  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
> > > > 
> > > > @@ -3824,18 +3826,32 @@ static int
> > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > 
> > > >  mmio:
> > > >  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > 
> > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data);
> > > > 
> > > >  	/*
> > > >  	
> > > >  	 * Is this MMIO handled locally?
> > > >  	 */
> > > > 
> > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val, &ext_data))
> > > > +	if (!r)
> > > > 
> > > >  		return X86EMUL_CONTINUE;
> > > > 
> > > > -	vcpu->mmio_needed = 1;
> > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > -	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > > > -	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > > > -	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > > -	memcpy(vcpu->run->mmio.data, val, bytes);
> > > > +	if (r == -ENOTSYNC) {
> > > > +		vcpu->userspace_exit_needed = 1;
> > > > +		vcpu->run->exit_reason = KVM_EXIT_MSIX_ROUTING_UPDATE;
> > > > +		vcpu->run->msix_routing.dev_id =
> > > > +			ext_data.msix_routing.dev_id;
> > > > +		vcpu->run->msix_routing.type =
> > > > +			ext_data.msix_routing.type;
> > > > +		vcpu->run->msix_routing.entry_idx =
> > > > +			ext_data.msix_routing.entry_idx;
> > > > +		vcpu->run->msix_routing.flags =
> > > > +			ext_data.msix_routing.flags;
> > > > +	} else  {
> > > > +		vcpu->mmio_needed = 1;
> > > > +		vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > +		vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
> > > > +		vcpu->run->mmio.len = vcpu->mmio_size = bytes;
> > > > +		vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
> > > > +		memcpy(vcpu->run->mmio.data, val, bytes);
> > > > +	}
> > > > 
> > > >  	return X86EMUL_CONTINUE;
> > > >  
> > > >  }
> > > > 
> > > > @@ -4469,6 +4485,8 @@ done:
> > > >  		r = EMULATE_DO_MMIO;
> > > >  	
> > > >  	} else if (r == EMULATION_RESTART)
> > > >  	
> > > >  		goto restart;
> > > > 
> > > > +	else if (vcpu->userspace_exit_needed)
> > > > +		r = EMULATE_USERSPACE_EXIT;
> > > > 
> > > >  	else
> > > >  	
> > > >  		r = EMULATE_DONE;
> > > > 
> > > > @@ -5397,12 +5415,18 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu
> > > > *vcpu, struct kvm_run *kvm_run)
> > > > 
> > > >  		}
> > > >  	
> > > >  	}
> > > > 
> > > > -	if (vcpu->arch.pio.count || vcpu->mmio_needed) {
> > > > +	if (vcpu->arch.pio.count || vcpu->mmio_needed ||
> > > > +			vcpu->userspace_exit_needed) {
> > > > 
> > > >  		if (vcpu->mmio_needed) {
> > > >  		
> > > >  			memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
> > > >  			vcpu->mmio_read_completed = 1;
> > > >  			vcpu->mmio_needed = 0;
> > > >  		
> > > >  		}
> > > > 
> > > > +		if (vcpu->userspace_exit_needed) {
> > > > +			vcpu->userspace_exit_needed = 0;
> > > > +			r = 0;
> > > > +			goto out;
> > > > +		}
> > > > 
> > > >  		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
> > > >  		r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
> > > >  		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
> > > > 
> > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> > > > index ea2dc1a..4393e4e 100644
> > > > --- a/include/linux/kvm.h
> > > > +++ b/include/linux/kvm.h
> > > > @@ -161,6 +161,7 @@ struct kvm_pit_config {
> > > > 
> > > >  #define KVM_EXIT_NMI              16
> > > >  #define KVM_EXIT_INTERNAL_ERROR   17
> > > >  #define KVM_EXIT_OSI              18
> > > > 
> > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
> > > > 
> > > >  /* For KVM_EXIT_INTERNAL_ERROR */
> > > >  #define KVM_INTERNAL_ERROR_EMULATION 1
> > > > 
> > > > @@ -264,6 +265,13 @@ struct kvm_run {
> > > > 
> > > >  		struct {
> > > >  		
> > > >  			__u64 gprs[32];
> > > >  		
> > > >  		} osi;
> > > > 
> > > > +		/* KVM_EXIT_MSIX_ROUTING_UPDATE*/
> > > > +		struct {
> > > > +			__u32 dev_id;
> > > > +			__u16 type;
> > > > +			__u16 entry_idx;
> > > > +			__u64 flags;
> > > > +		} msix_routing;
> > > > 
> > > >  		/* Fix the size of the union. */
> > > >  		char padding[256];
> > > >  	
> > > >  	};
> > > > 
> > > > @@ -541,6 +549,7 @@ struct kvm_ppc_pvinfo {
> > > > 
> > > >  #define KVM_CAP_PPC_GET_PVINFO 57
> > > >  #define KVM_CAP_PPC_IRQ_LEVEL 58
> > > >  #define KVM_CAP_ASYNC_PF 59
> > > > 
> > > > +#define KVM_CAP_MSIX_MMIO 60
> > > > 
> > > >  #ifdef KVM_CAP_IRQ_ROUTING
> > > > 
> > > > @@ -672,6 +681,9 @@ struct kvm_clock_data {
> > > > 
> > > >  #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct
> > > >  kvm_xen_hvm_config) #define KVM_SET_CLOCK             _IOW(KVMIO,
> > > >  0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK
> > > >  _IOR(KVMIO,  0x7c, struct kvm_clock_data)
> > > > 
> > > > +/* Available with KVM_CAP_MSIX_MMIO */
> > > > +#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct
> > > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,
> > > > 0x7e, struct kvm_msix_mmio_user)
> > > > 
> > > >  /* Available with KVM_CAP_PIT_STATE2 */
> > > >  #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct
> > > >  kvm_pit_state2) #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0,
> > > >  struct kvm_pit_state2)
> > > > 
> > > > @@ -795,4 +807,20 @@ struct kvm_assigned_msix_entry {
> > > > 
> > > >  	__u16 padding[3];
> > > >  
> > > >  };
> > > > 
> > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
> > > > +
> > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
> > > > +
> > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > +struct kvm_msix_mmio_user {
> > > > +	__u32 dev_id;
> > > > +	__u16 type;
> > > > +	__u16 max_entries_nr;
> > > > +	__u64 base_addr;
> > > > +	__u64 base_va;
> > > > +	__u64 flags;
> > > > +	__u64 reserved[4];
> > > > +};
> > > > +
> > > > 
> > > >  #endif /* __LINUX_KVM_H */
> > > > 
> > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > index 6bb211d..6aaf85e 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -68,9 +68,16 @@ enum kvm_bus {
> > > > 
> > > >  	KVM_NR_BUSES
> > > >  
> > > >  };
> > > > 
> > > > +#define KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING   1
> > > > 
> > > >  struct kvm_io_ext_data {
> > > >  
> > > >  	int type;
> > > >  	union {
> > > > 
> > > > +		struct {
> > > > +			u32 dev_id;
> > > > +			u16 type;
> > > > +			u16 entry_idx;
> > > > +			u64 flags;
> 
> Please, document the structure fields.
> 
> > > > +		} msix_routing;
> > > > 
> > > >  		char padding[256];
> 
> Note that there's another 2 bytes of padding before
> the union as struct will be 8 byte aligned.
> So if the purpose of the padding was to ensure
> some fixed structure size, it failed in that.
> 
> > > >  	};
> > > >  
> > > >  };
> > > > 
> > > > @@ -168,6 +175,8 @@ struct kvm_vcpu {
> > > > 
> > > >  	} async_pf;
> > > >  
> > > >  #endif
> > > > 
> > > > +	int userspace_exit_needed;
> > > > +
> > > > 
> > > >  	struct kvm_vcpu_arch arch;
> > > >  
> > > >  };
> > > > 
> > > > @@ -241,6 +250,27 @@ struct kvm_memslots {
> > > > 
> > > >  					KVM_PRIVATE_MEM_SLOTS];
> > > >  
> > > >  };
> > > > 
> > > > +#define KVM_MSIX_MMIO_MAX    32
> > > > +
> > > > +struct kvm_msix_mmio {
> > > > +	u32 dev_id;
> > > > +	u16 type;
> > > > +	u16 max_entries_nr;
> > > > +	u64 flags;
> > > > +	gpa_t table_base_addr;
> > > > +	hva_t table_base_va;
> > > > +	gpa_t pba_base_addr;
> > > > +	hva_t pba_base_va;
> > > > +};
> > > > +
> > > > +struct kvm_msix_mmio_dev {
> > > > +	struct kvm *kvm;
> > > > +	struct kvm_io_device table_dev;
> > > > +	int mmio_nr;
> > > > +	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
> > > > +	struct mutex lock;
> > > > +};
> > > > +
> > > > 
> > > >  struct kvm {
> > > >  
> > > >  	spinlock_t mmu_lock;
> > > >  	raw_spinlock_t requests_lock;
> > > > 
> > > > @@ -289,6 +319,7 @@ struct kvm {
> > > > 
> > > >  	long mmu_notifier_count;
> > > >  
> > > >  #endif
> > > >  
> > > >  	long tlbs_dirty;
> > > > 
> > > > +	struct kvm_msix_mmio_dev msix_mmio_dev;
> > > > 
> > > >  };
> > > >  
> > > >  /* The guest did something we don't support. */
> > > > 
> > > > @@ -561,6 +592,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm
> > > > *kvm,
> > > > 
> > > >  int kvm_request_irq_source_id(struct kvm *kvm);
> > > >  void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
> > > > 
> > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > +			int assigned_dev_id, int entry, bool mask);
> > > > +
> > > > 
> > > >  /* For vcpu->arch.iommu_flags */
> > > >  #define KVM_IOMMU_CACHE_COHERENCY	0x1
> > > > 
> > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
> > > > index ae72ae6..d1598a6 100644
> > > > --- a/virt/kvm/assigned-dev.c
> > > > +++ b/virt/kvm/assigned-dev.c
> > > > @@ -18,6 +18,7 @@
> > > > 
> > > >  #include <linux/interrupt.h>
> > > >  #include <linux/slab.h>
> > > >  #include "irq.h"
> > > > 
> > > > +#include "msix_mmio.h"
> > > > 
> > > >  static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct
> > > >  list_head *head,
> > > >  
> > > >  						      int assigned_dev_id)
> > > > 
> > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm
> > > > *kvm,
> > > > 
> > > >  	kvm_deassign_irq(kvm, assigned_dev,
> > > >  	assigned_dev->irq_requested_type);
> > > >  
> > > >  }
> > > > 
> > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm,
> > > > +				struct kvm_assigned_dev_kernel *adev)
> > > > +{
> > > > +	struct kvm_msix_mmio mmio;
> > > > +
> > > > +	mmio.dev_id = adev->assigned_dev_id;
> > > > +	mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
> > > > +		    KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > +	kvm_free_msix_mmio(kvm, &mmio);
> > > > +}
> > > > +
> > > > 
> > > >  static void kvm_free_assigned_device(struct kvm *kvm,
> > > >  
> > > >  				     struct kvm_assigned_dev_kernel
> > > >  				     *assigned_dev)
> > > >  
> > > >  {
> > > >  
> > > >  	kvm_free_assigned_irq(kvm, assigned_dev);
> > > > 
> > > > +	assigned_device_free_msix_mmio(kvm, assigned_dev);
> > > > +
> > > > 
> > > >  	__pci_reset_function(assigned_dev->dev);
> > > >  	pci_restore_state(assigned_dev->dev);
> > > > 
> > > > @@ -785,3 +799,33 @@ out:
> > > >  	return r;
> > > >  
> > > >  }
> > > > 
> > > > +/* The caller should hold kvm->lock */
> > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > +				int assigned_dev_id, int entry, bool mask)
> > > > +{
> > > > +	int r = -EFAULT;
> > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > +	int i;
> > > > +
> > > > +	if (!irqchip_in_kernel(kvm))
> > > > +		return r;
> > > > +
> > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > +				      assigned_dev_id);
> > > > +	if (!adev)
> > > > +		goto out;
> > > > +
> > > > +	/* For non-MSIX enabled devices, entries_nr == 0 */
> > > > +	for (i = 0; i < adev->entries_nr; i++)
> > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > +			if (mask)
> > > > +				disable_irq_nosync(
> > > > +					adev->host_msix_entries[i].vector);
> > > > +			else
> > > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > > +			r = 0;
> > > > +			break;
> > > > +		}
> > > > +out:
> > > > +	return r;
> > > > +}
> > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > > > index a61f90e..f211e49 100644
> > > > --- a/virt/kvm/kvm_main.c
> > > > +++ b/virt/kvm/kvm_main.c
> > > > @@ -56,6 +56,7 @@
> > > > 
> > > >  #include "coalesced_mmio.h"
> > > >  #include "async_pf.h"
> > > > 
> > > > +#include "msix_mmio.h"
> > > > 
> > > >  #define CREATE_TRACE_POINTS
> > > >  #include <trace/events/kvm.h>
> > > > 
> > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
> > > > 
> > > >  	struct mm_struct *mm = kvm->mm;
> > > >  	
> > > >  	kvm_arch_sync_events(kvm);
> > > > 
> > > > +	kvm_unregister_msix_mmio_dev(kvm);
> > > > 
> > > >  	spin_lock(&kvm_lock);
> > > >  	list_del(&kvm->vm_list);
> > > >  	spin_unlock(&kvm_lock);
> > > > 
> > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
> > > > 
> > > >  		mutex_unlock(&kvm->lock);
> > > >  		break;
> > > >  
> > > >  #endif
> > > > 
> > > > +	case KVM_REGISTER_MSIX_MMIO: {
> > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > +
> > > > +		r = -EFAULT;
> > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > +			goto out;
> > > > +		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
> > > > +		break;
> > > > +	}
> > > > +	case KVM_UNREGISTER_MSIX_MMIO: {
> > > > +		struct kvm_msix_mmio_user mmio_user;
> > > > +
> > > > +		r = -EFAULT;
> > > > +		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
> > > > +			goto out;
> > > > +		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
> > > > +		break;
> > > > +	}
> > > > 
> > > >  	default:
> > > >  		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> > > >  		if (r == -ENOTTY)
> > > > 
> > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > > 
> > > >  		return r;
> > > >  	
> > > >  	}
> > > >  
> > > >  #endif
> > > > 
> > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > +	if (r < 0) {
> > > > +		kvm_put_kvm(kvm);
> > > > +		return r;
> > > > +	}
> > > > +
> > > > 
> > > >  	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
> > > >  	if (r < 0)
> > > >  	
> > > >  		kvm_put_kvm(kvm);
> > > > 
> > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct
> > > > kvm_io_bus *bus)
> > > > 
> > > >  int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t
> > > >  addr,
> > > >  
> > > >  		     int len, const void *val, struct kvm_io_ext_data 
*ext_data)
> > > >  
> > > >  {
> > > > 
> > > > -	int i;
> > > > +	int i, r = -EOPNOTSUPP;
> > > > 
> > > >  	struct kvm_io_bus *bus;
> > > >  	
> > > >  	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
> > > > 
> > > > -	for (i = 0; i < bus->dev_count; i++)
> > > > -		if (!kvm_iodevice_write(bus->devs[i], addr, len, val, 
ext_data))
> > > > +	for (i = 0; i < bus->dev_count; i++) {
> > > > +		r = kvm_iodevice_write(bus->devs[i], addr, len, val, ext_data);
> > > > +		if (r == -ENOTSYNC)
> > > > +			break;
> > > > +		else if (!r)
> > > > 
> > > >  			return 0;
> > > > 
> > > > -	return -EOPNOTSUPP;
> > > > +	}
> > > > +	return r;
> > > > 
> > > >  }
> > > >  
> > > >  /* kvm_io_bus_read - called under kvm->slots_lock */
> > > > 
> > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
> > > > new file mode 100644
> > > > index 0000000..083b15b
> > > > --- /dev/null
> > > > +++ b/virt/kvm/msix_mmio.c
> > > > @@ -0,0 +1,296 @@
> > > > +/*
> > > > + * MSI-X MMIO emulation
> > > > + *
> > > > + * Copyright (c) 2010 Intel Corporation
> > > > + *
> > > > + * This work is licensed under the terms of the GNU GPL, version 2. 
> > > > See + * the COPYING file in the top-level directory.
> > > > + *
> > > > + * Author:
> > > > + *   Sheng Yang <sheng.yang@intel.com>
> > > > + */
> > > > +
> > > > +#include <linux/kvm_host.h>
> > > > +#include <linux/kvm.h>
> > > > +
> > > > +#include "msix_mmio.h"
> > > > +#include "iodev.h"
> > > > +
> > > > +static int update_msix_mask_bit(struct kvm *kvm, struct
> > > > kvm_msix_mmio *mmio, +				int entry, u32 flag)
> > > > +{
> > > > +	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > +		return kvm_assigned_device_update_msix_mask_bit(kvm,
> > > > +				mmio->dev_id, entry, flag);
> > > > +	return -EFAULT;
> > > > +}
> > > > +
> > > > +/* Caller must hold dev->lock */
> > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
> > > > +				gpa_t addr, int len)
> > > > +{
> > > > +	gpa_t start, end;
> > > > +	int i, r = -EINVAL;
> > > > +
> > > > +	for (i = 0; i < dev->mmio_nr; i++) {
> > > > +		start = dev->mmio[i].table_base_addr;
> > > > +		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
> > > > +			dev->mmio[i].max_entries_nr;
> > > > +		if (addr >= start && addr + len <= end) {
> > > > +			r = i;
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +
> > > > +	return r;
> > > > +}
> > > > +
> > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t
> > > > addr, int len, +				void *val)
> > > > +{
> > > > +	/*TODO: Add big endian support */
> > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > +	struct kvm_msix_mmio *mmio;
> > > > +	int idx, ret = 0, entry, offset, r;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > +	if (idx < 0) {
> > > > +		ret = -EOPNOTSUPP;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((addr & 0x3) || (len != 4 && len != 8))
> > > > +		goto out;
> > > 
> > > addr & len as below?
> > > 
> > > > +
> > > > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > > > +	if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8)
> > > > +		goto out;
> > > 
> > > then this test won't be needed.
> > > 
> > > > +
> > > > +	mmio = &mmio_dev->mmio[idx];
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	r = copy_from_user(val, (void __user *)(mmio->table_base_va +
> > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > +	if (r)
> > > > +		goto out;
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t
> > > > addr, +				int len, const void *val,
> > > > +				struct kvm_io_ext_data *ext_data)
> > > > +{
> > > > +	/*TODO: Add big endian support */
> > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > +	struct kvm_msix_mmio *mmio;
> > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > +	gpa_t entry_base;
> > > > +	u32 old_ctrl, new_ctrl;
> > > > +	unsigned long __user *ctrl_pos;
> > > 
> > > long? It's 8 bytes on 64 bit.
> > > You really want
> > > __le32 old_ctrl, new_ctrl;
> > > __le32 __user *ctrl_pos;
> > 
> > __le32 here may give the wrong idea that we support big endian, which is not
> > true. So I want to use u32 here, and add a TODO above.
> 
> Users don't go looking for __le32 in code.  If you want to make sure
> code is built on intel only, just check the architecture.
> 
> But understanding endianness and making the code correct
> upfront is not hard. I really don't understand why
> you keep hardcoding LE assumptions in there.
> __le32 is simply better: it lets us know that the value
> is related to PCI as opposed to a random integer value.

The reason is that I can't test it... But OK, I will update it and make sure it
works well at least on Intel platforms...
 
> > > > +
> > > > +	mutex_lock(&mmio_dev->kvm->lock);
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > +	if (idx < 0) {
> > > > +		ret = -EOPNOTSUPP;
> > > > +		goto out;
> > > > +	}
> > > > +	if (!(len == 4 || len == 8) || addr & (len - 1))
> > > 
> > > Nice hack. Even a bit nicer
> > 
> > Thanks Alex for this line. :)
> > 
> > > 	if ((len != 4 && len != 8) || addr & (len - 1))
> > 
> > I think it's personal style difference. These two look same to me...
> 
> One character less :) And you use != above. Be consistent or better
> add an inline to avoid duplication.

Er... OK, I would update it...

--
regards
Yang, Sheng

> 
> > > > +		goto out;
> > > > +
> > > > +	offset = addr % PCI_MSIX_ENTRY_SIZE;
> > > > +
> > > > +	mmio = &mmio_dev->mmio[idx];
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > +	ctrl_pos = (unsigned long __user *)(entry_base +
> > > > +			PCI_MSIX_ENTRY_VECTOR_CTRL);
> > > 
> > > So this is the issue: if you cast a type to unsigned long *
> > > compiler can assume that the address is aligned.
> > > To prevent problems please add a check that table_base_va
> > > is aligned.
> > 
> > Already checked when the mmio is registered.
> 
> Right, missed it the first time around.
> 
> > > > +
> > > > +	if (get_user(old_ctrl, ctrl_pos))
> > > > +		goto out;
> > > > +
> > > > +	/* Don't allow writing to other fields when entry is unmasked */
> > > > +	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
> > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > +		goto out;
> > > > +
> > > > +	if (copy_to_user((void __user *)(entry_base + offset), val, len))
> > > > +		goto out;
> > > > +
> > > > +	ext_data->type = KVM_IO_EXT_DATA_TYPE_MSIX_ROUTING;
> > > > +	ext_data->msix_routing.dev_id = mmio->dev_id;
> > > > +	ext_data->msix_routing.type = mmio->type;
> > > > +	ext_data->msix_routing.entry_idx = entry;
> > > > +	ext_data->msix_routing.flags = 0;
> > > > +
> > > > +	if (offset + len < PCI_MSIX_ENTRY_VECTOR_CTRL) {
> > > > +		ret = -ENOTSYNC;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	if (get_user(new_ctrl, ctrl_pos))
> > > > +		goto out;
> > > > +
> > > > +	if (old_ctrl == new_ctrl) {
> > > > +		if (offset == PCI_MSIX_ENTRY_DATA && len == 8)
> > > > +			ret = -ENOTSYNC;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^
> > > > +			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
> > > > +				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
> > > > +	if (r)
> > > > +		ret = -ENOTSYNC;
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	mutex_unlock(&mmio_dev->kvm->lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > > +	.read     = msix_table_mmio_read,
> > > > +	.write    = msix_table_mmio_write,
> > > > +};
> > > > +
> > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev,
> > > > &msix_mmio_table_ops); +	mutex_init(&kvm->msix_mmio_dev.lock);
> > > > +	kvm->msix_mmio_dev.kvm = kvm;
> > > > +	mutex_lock(&kvm->slots_lock);
> > > > +	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
> > > > +				      &kvm->msix_mmio_dev.table_dev);
> > > > +	mutex_unlock(&kvm->slots_lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
> > > > +{
> > > > +	int ret;
> > > > +
> > > > +	mutex_lock(&kvm->slots_lock);
> > > > +	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
> > > > +				      &kvm->msix_mmio_dev.table_dev);
> > > > +	mutex_unlock(&kvm->slots_lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > > +	struct kvm_msix_mmio *mmio = NULL;
> > > > +	int r = 0, i;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
> > > > +		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > +		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > +			mmio = &mmio_dev->mmio[i];
> > > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > > > +				r = -EINVAL;
> > > > +				goto out;
> > > > +			}
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +	if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +	/* All reserved currently */
> > > > +	if (mmio_user->flags) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) !=
> > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) !=
> > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +
> > > > +	/* Check alignment and accessibility */
> > > > +	if ((mmio_user->base_va % PCI_MSIX_ENTRY_SIZE) ||
> > > > +	    !access_ok(VERIFY_WRITE, (void __user *)mmio_user->base_va,
> > > 
> > > You also should check that base_va and friends fit in a pointer
> > > for 32 bit architectures. Same for other va values.
> > 
> > OK
> > 
> > --
> > regards
> > Yang, Sheng
> > 
> > > > +			mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) {
> > > > +		r = -EINVAL;
> > > > +		goto out;
> > > > +	}
> > > > +	if (!mmio) {
> > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > +			r = -ENOSPC;
> > > > +			goto out;
> > > > +		}
> > > > +		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > +		mmio_dev->mmio_nr++;
> > > > +	}
> > > > +
> > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > +	mmio->flags = mmio_user->flags;
> > > > +
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > +	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > +		mmio->table_base_va = mmio_user->base_va;
> > > > +	}
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return r;
> > > > +}
> > > > +
> > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
> > > > +	int r = -EINVAL, i, j;
> > > > +
> > > > +	if (!mmio)
> > > > +		return 0;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
> > > > +	for (i = 0; i < mmio_dev->mmio_nr; i++) {
> > > > +		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
> > > > +		    mmio_dev->mmio[i].type == mmio->type) {
> > > > +			r = 0;
> > > > +			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
> > > > +				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
> > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
> > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
> > > > +			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
> > > > +			mmio_dev->mmio_nr--;
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return r;
> > > > +}
> > > > +
> > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > +				      struct kvm_msix_mmio_user *mmio_user)
> > > > +{
> > > > +	struct kvm_msix_mmio mmio;
> > > > +
> > > > +	mmio.dev_id = mmio_user->dev_id;
> > > > +	mmio.type = mmio_user->type;
> > > > +
> > > > +	return kvm_free_msix_mmio(kvm, &mmio);
> > > > +}
> > > > +
> > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
> > > > new file mode 100644
> > > > index 0000000..01b6587
> > > > --- /dev/null
> > > > +++ b/virt/kvm/msix_mmio.h
> > > > @@ -0,0 +1,25 @@
> > > > +#ifndef __KVM_MSIX_MMIO_H__
> > > > +#define __KVM_MSIX_MMIO_H__
> > > > +/*
> > > > + * MSI-X MMIO emulation
> > > > + *
> > > > + * Copyright (c) 2010 Intel Corporation
> > > > + *
> > > > + * This work is licensed under the terms of the GNU GPL, version 2. 
> > > > See + * the COPYING file in the top-level directory.
> > > > + *
> > > > + * Author:
> > > > + *   Sheng Yang <sheng.yang@intel.com>
> > > > + */
> > > > +
> > > > +#include <linux/pci.h>
> > > > +
> > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm);
> > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
> > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > +				    struct kvm_msix_mmio_user *mmio_user);
> > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
> > > > +				      struct kvm_msix_mmio_user *mmio_user);
> > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio
> > > > *mmio_user); +
> > > > +#endif

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h
  2011-02-28  7:20 [PATCH 0/4 v11] MSI-X MMIO support for KVM Sheng Yang
@ 2011-02-28  7:20 ` Sheng Yang
  0 siblings, 0 replies; 21+ messages in thread
From: Sheng Yang @ 2011-02-28  7:20 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Michael S. Tsirkin, Alex Williamson, kvm, Sheng Yang

Then it can be used by other structs in kvm_host.h

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 include/linux/kvm_host.h |   23 +++++++++++++++++++++++
 virt/kvm/iodev.h         |   25 +------------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b5021db..7d313e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+	int (*read)(struct kvm_io_device *this,
+		    gpa_t addr,
+		    int len,
+		    void *val);
+	int (*write)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     const void *val);
+	void (*destructor)(struct kvm_io_device *this);
+};
+
+struct kvm_io_device {
+	const struct kvm_io_device_ops *ops;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 12fd3ca..d1f5651 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,32 +17,9 @@
 #define __KVM_IODEV_H__
 
 #include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
 #include <asm/errno.h>
 
-struct kvm_io_device;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-	int (*read)(struct kvm_io_device *this,
-		    gpa_t addr,
-		    int len,
-		    void *val);
-	int (*write)(struct kvm_io_device *this,
-		     gpa_t addr,
-		     int len,
-		     const void *val);
-	void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-	const struct kvm_io_device_ops *ops;
-};
-
 static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 				     const struct kvm_io_device_ops *ops)
 {
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-02-25  8:29       ` Michael S. Tsirkin
  2011-02-28  5:18         ` Sheng Yang
@ 2011-03-01 20:18         ` Marcelo Tosatti
  2011-03-01 20:59           ` Michael S. Tsirkin
  2011-03-02  1:23           ` Sheng Yang
  1 sibling, 2 replies; 21+ messages in thread
From: Marcelo Tosatti @ 2011-03-01 20:18 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sheng Yang, Avi Kivity, Alex Williamson, kvm

On Fri, Feb 25, 2011 at 10:29:38AM +0200, Michael S. Tsirkin wrote:
> On Fri, Feb 25, 2011 at 02:28:02PM +0800, Sheng Yang wrote:
> > On Thursday 24 February 2011 18:45:08 Michael S. Tsirkin wrote:
> > > On Thu, Feb 24, 2011 at 05:51:04PM +0800, Sheng Yang wrote:
> > > > Then we can support mask bit operation of assigned devices now.
> > > > 
> > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > 
> > > Doesn't look like all comments got addressed.
> > > E.g. gpa_t entry_base is still there and in reality
> > > you said it's a host virtual address so
> > > should be void __user *;
> > 
> > Would update it.
> > 
> > > And ENOTSYNC meaning 'MSIX' is pretty hacky.
> > 
> > I'd like to discuss it later. We may need some work on all MMIO handling side to 
> > make it more straightforward. But I don't want to bundle it with this one... 
> 
> It's not PCI related so I'll defer to Avi/Marcelo on this.
> Are you guys happy with the ENOTSYNC meaning 'MSIX'

What would be a better alternative to ENOTSYNC? Can't see any.

> and userspace_exit_needed hacks in this code?

I thought this was handled by mmio_needed in a previous patch? 

Since x86_emulate_instruction does

        } else if (vcpu->mmio_needed) {
                if (vcpu->mmio_is_write)
                        vcpu->mmio_needed = 0;
                r = EMULATE_DO_MMIO;

It should be fine. Sheng why did you introduce userspace_exit_needed?


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-03-01 20:18         ` Marcelo Tosatti
@ 2011-03-01 20:59           ` Michael S. Tsirkin
  2011-03-02  1:23           ` Sheng Yang
  1 sibling, 0 replies; 21+ messages in thread
From: Michael S. Tsirkin @ 2011-03-01 20:59 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Sheng Yang, Avi Kivity, Alex Williamson, kvm

On Tue, Mar 01, 2011 at 05:18:58PM -0300, Marcelo Tosatti wrote:
> On Fri, Feb 25, 2011 at 10:29:38AM +0200, Michael S. Tsirkin wrote:
> > On Fri, Feb 25, 2011 at 02:28:02PM +0800, Sheng Yang wrote:
> > > On Thursday 24 February 2011 18:45:08 Michael S. Tsirkin wrote:
> > > > On Thu, Feb 24, 2011 at 05:51:04PM +0800, Sheng Yang wrote:
> > > > > Then we can support mask bit operation of assigned devices now.
> > > > > 
> > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > > 
> > > > Doesn't look like all comments got addressed.
> > > > E.g. gpa_t entry_base is still there and in reality
> > > > you said it's a host virtual address so
> > > > should be void __user *;
> > > 
> > > Would update it.
> > > 
> > > > And ENOTSYNC meaning 'MSIX' is pretty hacky.
> > > 
> > > I'd like to discuss it later. We may need some work on all MMIO handling side to 
> > > make it more straightforward. But I don't want to bundle it with this one... 
> > 
> > It's not PCI related so I'll defer to Avi/Marcelo on this.
> > Are you guys happy with the ENOTSYNC meaning 'MSIX'
> 
> What would be a better alternative to ENOTSYNC? Can't see any.

Return a negative value on error, or a positive exit code
if we want to exit to userspace.
As a bonus, MSIX knowledge is localized in one file.

> > and userspace_exit_needed hacks in this code?
> 
> I thought this was handled by mmio_needed in a previous patch? 
> 
> Since x86_emulate_instruction does
> 
>         } else if (vcpu->mmio_needed) {
>                 if (vcpu->mmio_is_write)
>                         vcpu->mmio_needed = 0;
>                 r = EMULATE_DO_MMIO;
> 
> It should be fine. Sheng why did you introduce userspace_exit_needed?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH 3/4] KVM: Emulate MSI-X table in kernel
  2011-03-01 20:18         ` Marcelo Tosatti
  2011-03-01 20:59           ` Michael S. Tsirkin
@ 2011-03-02  1:23           ` Sheng Yang
  1 sibling, 0 replies; 21+ messages in thread
From: Sheng Yang @ 2011-03-02  1:23 UTC (permalink / raw)
  To: Marcelo Tosatti; +Cc: Michael S. Tsirkin, Avi Kivity, Alex Williamson, kvm

On Wednesday 02 March 2011 04:18:58 Marcelo Tosatti wrote:
> On Fri, Feb 25, 2011 at 10:29:38AM +0200, Michael S. Tsirkin wrote:
> > On Fri, Feb 25, 2011 at 02:28:02PM +0800, Sheng Yang wrote:
> > > On Thursday 24 February 2011 18:45:08 Michael S. Tsirkin wrote:
> > > > On Thu, Feb 24, 2011 at 05:51:04PM +0800, Sheng Yang wrote:
> > > > > Then we can support mask bit operation of assigned devices now.
> > > > > 
> > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com>
> > > > 
> > > > Doesn't look like all comments got addressed.
> > > > E.g. gpa_t entry_base is still there and in reality
> > > > you said it's a host virtual address so
> > > > should be void __user *;
> > > 
> > > Would update it.
> > > 
> > > > And ENOTSYNC meaning 'MSIX' is pretty hacky.
> > > 
> > > I'd like to discuss it later. We may need some work on all MMIO
> > > handling side to make it more straightforward. But I don't want to
> > > bundle it with this one...
> > 
> > It's not PCI related so I'll defer to Avi/Marcelo on this.
> > Are you guys happy with the ENOTSYNC meaning 'MSIX'
> 
> What would be a better alternative to ENOTSYNC? Can't see any.
> 
> > and userspace_exit_needed hacks in this code?
> 
> I thought this was handled by mmio_needed in a previous patch?
> 
> Since x86_emulate_instruction does
> 
>         } else if (vcpu->mmio_needed) {
>                 if (vcpu->mmio_is_write)
>                         vcpu->mmio_needed = 0;
>                 r = EMULATE_DO_MMIO;
> 
> It should be fine. Sheng why did you introduce userspace_exit_needed?

Because, strictly speaking, it's not an MMIO exit. I didn't know whether Avi would
object to the confusing concept here, so I introduced another type of exit.

But if it's OK, I would still use mmio_needed in the next version, which is also
simpler.

--
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h
  2011-03-02  7:26 [PATCH 0/4 v12] MSI-X MMIO support for KVM Sheng Yang
@ 2011-03-02  7:26 ` Sheng Yang
  0 siblings, 0 replies; 21+ messages in thread
From: Sheng Yang @ 2011-03-02  7:26 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti
  Cc: Michael S. Tsirkin, Alex Williamson, kvm, Sheng Yang

Then it can be used by other structs in kvm_host.h.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 include/linux/kvm_host.h |   23 +++++++++++++++++++++++
 virt/kvm/iodev.h         |   25 +------------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b5021db..7d313e0 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+	int (*read)(struct kvm_io_device *this,
+		    gpa_t addr,
+		    int len,
+		    void *val);
+	int (*write)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     const void *val);
+	void (*destructor)(struct kvm_io_device *this);
+};
+
+struct kvm_io_device {
+	const struct kvm_io_device_ops *ops;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 12fd3ca..d1f5651 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,32 +17,9 @@
 #define __KVM_IODEV_H__
 
 #include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
 #include <asm/errno.h>
 
-struct kvm_io_device;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-	int (*read)(struct kvm_io_device *this,
-		    gpa_t addr,
-		    int len,
-		    void *val);
-	int (*write)(struct kvm_io_device *this,
-		     gpa_t addr,
-		     int len,
-		     const void *val);
-	void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-	const struct kvm_io_device_ops *ops;
-};
-
 static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 				     const struct kvm_io_device_ops *ops)
 {
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2011-03-02  7:24 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-02-24  9:51 [PATCH 0/4 v10] MSI-X MMIO support for KVM Sheng Yang
2011-02-24  9:51 ` [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
2011-02-24  9:51 ` [PATCH 2/4] KVM: Add kvm_io_ext_data to IO handler Sheng Yang
2011-02-24 10:22   ` Michael S. Tsirkin
2011-02-25  3:23     ` Sheng Yang
2011-02-25  8:12       ` Michael S. Tsirkin
2011-02-28  5:13         ` Sheng Yang
2011-02-24  9:51 ` [PATCH 3/4] KVM: Emulate MSI-X table in kernel Sheng Yang
2011-02-24 10:45   ` Michael S. Tsirkin
2011-02-25  6:28     ` Sheng Yang
2011-02-25  8:29       ` Michael S. Tsirkin
2011-02-28  5:18         ` Sheng Yang
2011-03-01 20:18         ` Marcelo Tosatti
2011-03-01 20:59           ` Michael S. Tsirkin
2011-03-02  1:23           ` Sheng Yang
2011-02-25  6:50     ` Sheng Yang
2011-02-25  6:50     ` [PATCH 3/4 v10 UPDATED] " Sheng Yang
2011-02-24  9:51 ` [PATCH 4/4] KVM: Add documents for MSI-X MMIO API Sheng Yang
  -- strict thread matches above, loose matches on Subject: below --
2011-03-02  7:26 [PATCH 0/4 v12] MSI-X MMIO support for KVM Sheng Yang
2011-03-02  7:26 ` [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
2011-02-28  7:20 [PATCH 0/4 v11] MSI-X MMIO support for KVM Sheng Yang
2011-02-28  7:20 ` [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
2011-02-18  8:53 [PATCH 0/4 v9] MSI-X MMIO support for KVM Sheng Yang
2011-02-18  8:53 ` [PATCH 1/4] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).