* [PATCH 0/3 v8] MSI-X MMIO support for KVM
@ 2011-01-30 5:11 Sheng Yang
2011-01-30 5:11 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Sheng Yang @ 2011-01-30 5:11 UTC (permalink / raw)
To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang
Changes from v7:
Updated according to Marcelo's and Avi's comments.
BTW: I will be on vacation for Chinese New Year soon, and will be back in mid-February.
Sheng Yang (3):
KVM: Move struct kvm_io_device to kvm_host.h
KVM: Emulate MSI-X table in kernel
KVM: Add documents for MSI-X MMIO API
Documentation/kvm/api.txt | 47 ++++++++
arch/x86/kvm/Makefile | 2 +-
arch/x86/kvm/x86.c | 8 +-
include/linux/kvm.h | 21 ++++
include/linux/kvm_host.h | 48 ++++++++
virt/kvm/assigned-dev.c | 44 +++++++
virt/kvm/iodev.h | 25 +----
virt/kvm/kvm_main.c | 38 ++++++-
virt/kvm/msix_mmio.c | 286 +++++++++++++++++++++++++++++++++++++++++++++
virt/kvm/msix_mmio.h | 25 ++++
10 files changed, 513 insertions(+), 31 deletions(-)
create mode 100644 virt/kvm/msix_mmio.c
create mode 100644 virt/kvm/msix_mmio.h
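For reference, a minimal userspace sketch of how the new registration ioctl from patch 2 could be used for an assigned device's MSI-X table. The struct fields, type bits and ioctl numbers are as defined in this series; the fd handling, function and variable names below are illustrative assumptions, not code from the series:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Sketch: ask KVM to emulate a 16-entry MSI-X table in kernel for
 * assigned device dev_id.  table_gpa is the guest-physical table
 * base, table_hva the userspace mapping backing the table. */
static void register_msix_table(int vm_fd, uint32_t dev_id,
                                uint64_t table_gpa, void *table_hva)
{
	struct kvm_msix_mmio_user mmio = {
		.dev_id = dev_id,
		.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
			KVM_MSIX_MMIO_TYPE_BASE_TABLE,
		.max_entries_nr = 16,
		.base_addr = table_gpa,
		.base_va = (uint64_t)(uintptr_t)table_hva,
		.flags = 0,	/* all flags are reserved for now */
	};

	if (ioctl(vm_fd, KVM_REGISTER_MSIX_MMIO, &mmio) < 0) {
		perror("KVM_REGISTER_MSIX_MMIO");
		exit(1);
	}
}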
^ permalink raw reply [flat|nested] 20+ messages in thread* [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h 2011-01-30 5:11 [PATCH 0/3 v8] MSI-X MMIO support for KVM Sheng Yang @ 2011-01-30 5:11 ` Sheng Yang 2011-01-30 5:11 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang 2011-01-30 5:11 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang 2 siblings, 0 replies; 20+ messages in thread From: Sheng Yang @ 2011-01-30 5:11 UTC (permalink / raw) To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang Then it can be used by other structs in kvm_host.h. Signed-off-by: Sheng Yang <sheng@linux.intel.com> --- include/linux/kvm_host.h | 23 +++++++++++++++++++++++ virt/kvm/iodev.h | 25 +------------------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b5021db..7d313e0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif +struct kvm_io_device; + +/** + * kvm_io_device_ops are called under kvm slots_lock. + * read and write handlers return 0 if the transaction has been handled, + * or non-zero to have it passed to the next device. + **/ +struct kvm_io_device_ops { + int (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + int (*write)(struct kvm_io_device *this, + gpa_t addr, + int len, + const void *val); + void (*destructor)(struct kvm_io_device *this); +}; + +struct kvm_io_device { + const struct kvm_io_device_ops *ops; +}; + struct kvm_vcpu { struct kvm *kvm; #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h index 12fd3ca..d1f5651 100644 --- a/virt/kvm/iodev.h +++ b/virt/kvm/iodev.h @@ -17,32 +17,9 @@ #define __KVM_IODEV_H__ #include <linux/kvm_types.h> +#include <linux/kvm_host.h> #include <asm/errno.h> -struct kvm_io_device; - -/** - * kvm_io_device_ops are called under kvm slots_lock. - * read and write handlers return 0 if the transaction has been handled, - * or non-zero to have it passed to the next device. - **/ -struct kvm_io_device_ops { - int (*read)(struct kvm_io_device *this, - gpa_t addr, - int len, - void *val); - int (*write)(struct kvm_io_device *this, - gpa_t addr, - int len, - const void *val); - void (*destructor)(struct kvm_io_device *this); -}; - - -struct kvm_io_device { - const struct kvm_io_device_ops *ops; -}; - static inline void kvm_iodevice_init(struct kvm_io_device *dev, const struct kvm_io_device_ops *ops) { -- 1.7.0.1 ^ permalink raw reply related [flat|nested] 20+ messages in thread
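As the moved comment says, handlers return 0 when they claim a transaction and non-zero to have it passed to the next device on the bus. For illustration, a minimal in-kernel device using these definitions might look like the sketch below (MY_DEV_ADDR and all names are hypothetical):

/* Illustrative sketch: claim one 4-byte register, pass on the rest. */
#define MY_DEV_ADDR 0xd0000000ULL

static int my_dev_read(struct kvm_io_device *this, gpa_t addr,
		       int len, void *val)
{
	if (addr != MY_DEV_ADDR || len != 4)
		return -EOPNOTSUPP;	/* not ours, let the next device try */
	*(u32 *)val = 0;		/* register contents */
	return 0;			/* handled */
}

static const struct kvm_io_device_ops my_dev_ops = {
	.read	= my_dev_read,
};

/* set up with kvm_iodevice_init(&dev, &my_dev_ops) and registered
 * via kvm_io_bus_register_dev() under slots_lock */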
* [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-01-30 5:11 [PATCH 0/3 v8] MSI-X MMIO support for KVM Sheng Yang 2011-01-30 5:11 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang @ 2011-01-30 5:11 ` Sheng Yang 2011-02-03 1:05 ` Marcelo Tosatti 2011-02-23 0:19 ` Alex Williamson 2011-01-30 5:11 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang 2 siblings, 2 replies; 20+ messages in thread From: Sheng Yang @ 2011-01-30 5:11 UTC (permalink / raw) To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang Then we can support mask bit operation of assigned devices now. Signed-off-by: Sheng Yang <sheng@linux.intel.com> --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/x86.c | 8 +- include/linux/kvm.h | 21 ++++ include/linux/kvm_host.h | 25 ++++ virt/kvm/assigned-dev.c | 44 +++++++ virt/kvm/kvm_main.c | 38 ++++++- virt/kvm/msix_mmio.c | 286 ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h | 25 ++++ 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->exit_reason = (r == -ENOTSYNC) ? 
+ KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; vcpu->run->mmio.len = vcpu->mmio_size = bytes; vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ea2dc1a..ad9df4b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -161,6 +161,7 @@ struct kvm_pit_config { #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 #define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_MSIX_MMIO 60 #ifdef KVM_CAP_IRQ_ROUTING @@ -672,6 +674,9 @@ struct kvm_clock_data { #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +/* Available with KVM_CAP_MSIX_MMIO */ +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user) /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) + +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) + +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 +struct kvm_msix_mmio_user { + __u32 dev_id; + __u16 type; + __u16 max_entries_nr; + __u64 base_addr; + __u64 base_va; + __u64 flags; + __u64 reserved[4]; +}; + #endif /* __LINUX_KVM_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d313e0..c10670c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -233,6 +233,27 @@ struct kvm_memslots { KVM_PRIVATE_MEM_SLOTS]; }; +#define KVM_MSIX_MMIO_MAX 32 + +struct kvm_msix_mmio { + u32 dev_id; + u16 type; + u16 max_entries_nr; + u64 flags; + gpa_t table_base_addr; + hva_t table_base_va; + gpa_t pba_base_addr; + hva_t pba_base_va; +}; + +struct kvm_msix_mmio_dev { + struct kvm *kvm; + struct kvm_io_device table_dev; + int mmio_nr; + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; + struct mutex lock; +}; + struct kvm { spinlock_t mmu_lock; raw_spinlock_t requests_lock; @@ -281,6 +302,7 @@ struct kvm { long mmu_notifier_count; #endif long tlbs_dirty; + struct kvm_msix_mmio_dev msix_mmio_dev; }; /* The guest did something we don't support. 
*/ @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, + int assigned_dev_id, int entry, bool mask); + /* For vcpu->arch.iommu_flags */ #define KVM_IOMMU_CACHE_COHERENCY 0x1 diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index ae72ae6..d1598a6 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -18,6 +18,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include "irq.h" +#include "msix_mmio.h" static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm, kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); } +static void assigned_device_free_msix_mmio(struct kvm *kvm, + struct kvm_assigned_dev_kernel *adev) +{ + struct kvm_msix_mmio mmio; + + mmio.dev_id = adev->assigned_dev_id; + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | + KVM_MSIX_MMIO_TYPE_BASE_TABLE; + kvm_free_msix_mmio(kvm, &mmio); +} + static void kvm_free_assigned_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { kvm_free_assigned_irq(kvm, assigned_dev); + assigned_device_free_msix_mmio(kvm, assigned_dev); + __pci_reset_function(assigned_dev->dev); pci_restore_state(assigned_dev->dev); @@ -785,3 +799,33 @@ out: return r; } +/* The caller should hold kvm->lock */ +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, + int assigned_dev_id, int entry, bool mask) +{ + int r = -EFAULT; + struct kvm_assigned_dev_kernel *adev; + int i; + + if (!irqchip_in_kernel(kvm)) + return r; + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev_id); + if (!adev) + goto out; + + /* For non-MSIX enabled devices, entries_nr == 0 */ + for (i = 0; i < adev->entries_nr; i++) + if (adev->host_msix_entries[i].entry == entry) { + if (mask) + disable_irq_nosync( + adev->host_msix_entries[i].vector); + else + enable_irq(adev->host_msix_entries[i].vector); + r = 0; + break; + } +out: + return r; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b1b6cbb..b7807c8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -56,6 +56,7 @@ #include "coalesced_mmio.h" #include "async_pf.h" +#include "msix_mmio.h" #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) struct mm_struct *mm = kvm->mm; kvm_arch_sync_events(kvm); + kvm_unregister_msix_mmio_dev(kvm); spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, mutex_unlock(&kvm->lock); break; #endif + case KVM_REGISTER_MSIX_MMIO: { + struct kvm_msix_mmio_user mmio_user; + + r = -EFAULT; + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) + goto out; + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); + break; + } + case KVM_UNREGISTER_MSIX_MMIO: { + struct kvm_msix_mmio_user mmio_user; + + r = -EFAULT; + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) + goto out; + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); + break; + } default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); if (r == -ENOTTY) @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) return r; } #endif + r = kvm_register_msix_mmio_dev(kvm); + if (r < 0) { + kvm_put_kvm(kvm); + return r; + } + r = anon_inode_getfd("kvm-vm", 
&kvm_vm_fops, kvm, O_RDWR); if (r < 0) kvm_put_kvm(kvm); @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus) int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { - int i; + int i, r = -EOPNOTSUPP; struct kvm_io_bus *bus; bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); - for (i = 0; i < bus->dev_count; i++) - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) + for (i = 0; i < bus->dev_count; i++) { + r = kvm_iodevice_write(bus->devs[i], addr, len, val); + if (r == -ENOTSYNC) + break; + else if (!r) return 0; - return -EOPNOTSUPP; + } + return r; } /* kvm_io_bus_read - called under kvm->slots_lock */ diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c new file mode 100644 index 0000000..dde9b62 --- /dev/null +++ b/virt/kvm/msix_mmio.c @@ -0,0 +1,286 @@ +/* + * MSI-X MMIO emulation + * + * Copyright (c) 2010 Intel Corporation + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Author: + * Sheng Yang <sheng.yang@intel.com> + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> + +#include "msix_mmio.h" +#include "iodev.h" + +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio, + int entry, u32 flag) +{ + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) + return kvm_assigned_device_update_msix_mask_bit(kvm, + mmio->dev_id, entry, flag); + return -EFAULT; +} + +/* Caller must hold dev->lock */ +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, + gpa_t addr, int len) +{ + gpa_t start, end; + int i, r = -EINVAL; + + for (i = 0; i < dev->mmio_nr; i++) { + start = dev->mmio[i].table_base_addr; + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * + dev->mmio[i].max_entries_nr; + if (addr >= start && addr + len <= end) { + r = i; + break; + } + } + + return r; +} + +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, ret = 0, entry, offset, r; + + mutex_lock(&mmio_dev->lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx < 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr & 0x3) || (len != 4 && len != 8)) + goto out; + + offset = addr & 0xf; + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) + goto out; + + mmio = &mmio_dev->mmio[idx]; + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; + r = copy_from_user(val, (void __user *)(mmio->table_base_va + + entry * PCI_MSIX_ENTRY_SIZE + offset), len); + if (r) + goto out; +out: + mutex_unlock(&mmio_dev->lock); + return ret; +} + +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, + int len, const void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, entry, offset, ret = 0, r = 0; + gpa_t entry_base; + u32 old_ctrl, new_ctrl; + u32 *ctrl_pos; + + mutex_lock(&mmio_dev->kvm->lock); + mutex_lock(&mmio_dev->lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx < 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr & 0x3) || (len != 4 && len != 8)) + goto out; + + offset = addr & 0xF; + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) + goto out; + + mmio = &mmio_dev->mmio[idx]; + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; + entry_base = mmio->table_base_va + entry * 
PCI_MSIX_ENTRY_SIZE; + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); + + if (get_user(old_ctrl, ctrl_pos)) + goto out; + + /* No allow writing to other fields when entry is unmasked */ + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) + goto out; + + if (copy_to_user((void __user *)(entry_base + offset), val, len)) + goto out; + + if (get_user(new_ctrl, ctrl_pos)) + goto out; + + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) + ret = -ENOTSYNC; + if (old_ctrl == new_ctrl) + goto out; + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); + if (r || ret) + ret = -ENOTSYNC; +out: + mutex_unlock(&mmio_dev->lock); + mutex_unlock(&mmio_dev->kvm->lock); + return ret; +} + +static const struct kvm_io_device_ops msix_mmio_table_ops = { + .read = msix_table_mmio_read, + .write = msix_table_mmio_write, +}; + +int kvm_register_msix_mmio_dev(struct kvm *kvm) +{ + int ret; + + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops); + mutex_init(&kvm->msix_mmio_dev.lock); + kvm->msix_mmio_dev.kvm = kvm; + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, + &kvm->msix_mmio_dev.table_dev); + mutex_unlock(&kvm->slots_lock); + return ret; +} + +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) +{ + int ret; + + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, + &kvm->msix_mmio_dev.table_dev); + mutex_unlock(&kvm->slots_lock); + return ret; +} + +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user) +{ + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; + struct kvm_msix_mmio *mmio = NULL; + int r = 0, i; + + mutex_lock(&mmio_dev->lock); + for (i = 0; i < mmio_dev->mmio_nr; i++) { + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { + mmio = &mmio_dev->mmio[i]; + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { + r = -EINVAL; + goto out; + } + break; + } + } + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { + r = -EINVAL; + goto out; + } + /* All reserved currently */ + if (mmio_user->flags) { + r = -EINVAL; + goto out; + } + + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { + r = -EINVAL; + goto out; + } + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { + r = -EINVAL; + goto out; + } + + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { + r = -EINVAL; + goto out; + } + if (!mmio) { + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { + r = -ENOSPC; + goto out; + } + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; + } + + mmio_dev->mmio_nr++; + mmio->max_entries_nr = mmio_user->max_entries_nr; + mmio->dev_id = mmio_user->dev_id; + mmio->flags = mmio_user->flags; + + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; + mmio->table_base_addr = 
mmio_user->base_addr; + mmio->table_base_va = mmio_user->base_va; + } +out: + mutex_unlock(&mmio_dev->lock); + return r; +} + +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) +{ + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; + int r = 0, i, j; + bool found = 0; + + if (!mmio) + return 0; + + mutex_lock(&mmio_dev->lock); + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); + for (i = 0; i < mmio_dev->mmio_nr; i++) { + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && + mmio_dev->mmio[i].type == mmio->type) { + found = true; + for (j = i; j < mmio_dev->mmio_nr - 1; j++) + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; + mmio_dev->mmio_nr--; + break; + } + } + if (!found) + r = -EINVAL; + mutex_unlock(&mmio_dev->lock); + return r; +} + +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user) +{ + struct kvm_msix_mmio mmio; + + mmio.dev_id = mmio_user->dev_id; + mmio.type = mmio_user->type; + + return kvm_free_msix_mmio(kvm, &mmio); +} + diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h new file mode 100644 index 0000000..01b6587 --- /dev/null +++ b/virt/kvm/msix_mmio.h @@ -0,0 +1,25 @@ +#ifndef __KVM_MSIX_MMIO_H__ +#define __KVM_MSIX_MMIO_H__ +/* + * MSI-X MMIO emulation + * + * Copyright (c) 2010 Intel Corporation + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Author: + * Sheng Yang <sheng.yang@intel.com> + */ + +#include <linux/pci.h> + +int kvm_register_msix_mmio_dev(struct kvm *kvm); +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user); +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user); +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user); + +#endif -- 1.7.0.1 ^ permalink raw reply related [flat|nested] 20+ messages in thread
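For context on the new exit code: a guest write that changes an entry's address or data fields makes kvm_io_bus_write() return -ENOTSYNC, and the vcpu exits to userspace with KVM_EXIT_MSIX_ROUTING_UPDATE so the interrupt routing can be resynced. A hedged sketch of the matching userspace branch follows; kvm_run is the usual mmap'ed struct kvm_run, and update_msix_routing()/handle_mmio() are hypothetical helpers, not part of this series:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch of a vcpu run loop handling the new exit reason. */
static void run_vcpu(int vcpu_fd, struct kvm_run *kvm_run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			break;
		switch (kvm_run->exit_reason) {
		case KVM_EXIT_MSIX_ROUTING_UPDATE:
			/* the write already reached the table backing
			 * store in the kernel; only the MSI-X routing
			 * for the touched entry needs a refresh
			 * (hypothetical helper) */
			update_msix_routing(kvm_run->mmio.phys_addr,
					    kvm_run->mmio.len);
			break;
		case KVM_EXIT_MMIO:
			handle_mmio(kvm_run);	/* pre-existing path */
			break;
		default:
			break;
		}
	}
}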
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-01-30 5:11 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang @ 2011-02-03 1:05 ` Marcelo Tosatti 2011-02-18 8:15 ` Sheng Yang 2011-02-23 0:19 ` Alex Williamson 1 sibling, 1 reply; 20+ messages in thread From: Marcelo Tosatti @ 2011-02-03 1:05 UTC (permalink / raw) To: Sheng Yang; +Cc: Avi Kivity, Michael S. Tsirkin, kvm, Alex Williamson On Sun, Jan 30, 2011 at 01:11:15PM +0800, Sheng Yang wrote: > Then we can support mask bit operation of assigned devices now. > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > + struct kvm_msix_mmio_user *mmio_user) > +{ > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > + struct kvm_msix_mmio *mmio = NULL; > + int r = 0, i; > + > + mutex_lock(&mmio_dev->lock); > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > + mmio = &mmio_dev->mmio[i]; > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > + r = -EINVAL; > + goto out; > + } > + break; > + } Why allow this ioctl to succeed if there's an entry already present? This case is broken as mmio_dev->mmio_nr is increased below. PCI bits must be reviewed... ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-03 1:05 ` Marcelo Tosatti @ 2011-02-18 8:15 ` Sheng Yang 2011-02-22 17:58 ` Marcelo Tosatti 0 siblings, 1 reply; 20+ messages in thread From: Sheng Yang @ 2011-02-18 8:15 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: Avi Kivity, Michael S. Tsirkin, kvm, Alex Williamson On Thursday 03 February 2011 09:05:55 Marcelo Tosatti wrote: > On Sun, Jan 30, 2011 at 01:11:15PM +0800, Sheng Yang wrote: > > Then we can support mask bit operation of assigned devices now. > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > + struct kvm_msix_mmio_user *mmio_user) > > +{ > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > + struct kvm_msix_mmio *mmio = NULL; > > + int r = 0, i; > > + > > + mutex_lock(&mmio_dev->lock); > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > + mmio = &mmio_dev->mmio[i]; > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > + r = -EINVAL; > > + goto out; > > + } > > + break; > > + } > > Why allow this ioctl to succeed if there's an entry already present? > This case is broken as mmio_dev->mmio_nr is increased below. Oh, it's a bug to let mmio_nr increase even when the MMIO entry is found. I've fixed it. The reason we allow multiple calls is that userspace can register different types of addresses here (table address and PBA address). > PCI bits must be reviewed... Pardon? PCI-related things are already in 2.6.38-rc. -- regards Yang, Sheng ^ permalink raw reply [flat|nested] 20+ messages in thread
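For clarity, the fix Sheng describes amounts to consuming a slot only when a genuinely new entry is allocated, roughly like this against the v8 code above (a sketch, not the actual v9 diff):

	if (!mmio) {
		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
			r = -ENOSPC;
			goto out;
		}
		/* only a new registration bumps the count;
		 * re-registering the same dev_id with another base
		 * type reuses its existing entry */
		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr++];
	}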
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-18 8:15 ` Sheng Yang @ 2011-02-22 17:58 ` Marcelo Tosatti 0 siblings, 0 replies; 20+ messages in thread From: Marcelo Tosatti @ 2011-02-22 17:58 UTC (permalink / raw) To: Sheng Yang; +Cc: Avi Kivity, Michael S. Tsirkin, kvm, Alex Williamson On Fri, Feb 18, 2011 at 04:15:38PM +0800, Sheng Yang wrote: > > Why allow this ioctl to succeed if there's an entry already present? > > This case is broken as mmio_dev->mmio_nr is increased below. > > Oh, it's a bug to let mmio_nr increase even when the MMIO entry is found. I've fixed it. > > The reason we allow multiple calls is that userspace can register different types of > addresses here (table address and PBA address). Ah OK. > > PCI bits must be reviewed... > > Pardon? PCI-related things are already in 2.6.38-rc. I meant someone more familiar with PCI should review this patch, Michael/Alex, CC'ed. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-01-30 5:11 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang 2011-02-03 1:05 ` Marcelo Tosatti @ 2011-02-23 0:19 ` Alex Williamson 2011-02-23 6:59 ` Sheng Yang 1 sibling, 1 reply; 20+ messages in thread From: Alex Williamson @ 2011-02-23 0:19 UTC (permalink / raw) To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Michael S. Tsirkin, kvm On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > --- > arch/x86/kvm/Makefile | 2 +- > arch/x86/kvm/x86.c | 8 +- > include/linux/kvm.h | 21 ++++ > include/linux/kvm_host.h | 25 ++++ > virt/kvm/assigned-dev.c | 44 +++++++ > virt/kvm/kvm_main.c | 38 ++++++- > virt/kvm/msix_mmio.c | 286 ++++++++++++++++++++++++++++++++++++++++++++++ > virt/kvm/msix_mmio.h | 25 ++++ > 8 files changed, 442 insertions(+), 7 deletions(-) > create mode 100644 virt/kvm/msix_mmio.c > create mode 100644 virt/kvm/msix_mmio.h > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > index f15501f..3a0d851 100644 > --- a/arch/x86/kvm/Makefile > +++ b/arch/x86/kvm/Makefile > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > coalesced_mmio.o irq_comm.o eventfd.o \ > - assigned-dev.o) > + assigned-dev.o msix_mmio.o) > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index fa708c9..89bf12c 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > case KVM_CAP_X86_ROBUST_SINGLESTEP: > case KVM_CAP_XSAVE: > case KVM_CAP_ASYNC_PF: > + case KVM_CAP_MSIX_MMIO: > r = 1; > break; > case KVM_CAP_COALESCED_MMIO: > @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, > struct kvm_vcpu *vcpu) > { > gpa_t gpa; > + int r; > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, > > mmio: > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > /* > * Is this MMIO handled locally? > */ > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > + if (!r) > return X86EMUL_CONTINUE; > > vcpu->mmio_needed = 1; > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? 
> + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > index ea2dc1a..ad9df4b 100644 > --- a/include/linux/kvm.h > +++ b/include/linux/kvm.h > @@ -161,6 +161,7 @@ struct kvm_pit_config { > #define KVM_EXIT_NMI 16 > #define KVM_EXIT_INTERNAL_ERROR 17 > #define KVM_EXIT_OSI 18 > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > /* For KVM_EXIT_INTERNAL_ERROR */ > #define KVM_INTERNAL_ERROR_EMULATION 1 > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > #define KVM_CAP_PPC_GET_PVINFO 57 > #define KVM_CAP_PPC_IRQ_LEVEL 58 > #define KVM_CAP_ASYNC_PF 59 > +#define KVM_CAP_MSIX_MMIO 60 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) > #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) > #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) > +/* Available with KVM_CAP_MSIX_MMIO */ > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct kvm_msix_mmio_user) > +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user) > /* Available with KVM_CAP_PIT_STATE2 */ > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) > #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > __u16 padding[3]; > }; > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > + > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > + > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > +struct kvm_msix_mmio_user { > + __u32 dev_id; > + __u16 type; > + __u16 max_entries_nr; > + __u64 base_addr; > + __u64 base_va; > + __u64 flags; > + __u64 reserved[4]; > +}; > + > #endif /* __LINUX_KVM_H */ > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 7d313e0..c10670c 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -233,6 +233,27 @@ struct kvm_memslots { > KVM_PRIVATE_MEM_SLOTS]; > }; > > +#define KVM_MSIX_MMIO_MAX 32 I assume this is based on 32 devices/bus, but we could exceed this is we add more buses or make use of more functions. How hard is it to make this dynamic? > + > +struct kvm_msix_mmio { > + u32 dev_id; > + u16 type; > + u16 max_entries_nr; > + u64 flags; > + gpa_t table_base_addr; > + hva_t table_base_va; > + gpa_t pba_base_addr; > + hva_t pba_base_va; > +}; > + > +struct kvm_msix_mmio_dev { > + struct kvm *kvm; > + struct kvm_io_device table_dev; > + int mmio_nr; > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > + struct mutex lock; > +}; > + > struct kvm { > spinlock_t mmu_lock; > raw_spinlock_t requests_lock; > @@ -281,6 +302,7 @@ struct kvm { > long mmu_notifier_count; > #endif > long tlbs_dirty; > + struct kvm_msix_mmio_dev msix_mmio_dev; > }; > > /* The guest did something we don't support. 
*/ > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, > int kvm_request_irq_source_id(struct kvm *kvm); > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > + int assigned_dev_id, int entry, bool mask); > + > /* For vcpu->arch.iommu_flags */ > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > index ae72ae6..d1598a6 100644 > --- a/virt/kvm/assigned-dev.c > +++ b/virt/kvm/assigned-dev.c > @@ -18,6 +18,7 @@ > #include <linux/interrupt.h> > #include <linux/slab.h> > #include "irq.h" > +#include "msix_mmio.h" > > static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, > int assigned_dev_id) > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm, > kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); > } > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > + struct kvm_assigned_dev_kernel *adev) > +{ > + struct kvm_msix_mmio mmio; > + > + mmio.dev_id = adev->assigned_dev_id; > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > + kvm_free_msix_mmio(kvm, &mmio); > +} > + > static void kvm_free_assigned_device(struct kvm *kvm, > struct kvm_assigned_dev_kernel > *assigned_dev) > { > kvm_free_assigned_irq(kvm, assigned_dev); > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > + > __pci_reset_function(assigned_dev->dev); > pci_restore_state(assigned_dev->dev); > > @@ -785,3 +799,33 @@ out: > return r; > } > > +/* The caller should hold kvm->lock */ > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > + int assigned_dev_id, int entry, bool mask) > +{ > + int r = -EFAULT; > + struct kvm_assigned_dev_kernel *adev; > + int i; > + > + if (!irqchip_in_kernel(kvm)) > + return r; > + > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > + assigned_dev_id); > + if (!adev) > + goto out; > + > + /* For non-MSIX enabled devices, entries_nr == 0 */ > + for (i = 0; i < adev->entries_nr; i++) > + if (adev->host_msix_entries[i].entry == entry) { > + if (mask) > + disable_irq_nosync( > + adev->host_msix_entries[i].vector); > + else > + enable_irq(adev->host_msix_entries[i].vector); > + r = 0; > + break; > + } > +out: > + return r; > +} > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index b1b6cbb..b7807c8 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -56,6 +56,7 @@ > > #include "coalesced_mmio.h" > #include "async_pf.h" > +#include "msix_mmio.h" > > #define CREATE_TRACE_POINTS > #include <trace/events/kvm.h> > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > struct mm_struct *mm = kvm->mm; > > kvm_arch_sync_events(kvm); > + kvm_unregister_msix_mmio_dev(kvm); > spin_lock(&kvm_lock); > list_del(&kvm->vm_list); > spin_unlock(&kvm_lock); > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, > mutex_unlock(&kvm->lock); > break; > #endif > + case KVM_REGISTER_MSIX_MMIO: { > + struct kvm_msix_mmio_user mmio_user; > + > + r = -EFAULT; > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > + goto out; > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > + break; > + } > + case KVM_UNREGISTER_MSIX_MMIO: { > + struct kvm_msix_mmio_user mmio_user; > + > + r = -EFAULT; > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > + goto out; > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > + break; > + } > default: > r = 
kvm_arch_vm_ioctl(filp, ioctl, arg); > if (r == -ENOTTY) > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > return r; > } > #endif > + r = kvm_register_msix_mmio_dev(kvm); > + if (r < 0) { > + kvm_put_kvm(kvm); > + return r; > + } > + > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > if (r < 0) > kvm_put_kvm(kvm); > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus) > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, > int len, const void *val) > { > - int i; > + int i, r = -EOPNOTSUPP; > struct kvm_io_bus *bus; > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > - for (i = 0; i < bus->dev_count; i++) > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > + for (i = 0; i < bus->dev_count; i++) { > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > + if (r == -ENOTSYNC) > + break; > + else if (!r) > return 0; > - return -EOPNOTSUPP; > + } > + return r; > } > > /* kvm_io_bus_read - called under kvm->slots_lock */ > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > new file mode 100644 > index 0000000..dde9b62 > --- /dev/null > +++ b/virt/kvm/msix_mmio.c > @@ -0,0 +1,286 @@ > +/* > + * MSI-X MMIO emulation > + * > + * Copyright (c) 2010 Intel Corporation > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. > + * > + * Author: > + * Sheng Yang <sheng.yang@intel.com> > + */ > + > +#include <linux/kvm_host.h> > +#include <linux/kvm.h> > + > +#include "msix_mmio.h" > +#include "iodev.h" > + > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio, > + int entry, u32 flag) > +{ > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > + return kvm_assigned_device_update_msix_mask_bit(kvm, > + mmio->dev_id, entry, flag); > + return -EFAULT; > +} > + > +/* Caller must hold dev->lock */ > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > + gpa_t addr, int len) > +{ > + gpa_t start, end; > + int i, r = -EINVAL; > + > + for (i = 0; i < dev->mmio_nr; i++) { > + start = dev->mmio[i].table_base_addr; > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > + dev->mmio[i].max_entries_nr; > + if (addr >= start && addr + len <= end) { > + r = i; > + break; > + } Yet another potential user of the weight balanced tree ;) > + } > + > + return r; > +} > + > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, > + void *val) > +{ > + struct kvm_msix_mmio_dev *mmio_dev = > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > + struct kvm_msix_mmio *mmio; > + int idx, ret = 0, entry, offset, r; > + > + mutex_lock(&mmio_dev->lock); > + idx = get_mmio_table_index(mmio_dev, addr, len); > + if (idx < 0) { > + ret = -EOPNOTSUPP; > + goto out; > + } > + if ((addr & 0x3) || (len != 4 && len != 8)) > + goto out; > + > + offset = addr & 0xf; > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > + goto out; > + > + mmio = &mmio_dev->mmio[idx]; > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > + if (r) > + goto out; > +out: > + mutex_unlock(&mmio_dev->lock); > + return ret; > +} > + > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > + int len, const void *val) > +{ > + struct kvm_msix_mmio_dev *mmio_dev = > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > + struct kvm_msix_mmio 
*mmio; > + int idx, entry, offset, ret = 0, r = 0; > + gpa_t entry_base; > + u32 old_ctrl, new_ctrl; > + u32 *ctrl_pos; > + > + mutex_lock(&mmio_dev->kvm->lock); > + mutex_lock(&mmio_dev->lock); > + idx = get_mmio_table_index(mmio_dev, addr, len); > + if (idx < 0) { > + ret = -EOPNOTSUPP; > + goto out; > + } > + if ((addr & 0x3) || (len != 4 && len != 8)) > + goto out; > + > + offset = addr & 0xF; nit, this 0xf above. > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > + goto out; > + > + mmio = &mmio_dev->mmio[idx]; > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > + > + if (get_user(old_ctrl, ctrl_pos)) > + goto out; > + > + /* No allow writing to other fields when entry is unmasked */ > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > + goto out; > + > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > + goto out; > + > + if (get_user(new_ctrl, ctrl_pos)) > + goto out; > + > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > + ret = -ENOTSYNC; Couldn't we avoid the above get_user(new_ctrl,...) if we tested this first? I'd also prefer to see a direct goto out here since it's not entirely obvious that this first 'if' always falls into the below 'if'. I'm not a fan of using an error code as a special non-error return either. > + if (old_ctrl == new_ctrl) > + goto out; > + > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); An xor would remove some redundancy here if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)); > + if (r || ret) > + ret = -ENOTSYNC; > +out: > + mutex_unlock(&mmio_dev->lock); > + mutex_unlock(&mmio_dev->kvm->lock); > + return ret; > +} > + > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > + .read = msix_table_mmio_read, > + .write = msix_table_mmio_write, > +}; > + > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > +{ > + int ret; > + > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops); > + mutex_init(&kvm->msix_mmio_dev.lock); > + kvm->msix_mmio_dev.kvm = kvm; > + mutex_lock(&kvm->slots_lock); > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > + &kvm->msix_mmio_dev.table_dev); > + mutex_unlock(&kvm->slots_lock); > + return ret; > +} > + > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > +{ > + int ret; > + > + mutex_lock(&kvm->slots_lock); > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > + &kvm->msix_mmio_dev.table_dev); > + mutex_unlock(&kvm->slots_lock); > + return ret; > +} > + > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > + struct kvm_msix_mmio_user *mmio_user) > +{ > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > + struct kvm_msix_mmio *mmio = NULL; > + int r = 0, i; > + > + mutex_lock(&mmio_dev->lock); > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > + (mmio_user->type & 
KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > + mmio = &mmio_dev->mmio[i]; > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > + r = -EINVAL; > + goto out; > + } > + break; > + } > + } > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > + r = -EINVAL; > + goto out; > + } > + /* All reserved currently */ > + if (mmio_user->flags) { > + r = -EINVAL; > + goto out; > + } > + > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > + r = -EINVAL; > + goto out; > + } > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > + r = -EINVAL; > + goto out; > + } > + > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > + r = -EINVAL; > + goto out; > + } > + if (!mmio) { > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > + r = -ENOSPC; > + goto out; > + } > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > + } > + > + mmio_dev->mmio_nr++; > + mmio->max_entries_nr = mmio_user->max_entries_nr; > + mmio->dev_id = mmio_user->dev_id; > + mmio->flags = mmio_user->flags; > + > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > + mmio->table_base_addr = mmio_user->base_addr; > + mmio->table_base_va = mmio_user->base_va; > + } > +out: > + mutex_unlock(&mmio_dev->lock); > + return r; > +} > + > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) > +{ > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > + int r = 0, i, j; set r = -EINVAL here > + bool found = 0; > + > + if (!mmio) > + return 0; > + > + mutex_lock(&mmio_dev->lock); > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > + mmio_dev->mmio[i].type == mmio->type) { > + found = true; set r = 0 here > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > + mmio_dev->mmio_nr--; > + break; > + } > + } > + if (!found) > + r = -EINVAL; then get rid of found > + mutex_unlock(&mmio_dev->lock); > + return r; > +} > + > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > + struct kvm_msix_mmio_user *mmio_user) > +{ > + struct kvm_msix_mmio mmio; > + > + mmio.dev_id = mmio_user->dev_id; > + mmio.type = mmio_user->type; > + > + return kvm_free_msix_mmio(kvm, &mmio); > +} > + > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > new file mode 100644 > index 0000000..01b6587 > --- /dev/null > +++ b/virt/kvm/msix_mmio.h > @@ -0,0 +1,25 @@ > +#ifndef __KVM_MSIX_MMIO_H__ > +#define __KVM_MSIX_MMIO_H__ > +/* > + * MSI-X MMIO emulation > + * > + * Copyright (c) 2010 Intel Corporation > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. 
> + * > + * Author: > + * Sheng Yang <sheng.yang@intel.com> > + */ > + > +#include <linux/pci.h> > + > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > + struct kvm_msix_mmio_user *mmio_user); > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > + struct kvm_msix_mmio_user *mmio_user); > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user); > + > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
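To make the brute-force option Alex mentions concrete: two eventfds per MSI-X vector, one signalled by kvm when the guest masks the entry and one when it unmasks, with vfio polling both and toggling the host interrupt. Everything below is a hypothetical sketch of that idea, not an existing kvm or vfio API:

#include <sys/eventfd.h>
#include <poll.h>
#include <unistd.h>
#include <stdint.h>

struct msix_vector_ctl {
	int mask_fd;	/* kvm signals this when the guest sets the mask bit */
	int unmask_fd;	/* kvm signals this when the guest clears it */
};

/* fds would be created with eventfd(0, 0) and handed to kvm;
 * the vfio side consumes them in a loop like this. */
static void poll_vector(struct msix_vector_ctl *v)
{
	struct pollfd fds[2] = {
		{ .fd = v->mask_fd,   .events = POLLIN },
		{ .fd = v->unmask_fd, .events = POLLIN },
	};
	uint64_t cnt;

	while (poll(fds, 2, -1) > 0) {
		if (fds[0].revents & POLLIN) {
			read(v->mask_fd, &cnt, sizeof(cnt));
			/* vfio side would disable the host vector here */
		}
		if (fds[1].revents & POLLIN) {
			read(v->unmask_fd, &cnt, sizeof(cnt));
			/* vfio side would re-enable the host vector here */
		}
	}
}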
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 0:19 ` Alex Williamson @ 2011-02-23 6:59 ` Sheng Yang 2011-02-23 8:45 ` Michael S. Tsirkin 2011-02-23 16:34 ` Alex Williamson 0 siblings, 2 replies; 20+ messages in thread From: Sheng Yang @ 2011-02-23 6:59 UTC (permalink / raw) To: Alex Williamson; +Cc: Avi Kivity, Marcelo Tosatti, Michael S. Tsirkin, kvm On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > Then we can support mask bit operation of assigned devices now. > > Looks pretty good overall. A few comments below. It seems like we > should be able to hook this into vfio with a small stub in kvm. We just > need to be able to communicate disabling and enabling of individual msix > vectors. For brute force, we could do this with a lot of eventfds, > triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure > if there's something smaller that could do it. Thanks, Alex, thanks for your comments. See my comments below: > > Alex > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > --- > > > > arch/x86/kvm/Makefile | 2 +- > > arch/x86/kvm/x86.c | 8 +- > > include/linux/kvm.h | 21 ++++ > > include/linux/kvm_host.h | 25 ++++ > > virt/kvm/assigned-dev.c | 44 +++++++ > > virt/kvm/kvm_main.c | 38 ++++++- > > virt/kvm/msix_mmio.c | 286 > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > | 25 ++++ > > 8 files changed, 442 insertions(+), 7 deletions(-) > > create mode 100644 virt/kvm/msix_mmio.c > > create mode 100644 virt/kvm/msix_mmio.h > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > index f15501f..3a0d851 100644 > > --- a/arch/x86/kvm/Makefile > > +++ b/arch/x86/kvm/Makefile > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > - assigned-dev.o) > > + assigned-dev.o msix_mmio.o) > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > async_pf.o) > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index fa708c9..89bf12c 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > case KVM_CAP_XSAVE: > > > > case KVM_CAP_ASYNC_PF: > > + case KVM_CAP_MSIX_MMIO: > > r = 1; > > break; > > > > case KVM_CAP_COALESCED_MMIO: > > @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned > > long addr, > > > > struct kvm_vcpu *vcpu) > > > > { > > > > gpa_t gpa; > > > > + int r; > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > @@ -3822,14 +3824,16 @@ static int > > emulator_write_emulated_onepage(unsigned long addr, > > > > mmio: > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > /* > > > > * Is this MMIO handled locally? > > */ > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > + if (!r) > > > > return X86EMUL_CONTINUE; > > > > vcpu->mmio_needed = 1; > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? 
> > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > index ea2dc1a..ad9df4b 100644 > > --- a/include/linux/kvm.h > > +++ b/include/linux/kvm.h > > @@ -161,6 +161,7 @@ struct kvm_pit_config { > > > > #define KVM_EXIT_NMI 16 > > #define KVM_EXIT_INTERNAL_ERROR 17 > > #define KVM_EXIT_OSI 18 > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > #define KVM_INTERNAL_ERROR_EMULATION 1 > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > #define KVM_CAP_ASYNC_PF 59 > > > > +#define KVM_CAP_MSIX_MMIO 60 > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, > > 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > > > +/* Available with KVM_CAP_MSIX_MMIO */ > > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, > > 0x7e, struct kvm_msix_mmio_user) > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, > > struct kvm_pit_state2) > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > > > > __u16 padding[3]; > > > > }; > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > > + > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > > + > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > > +struct kvm_msix_mmio_user { > > + __u32 dev_id; > > + __u16 type; > > + __u16 max_entries_nr; > > + __u64 base_addr; > > + __u64 base_va; > > + __u64 flags; > > + __u64 reserved[4]; > > +}; > > + > > > > #endif /* __LINUX_KVM_H */ > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > index 7d313e0..c10670c 100644 > > --- a/include/linux/kvm_host.h > > +++ b/include/linux/kvm_host.h > > @@ -233,6 +233,27 @@ struct kvm_memslots { > > > > KVM_PRIVATE_MEM_SLOTS]; > > > > }; > > > > +#define KVM_MSIX_MMIO_MAX 32 > > I assume this is based on 32 devices/bus, but we could exceed this is we > add more buses or make use of more functions. How hard is it to make > this dynamic? We can improve this later(maybe with balance tree?). 32 should be more than sufficient for now. And this patchset has been in the mailing list for months, I really want to get it in first... > > + > > +struct kvm_msix_mmio { > > + u32 dev_id; > > + u16 type; > > + u16 max_entries_nr; > > + u64 flags; > > + gpa_t table_base_addr; > > + hva_t table_base_va; > > + gpa_t pba_base_addr; > > + hva_t pba_base_va; > > +}; > > + > > +struct kvm_msix_mmio_dev { > > + struct kvm *kvm; > > + struct kvm_io_device table_dev; > > + int mmio_nr; > > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > > + struct mutex lock; > > +}; > > + > > > > struct kvm { > > > > spinlock_t mmu_lock; > > raw_spinlock_t requests_lock; > > > > @@ -281,6 +302,7 @@ struct kvm { > > > > long mmu_notifier_count; > > > > #endif > > > > long tlbs_dirty; > > > > + struct kvm_msix_mmio_dev msix_mmio_dev; > > > > }; > > > > /* The guest did something we don't support. 
*/ > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, > > > > int kvm_request_irq_source_id(struct kvm *kvm); > > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > + int assigned_dev_id, int entry, bool mask); > > + > > > > /* For vcpu->arch.iommu_flags */ > > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > index ae72ae6..d1598a6 100644 > > --- a/virt/kvm/assigned-dev.c > > +++ b/virt/kvm/assigned-dev.c > > @@ -18,6 +18,7 @@ > > > > #include <linux/interrupt.h> > > #include <linux/slab.h> > > #include "irq.h" > > > > +#include "msix_mmio.h" > > > > static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct > > list_head *head, > > > > int assigned_dev_id) > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm, > > > > kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); > > > > } > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > > + struct kvm_assigned_dev_kernel *adev) > > +{ > > + struct kvm_msix_mmio mmio; > > + > > + mmio.dev_id = adev->assigned_dev_id; > > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > + kvm_free_msix_mmio(kvm, &mmio); > > +} > > + > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > struct kvm_assigned_dev_kernel > > *assigned_dev) > > > > { > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > > + > > > > __pci_reset_function(assigned_dev->dev); > > pci_restore_state(assigned_dev->dev); > > > > @@ -785,3 +799,33 @@ out: > > return r; > > > > } > > > > +/* The caller should hold kvm->lock */ > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > + int assigned_dev_id, int entry, bool mask) > > +{ > > + int r = -EFAULT; > > + struct kvm_assigned_dev_kernel *adev; > > + int i; > > + > > + if (!irqchip_in_kernel(kvm)) > > + return r; > > + > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > + assigned_dev_id); > > + if (!adev) > > + goto out; > > + > > + /* For non-MSIX enabled devices, entries_nr == 0 */ > > + for (i = 0; i < adev->entries_nr; i++) > > + if (adev->host_msix_entries[i].entry == entry) { > > + if (mask) > > + disable_irq_nosync( > > + adev->host_msix_entries[i].vector); > > + else > > + enable_irq(adev->host_msix_entries[i].vector); > > + r = 0; > > + break; > > + } > > +out: > > + return r; > > +} > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > index b1b6cbb..b7807c8 100644 > > --- a/virt/kvm/kvm_main.c > > +++ b/virt/kvm/kvm_main.c > > @@ -56,6 +56,7 @@ > > > > #include "coalesced_mmio.h" > > #include "async_pf.h" > > > > +#include "msix_mmio.h" > > > > #define CREATE_TRACE_POINTS > > #include <trace/events/kvm.h> > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > > > > struct mm_struct *mm = kvm->mm; > > > > kvm_arch_sync_events(kvm); > > > > + kvm_unregister_msix_mmio_dev(kvm); > > > > spin_lock(&kvm_lock); > > list_del(&kvm->vm_list); > > spin_unlock(&kvm_lock); > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, > > > > mutex_unlock(&kvm->lock); > > break; > > > > #endif > > > > + case KVM_REGISTER_MSIX_MMIO: { > > + struct kvm_msix_mmio_user mmio_user; > > + > > + r = -EFAULT; > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > + goto out; > > + r = 
kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > > + break; > > + } > > + case KVM_UNREGISTER_MSIX_MMIO: { > > + struct kvm_msix_mmio_user mmio_user; > > + > > + r = -EFAULT; > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > + goto out; > > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > > + break; > > + } > > > > default: > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > if (r == -ENOTTY) > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > > > > return r; > > > > } > > > > #endif > > > > + r = kvm_register_msix_mmio_dev(kvm); > > + if (r < 0) { > > + kvm_put_kvm(kvm); > > + return r; > > + } > > + > > > > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > > if (r < 0) > > > > kvm_put_kvm(kvm); > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus > > *bus) > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, > > > > int len, const void *val) > > > > { > > > > - int i; > > + int i, r = -EOPNOTSUPP; > > > > struct kvm_io_bus *bus; > > > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > > > > - for (i = 0; i < bus->dev_count; i++) > > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > > + for (i = 0; i < bus->dev_count; i++) { > > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > > + if (r == -ENOTSYNC) > > + break; > > + else if (!r) > > > > return 0; > > > > - return -EOPNOTSUPP; > > + } > > + return r; > > > > } > > > > /* kvm_io_bus_read - called under kvm->slots_lock */ > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > > new file mode 100644 > > index 0000000..dde9b62 > > --- /dev/null > > +++ b/virt/kvm/msix_mmio.c > > @@ -0,0 +1,286 @@ > > +/* > > + * MSI-X MMIO emulation > > + * > > + * Copyright (c) 2010 Intel Corporation > > + * > > + * This work is licensed under the terms of the GNU GPL, version 2. See > > + * the COPYING file in the top-level directory. > > + * > > + * Author: > > + * Sheng Yang <sheng.yang@intel.com> > > + */ > > + > > +#include <linux/kvm_host.h> > > +#include <linux/kvm.h> > > + > > +#include "msix_mmio.h" > > +#include "iodev.h" > > + > > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio > > *mmio, + int entry, u32 flag) > > +{ > > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > + return kvm_assigned_device_update_msix_mask_bit(kvm, > > + mmio->dev_id, entry, flag); > > + return -EFAULT; > > +} > > + > > +/* Caller must hold dev->lock */ > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > > + gpa_t addr, int len) > > +{ > > + gpa_t start, end; > > + int i, r = -EINVAL; > > + > > + for (i = 0; i < dev->mmio_nr; i++) { > > + start = dev->mmio[i].table_base_addr; > > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > > + dev->mmio[i].max_entries_nr; > > + if (addr >= start && addr + len <= end) { > > + r = i; > > + break; > > + } > > Yet another potential user of the weight balanced tree ;) Yeah, we can improve it later. 
> > > + } > > + > > + return r; > > +} > > + > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, > > int len, + void *val) > > +{ > > + struct kvm_msix_mmio_dev *mmio_dev = > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > + struct kvm_msix_mmio *mmio; > > + int idx, ret = 0, entry, offset, r; > > + > > + mutex_lock(&mmio_dev->lock); > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > + if (idx < 0) { > > + ret = -EOPNOTSUPP; > > + goto out; > > + } > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > + goto out; > > + > > + offset = addr & 0xf; > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > + goto out; > > + > > + mmio = &mmio_dev->mmio[idx]; > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > + if (r) > > + goto out; > > +out: > > + mutex_unlock(&mmio_dev->lock); > > + return ret; > > +} > > + > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > > + int len, const void *val) > > +{ > > + struct kvm_msix_mmio_dev *mmio_dev = > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > + struct kvm_msix_mmio *mmio; > > + int idx, entry, offset, ret = 0, r = 0; > > + gpa_t entry_base; > > + u32 old_ctrl, new_ctrl; > > + u32 *ctrl_pos; > > + > > + mutex_lock(&mmio_dev->kvm->lock); > > + mutex_lock(&mmio_dev->lock); > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > + if (idx < 0) { > > + ret = -EOPNOTSUPP; > > + goto out; > > + } > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > + goto out; > > + > > + offset = addr & 0xF; > > nit, this 0xf above. So you like another mask? > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > + goto out; > > + > > + mmio = &mmio_dev->mmio[idx]; > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > + > > + if (get_user(old_ctrl, ctrl_pos)) > > + goto out; > > + > > + /* No allow writing to other fields when entry is unmasked */ > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > + goto out; > > + > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > + goto out; > > + > > + if (get_user(new_ctrl, ctrl_pos)) > > + goto out; > > + > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > + ret = -ENOTSYNC; > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > first? I'd also prefer to see a direct goto out here since it's not > entirely obvious that this first 'if' always falls into the below 'if'. > I'm not a fan of using an error code as a special non-error return > either. In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to see if they indeed modified ctrl bits. I would try to make the logic here more clear. 
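One way the clarified flow might look (sketch only, reusing the patch's variable names; hits_ctrl is a made-up local meaning "this write can reach the vector control dword"):

	bool hits_ctrl = offset + len > PCI_MSIX_ENTRY_VECTOR_CTRL;

	if (get_user(old_ctrl, ctrl_pos))
		goto out;
	/* only the vector control dword may be written while unmasked */
	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out;
	if (copy_to_user((void __user *)(entry_base + offset), val, len))
		goto out;
	/* re-read ctrl only when this write could actually have hit it:
	 * a dword write at offset 12, or a qword write at offset 8 */
	if (hits_ctrl && get_user(new_ctrl, ctrl_pos))
		goto out;

The mask-bit comparison that follows would then run only under hits_ctrl, avoiding the second get_user() for plain address/data writes.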
> > > + if (old_ctrl == new_ctrl) > > + goto out; > > + > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); > > An xor would remove some redundancy here > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm, > mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)); Oh, yes, sure... > > > + if (r || ret) > > + ret = -ENOTSYNC; > > +out: > > + mutex_unlock(&mmio_dev->lock); > > + mutex_unlock(&mmio_dev->kvm->lock); > > + return ret; > > +} > > + > > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > > + .read = msix_table_mmio_read, > > + .write = msix_table_mmio_write, > > +}; > > + > > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > > +{ > > + int ret; > > + > > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops); > > + mutex_init(&kvm->msix_mmio_dev.lock); > > + kvm->msix_mmio_dev.kvm = kvm; > > + mutex_lock(&kvm->slots_lock); > > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > + &kvm->msix_mmio_dev.table_dev); > > + mutex_unlock(&kvm->slots_lock); > > + return ret; > > +} > > + > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > > +{ > > + int ret; > > + > > + mutex_lock(&kvm->slots_lock); > > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > + &kvm->msix_mmio_dev.table_dev); > > + mutex_unlock(&kvm->slots_lock); > > + return ret; > > +} > > + > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > + struct kvm_msix_mmio_user *mmio_user) > > +{ > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > + struct kvm_msix_mmio *mmio = NULL; > > + int r = 0, i; > > + > > + mutex_lock(&mmio_dev->lock); > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > + mmio = &mmio_dev->mmio[i]; > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > + r = -EINVAL; > > + goto out; > > + } > > + break; > > + } > > + } > > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > + r = -EINVAL; > > + goto out; > > + } > > + /* All reserved currently */ > > + if (mmio_user->flags) { > > + r = -EINVAL; > > + goto out; > > + } > > + > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > > + r = -EINVAL; > > + goto out; > > + } > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > + r = -EINVAL; > > + goto out; > > + } > > + > > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > > + r = -EINVAL; > > + goto out; > > + } > > + if (!mmio) { > > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > > + r = -ENOSPC; > > + goto out; > > + } > > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > > + } > > + > > + mmio_dev->mmio_nr++; > > + mmio->max_entries_nr = mmio_user->max_entries_nr; > > + mmio->dev_id = mmio_user->dev_id; > > + mmio->flags = mmio_user->flags; > > + > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > > + if ((mmio_user->type 
& KVM_MSIX_MMIO_TYPE_BASE_MASK) == > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > + mmio->table_base_addr = mmio_user->base_addr; > > + mmio->table_base_va = mmio_user->base_va; > > + } > > +out: > > + mutex_unlock(&mmio_dev->lock); > > + return r; > > +} > > + > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) > > +{ > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > + int r = 0, i, j; > > set r = -EINVAL here > > > + bool found = 0; > > + > > + if (!mmio) > > + return 0; > > + > > + mutex_lock(&mmio_dev->lock); > > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > > + mmio_dev->mmio[i].type == mmio->type) { > > + found = true; > > set r = 0 here > > > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > > + mmio_dev->mmio_nr--; > > + break; > > + } > > + } > > + if (!found) > > + r = -EINVAL; > > then get rid of found Sure. -- regards Yang, Sheng > > > + mutex_unlock(&mmio_dev->lock); > > + return r; > > +} > > + > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > + struct kvm_msix_mmio_user *mmio_user) > > +{ > > + struct kvm_msix_mmio mmio; > > + > > + mmio.dev_id = mmio_user->dev_id; > > + mmio.type = mmio_user->type; > > + > > + return kvm_free_msix_mmio(kvm, &mmio); > > +} > > + > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > > new file mode 100644 > > index 0000000..01b6587 > > --- /dev/null > > +++ b/virt/kvm/msix_mmio.h > > @@ -0,0 +1,25 @@ > > +#ifndef __KVM_MSIX_MMIO_H__ > > +#define __KVM_MSIX_MMIO_H__ > > +/* > > + * MSI-X MMIO emulation > > + * > > + * Copyright (c) 2010 Intel Corporation > > + * > > + * This work is licensed under the terms of the GNU GPL, version 2. See > > + * the COPYING file in the top-level directory. > > + * > > + * Author: > > + * Sheng Yang <sheng.yang@intel.com> > > + */ > > + > > +#include <linux/pci.h> > > + > > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > + struct kvm_msix_mmio_user *mmio_user); > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > + struct kvm_msix_mmio_user *mmio_user); > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > *mmio_user); + > > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
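For orientation, the userspace side of the ioctls reviewed above would be driven roughly like this (sketch; vm_fd, table_gpa, table_hva and nr_entries are hypothetical VMM-side names, and <linux/kvm.h>, <sys/ioctl.h> and <err.h> are assumed):

	struct kvm_msix_mmio_user mmio = {
		.dev_id         = assigned_dev_id,
		.type           = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
				  KVM_MSIX_MMIO_TYPE_BASE_TABLE,
		.max_entries_nr = nr_entries,
		.base_addr      = table_gpa,	/* guest-physical table base */
		.base_va        = (__u64)(unsigned long)table_hva,
						/* writable backing in the VMM */
		.flags          = 0,		/* reserved, must be zero */
	};

	if (ioctl(vm_fd, KVM_REGISTER_MSIX_MMIO, &mmio) < 0)
		err(1, "KVM_REGISTER_MSIX_MMIO");

	/* On a KVM_EXIT_MSIX_ROUTING_UPDATE exit, the VMM re-reads the
	 * touched entry from table_hva and updates its irq routing
	 * before re-entering the guest. */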
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 6:59 ` Sheng Yang @ 2011-02-23 8:45 ` Michael S. Tsirkin 2011-02-24 8:08 ` Sheng Yang 2011-02-24 9:44 ` Sheng Yang 2011-02-23 16:34 ` Alex Williamson 1 sibling, 2 replies; 20+ messages in thread From: Michael S. Tsirkin @ 2011-02-23 8:45 UTC (permalink / raw) To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > Then we can support mask bit operation of assigned devices now. > > > > Looks pretty good overall. A few comments below. It seems like we > > should be able to hook this into vfio with a small stub in kvm. We just > > need to be able to communicate disabling and enabling of individual msix > > vectors. For brute force, we could do this with a lot of eventfds, > > triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure > > if there's something smaller that could do it. Thanks, > > Alex, thanks for your comments. See my comments below: > > > > > Alex > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > --- > > > > > > arch/x86/kvm/Makefile | 2 +- > > > arch/x86/kvm/x86.c | 8 +- > > > include/linux/kvm.h | 21 ++++ > > > include/linux/kvm_host.h | 25 ++++ > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > virt/kvm/kvm_main.c | 38 ++++++- > > > virt/kvm/msix_mmio.c | 286 > > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > > | 25 ++++ > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > create mode 100644 virt/kvm/msix_mmio.c > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > index f15501f..3a0d851 100644 > > > --- a/arch/x86/kvm/Makefile > > > +++ b/arch/x86/kvm/Makefile > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > - assigned-dev.o) > > > + assigned-dev.o msix_mmio.o) > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > > async_pf.o) > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > index fa708c9..89bf12c 100644 > > > --- a/arch/x86/kvm/x86.c > > > +++ b/arch/x86/kvm/x86.c > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > case KVM_CAP_XSAVE: > > > > > > case KVM_CAP_ASYNC_PF: > > > + case KVM_CAP_MSIX_MMIO: > > > r = 1; > > > break; > > > > > > case KVM_CAP_COALESCED_MMIO: > > > @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned > > > long addr, > > > > > > struct kvm_vcpu *vcpu) > > > > > > { > > > > > > gpa_t gpa; > > > > > > + int r; > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > @@ -3822,14 +3824,16 @@ static int > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > mmio: > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > /* > > > > > > * Is this MMIO handled locally? 
> > > */ > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > + if (!r) > > > > > > return X86EMUL_CONTINUE; > > > > > > vcpu->mmio_needed = 1; > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; This use of -ENOTSYNC is IMO confusing. How about we make vcpu_mmio_write return the positive exit reason? Negative value will mean an error. > > > > > > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > > index ea2dc1a..ad9df4b 100644 > > > --- a/include/linux/kvm.h > > > +++ b/include/linux/kvm.h > > > @@ -161,6 +161,7 @@ struct kvm_pit_config { > > > > > > #define KVM_EXIT_NMI 16 > > > #define KVM_EXIT_INTERNAL_ERROR 17 > > > #define KVM_EXIT_OSI 18 > > > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > > #define KVM_INTERNAL_ERROR_EMULATION 1 > > > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > > > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > > #define KVM_CAP_ASYNC_PF 59 > > > > > > +#define KVM_CAP_MSIX_MMIO 60 > > > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > > > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > > kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, > > > 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > > > > > +/* Available with KVM_CAP_MSIX_MMIO */ > > > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct > > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, > > > 0x7e, struct kvm_msix_mmio_user) > > > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, > > > struct kvm_pit_state2) > > > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > > > > > > __u16 padding[3]; > > > > > > }; > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > > > + > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > > > + > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > > > +struct kvm_msix_mmio_user { > > > + __u32 dev_id; > > > + __u16 type; > > > + __u16 max_entries_nr; > > > + __u64 base_addr; > > > + __u64 base_va; > > > + __u64 flags; > > > + __u64 reserved[4]; > > > +}; > > > + > > > > > > #endif /* __LINUX_KVM_H */ > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > > index 7d313e0..c10670c 100644 > > > --- a/include/linux/kvm_host.h > > > +++ b/include/linux/kvm_host.h > > > @@ -233,6 +233,27 @@ struct kvm_memslots { > > > > > > KVM_PRIVATE_MEM_SLOTS]; > > > > > > }; > > > > > > +#define KVM_MSIX_MMIO_MAX 32 > > > > I assume this is based on 32 devices/bus, but we could exceed this is we > > add more buses or make use of more functions. How hard is it to make > > this dynamic? > > We can improve this later(maybe with balance tree?). 32 should be more than > sufficient for now. And this patchset has been in the mailing list for months, I > really want to get it in first... 
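Should the fixed array ever become limiting, growing it on demand is a small change (sketch; assumes mmio becomes a kmalloc'ed pointer and kvm_msix_mmio_dev gains a mmio_cap field):

	if (mmio_dev->mmio_nr == mmio_dev->mmio_cap) {
		int cap = mmio_dev->mmio_cap ? 2 * mmio_dev->mmio_cap : 16;
		struct kvm_msix_mmio *p;

		p = krealloc(mmio_dev->mmio, cap * sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
		mmio_dev->mmio = p;
		mmio_dev->mmio_cap = cap;
	}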
> > > > + > > > +struct kvm_msix_mmio { > > > + u32 dev_id; > > > + u16 type; > > > + u16 max_entries_nr; > > > + u64 flags; > > > + gpa_t table_base_addr; > > > + hva_t table_base_va; > > > + gpa_t pba_base_addr; > > > + hva_t pba_base_va; > > > +}; > > > + > > > +struct kvm_msix_mmio_dev { > > > + struct kvm *kvm; > > > + struct kvm_io_device table_dev; > > > + int mmio_nr; > > > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > > > + struct mutex lock; > > > +}; > > > + > > > > > > struct kvm { > > > > > > spinlock_t mmu_lock; > > > raw_spinlock_t requests_lock; > > > > > > @@ -281,6 +302,7 @@ struct kvm { > > > > > > long mmu_notifier_count; > > > > > > #endif > > > > > > long tlbs_dirty; > > > > > > + struct kvm_msix_mmio_dev msix_mmio_dev; > > > > > > }; > > > > > > /* The guest did something we don't support. */ > > > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, > > > > > > int kvm_request_irq_source_id(struct kvm *kvm); > > > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > + int assigned_dev_id, int entry, bool mask); > > > + > > > > > > /* For vcpu->arch.iommu_flags */ > > > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > > index ae72ae6..d1598a6 100644 > > > --- a/virt/kvm/assigned-dev.c > > > +++ b/virt/kvm/assigned-dev.c > > > @@ -18,6 +18,7 @@ > > > > > > #include <linux/interrupt.h> > > > #include <linux/slab.h> > > > #include "irq.h" > > > > > > +#include "msix_mmio.h" > > > > > > static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct > > > list_head *head, > > > > > > int assigned_dev_id) > > > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm, > > > > > > kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); > > > > > > } > > > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > > > + struct kvm_assigned_dev_kernel *adev) > > > +{ > > > + struct kvm_msix_mmio mmio; > > > + > > > + mmio.dev_id = adev->assigned_dev_id; > > > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > + kvm_free_msix_mmio(kvm, &mmio); > > > +} > > > + > > > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > > > struct kvm_assigned_dev_kernel > > > *assigned_dev) > > > > > > { > > > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > > > + > > > > > > __pci_reset_function(assigned_dev->dev); > > > pci_restore_state(assigned_dev->dev); > > > > > > @@ -785,3 +799,33 @@ out: > > > return r; > > > > > > } > > > > > > +/* The caller should hold kvm->lock */ > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > + int assigned_dev_id, int entry, bool mask) > > > +{ > > > + int r = -EFAULT; > > > + struct kvm_assigned_dev_kernel *adev; > > > + int i; > > > + > > > + if (!irqchip_in_kernel(kvm)) > > > + return r; > > > + > > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > > + assigned_dev_id); > > > + if (!adev) > > > + goto out; > > > + > > > + /* For non-MSIX enabled devices, entries_nr == 0 */ > > > + for (i = 0; i < adev->entries_nr; i++) > > > + if (adev->host_msix_entries[i].entry == entry) { > > > + if (mask) > > > + disable_irq_nosync( > > > + adev->host_msix_entries[i].vector); > > > + else > > > + enable_irq(adev->host_msix_entries[i].vector); > > > + r = 0; > > > + 
break; > > > + } > > > +out: > > > + return r; > > > +} > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > index b1b6cbb..b7807c8 100644 > > > --- a/virt/kvm/kvm_main.c > > > +++ b/virt/kvm/kvm_main.c > > > @@ -56,6 +56,7 @@ > > > > > > #include "coalesced_mmio.h" > > > #include "async_pf.h" > > > > > > +#include "msix_mmio.h" > > > > > > #define CREATE_TRACE_POINTS > > > #include <trace/events/kvm.h> > > > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > > > > > > struct mm_struct *mm = kvm->mm; > > > > > > kvm_arch_sync_events(kvm); > > > > > > + kvm_unregister_msix_mmio_dev(kvm); > > > > > > spin_lock(&kvm_lock); > > > list_del(&kvm->vm_list); > > > spin_unlock(&kvm_lock); > > > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, > > > > > > mutex_unlock(&kvm->lock); > > > break; > > > > > > #endif > > > > > > + case KVM_REGISTER_MSIX_MMIO: { > > > + struct kvm_msix_mmio_user mmio_user; > > > + > > > + r = -EFAULT; > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > + goto out; > > > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > > > + break; > > > + } > > > + case KVM_UNREGISTER_MSIX_MMIO: { > > > + struct kvm_msix_mmio_user mmio_user; > > > + > > > + r = -EFAULT; > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > + goto out; > > > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > > > + break; > > > + } > > > > > > default: > > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > > if (r == -ENOTTY) > > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > > > > > > return r; > > > > > > } > > > > > > #endif > > > > > > + r = kvm_register_msix_mmio_dev(kvm); > > > + if (r < 0) { > > > + kvm_put_kvm(kvm); > > > + return r; > > > + } > > > + > > > > > > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > > > if (r < 0) > > > > > > kvm_put_kvm(kvm); > > > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus > > > *bus) > > > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, > > > > > > int len, const void *val) > > > > > > { > > > > > > - int i; > > > + int i, r = -EOPNOTSUPP; > > > > > > struct kvm_io_bus *bus; > > > > > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > > > > > > - for (i = 0; i < bus->dev_count; i++) > > > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > > > + for (i = 0; i < bus->dev_count; i++) { > > > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > > > + if (r == -ENOTSYNC) > > > + break; > > > + else if (!r) > > > > > > return 0; So the above will be if (r >= 0) return r; > > > > > > - return -EOPNOTSUPP; > > > + } > > > + return r; > > > > > > } > > > > > > /* kvm_io_bus_read - called under kvm->slots_lock */ > > > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > > > new file mode 100644 > > > index 0000000..dde9b62 > > > --- /dev/null > > > +++ b/virt/kvm/msix_mmio.c > > > @@ -0,0 +1,286 @@ > > > +/* > > > + * MSI-X MMIO emulation > > > + * > > > + * Copyright (c) 2010 Intel Corporation > > > + * > > > + * This work is licensed under the terms of the GNU GPL, version 2. See > > > + * the COPYING file in the top-level directory. 
> > > + * > > > + * Author: > > > + * Sheng Yang <sheng.yang@intel.com> > > > + */ > > > + > > > +#include <linux/kvm_host.h> > > > +#include <linux/kvm.h> > > > + > > > +#include "msix_mmio.h" > > > +#include "iodev.h" > > > + > > > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio > > > *mmio, + int entry, u32 flag) > > > +{ > > > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > + return kvm_assigned_device_update_msix_mask_bit(kvm, > > > + mmio->dev_id, entry, flag); > > > + return -EFAULT; > > > +} > > > + > > > +/* Caller must hold dev->lock */ > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > > > + gpa_t addr, int len) > > > +{ > > > + gpa_t start, end; > > > + int i, r = -EINVAL; > > > + > > > + for (i = 0; i < dev->mmio_nr; i++) { > > > + start = dev->mmio[i].table_base_addr; > > > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > > > + dev->mmio[i].max_entries_nr; > > > + if (addr >= start && addr + len <= end) { > > > + r = i; > > > + break; > > > + } > > > > Yet another potential user of the weight balanced tree ;) > > Yeah, we can improve it later. > > > > > + } > > > + > > > + return r; > > > +} > > > + > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, > > > int len, + void *val) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > + struct kvm_msix_mmio *mmio; > > > + int idx, ret = 0, entry, offset, r; > > > + > > > + mutex_lock(&mmio_dev->lock); > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > + if (idx < 0) { > > > + ret = -EOPNOTSUPP; > > > + goto out; > > > + } > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > + goto out; > > > + > > > + offset = addr & 0xf; > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > + goto out; This is not as strict as it could be. Also see below (in write) for an idea how to make the code clearer. > > > + > > > + mmio = &mmio_dev->mmio[idx]; > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > + if (r) > > > + goto out; Want to give the caller some indication of an error? If not add a comment why. > > > +out: > > > + mutex_unlock(&mmio_dev->lock); > > > + return ret; > > > +} > > > + > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > > > + int len, const void *val) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > + struct kvm_msix_mmio *mmio; > > > + int idx, entry, offset, ret = 0, r = 0; > > > + gpa_t entry_base; > > > + u32 old_ctrl, new_ctrl; These are really __le32. Pls make them so and then fix up the sparse errors. > > > + u32 *ctrl_pos; get_user should not be done on u32 *, it should be __user. Does not sparse complain about this? > > > + > > > + mutex_lock(&mmio_dev->kvm->lock); > > > + mutex_lock(&mmio_dev->lock); > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > + if (idx < 0) { > > > + ret = -EOPNOTSUPP; > > > + goto out; > > > + } > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > + goto out; > > > + > > > + offset = addr & 0xF; > > > > nit, this 0xf above. > > So you like another mask? 
Yes, this should be offset = addr % PCI_MSIX_ENTRY_SIZE; > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > + goto out; I think it's a good idea to be as strict as possible. Also let's quote the spec. Finally using % will make the logic more transparent. How about the below? /* For all accesses to MSI-X Table and MSI-X PBA fields, software must use aligned full DWORD or aligned full QWORD transactions; otherwise, the result is undefined. */ if (len == 4) { if (addr % 4) goto out; } else if (len == 8) { if (addr % 8) goto out; } else goto out; As read needs same logic it's likely a good idea to add a function for this. > > > + > > > + mmio = &mmio_dev->mmio[idx]; > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > + > > > + if (get_user(old_ctrl, ctrl_pos)) It's probably a good idea to do access_ok check for the whole entry here, and then use __get_user etc below, these are cheaper. > > > + goto out; > > > + > > > + /* No allow writing to other fields when entry is unmasked */ Do not allow. > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > + goto out; > > > + > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) any such cast is suspect. Is entry_base really gpa? If it was you can not cast it to __user. I think it should be void __user *. > > > + goto out; > > > + > > > + if (get_user(new_ctrl, ctrl_pos)) > > > + goto out; get_user assumes aligned address. But there's no guarantee ctrl_pos is aligned because table_base_va might be misaligned. > > > + > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > + ret = -ENOTSYNC; Better to use != and not <, effectively same. Or in the above we can do: bool control; if (len == 4) { if (addr % 4) goto out; control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL; } else if (len == 8) { if (addr % 8) goto out; control = offset == PCI_MSIX_ENTRY_VECTOR_DATA; } else goto out; > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > first? I'd also prefer to see a direct goto out here since it's not > > entirely obvious that this first 'if' always falls into the below 'if'. > > I'm not a fan of using an error code as a special non-error return > > either. > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to > see if they indeed modified ctrl bits. > > I would try to make the logic here more clear. > > > > > + if (old_ctrl == new_ctrl) > > > + goto out; > > > + > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > > > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); > > > > An xor would remove some redundancy here > > > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & > > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm, > > mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)); > > Oh, yes, sure... Also why not set ret to correct value directly? > > > > > + if (r || ret) > > > + ret = -ENOTSYNC; when is ret set? And why not to the correct value directly? 
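Concretely, folding the xor form and the direct assignment together, the tail of the write handler might read (sketch; routing_changed is a stand-in for "the write touched the address/data fields", by whichever test the next spin settles on):

	if ((old_ctrl ^ new_ctrl) & PCI_MSIX_ENTRY_CTRL_MASKBIT)
		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
	/* one assignment, one place: routing-relevant writes and failed
	 * mask flips both need a userspace exit */
	ret = (routing_changed || r) ? -ENOTSYNC : 0;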
> > > +out: > > > + mutex_unlock(&mmio_dev->lock); > > > + mutex_unlock(&mmio_dev->kvm->lock); > > > + return ret; > > > +} > > > + > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > > > + .read = msix_table_mmio_read, > > > + .write = msix_table_mmio_write, > > > +}; > > > + > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > > > +{ > > > + int ret; > > > + > > > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops); > > > + mutex_init(&kvm->msix_mmio_dev.lock); > > > + kvm->msix_mmio_dev.kvm = kvm; > > > + mutex_lock(&kvm->slots_lock); > > > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > > + &kvm->msix_mmio_dev.table_dev); > > > + mutex_unlock(&kvm->slots_lock); > > > + return ret; > > > +} > > > + > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > > > +{ > > > + int ret; > > > + > > > + mutex_lock(&kvm->slots_lock); > > > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > > + &kvm->msix_mmio_dev.table_dev); > > > + mutex_unlock(&kvm->slots_lock); > > > + return ret; > > > +} > > > + > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > + struct kvm_msix_mmio_user *mmio_user) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > + struct kvm_msix_mmio *mmio = NULL; > > > + int r = 0, i; > > > + > > > + mutex_lock(&mmio_dev->lock); > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > > + mmio = &mmio_dev->mmio[i]; > > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + break; > > > + } > > > + } > > > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + /* All reserved currently */ > > > + if (mmio_user->flags) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + > > > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > > > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + if (!mmio) { > > > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > > > + r = -ENOSPC; > > > + goto out; > > > + } > > > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > > > + } > > > + > > > + mmio_dev->mmio_nr++; > > > + mmio->max_entries_nr = mmio_user->max_entries_nr; > > > + mmio->dev_id = mmio_user->dev_id; > > > + mmio->flags = mmio_user->flags; > > > + > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > + mmio->table_base_addr = mmio_user->base_addr; > > > + mmio->table_base_va = mmio_user->base_va; Also check alignment for these values. base_va should be at least 4 byte aligned. Maybe more, need to think about it ... Look in spec for table_base_addr alignment. 
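For reference, PCI Local Bus 3.0 keeps the MSI-X table QWORD-aligned (the low three bits of the Table Offset/BIR dword hold the BIR, so the offset itself is 8-byte aligned). A registration-time check matching that could be (sketch):

	/* the physical table base is 8-byte aligned by construction;
	 * demand the same of the userspace backing so the get_user()/
	 * copy_*_user() accesses on entries stay naturally aligned */
	if ((mmio_user->base_addr & 0x7) || (mmio_user->base_va & 0x7)) {
		r = -EINVAL;
		goto out;
	}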
> > > + } > > > +out: > > > + mutex_unlock(&mmio_dev->lock); > > > + return r; > > > +} > > > + > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > + int r = 0, i, j; > > > > set r = -EINVAL here > > > > > + bool found = 0; > > > + > > > + if (!mmio) > > > + return 0; > > > + > > > + mutex_lock(&mmio_dev->lock); > > > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > > > + mmio_dev->mmio[i].type == mmio->type) { > > > + found = true; > > > > set r = 0 here > > > > > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > > > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > > > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > > > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > > > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > > > + mmio_dev->mmio_nr--; > > > + break; > > > + } > > > + } > > > + if (!found) > > > + r = -EINVAL; > > > > then get rid of found > > Sure. > > -- > regards > Yang, Sheng > > > > > > + mutex_unlock(&mmio_dev->lock); > > > + return r; > > > +} > > > + > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > + struct kvm_msix_mmio_user *mmio_user) > > > +{ > > > + struct kvm_msix_mmio mmio; > > > + > > > + mmio.dev_id = mmio_user->dev_id; > > > + mmio.type = mmio_user->type; > > > + > > > + return kvm_free_msix_mmio(kvm, &mmio); > > > +} > > > + > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > > > new file mode 100644 > > > index 0000000..01b6587 > > > --- /dev/null > > > +++ b/virt/kvm/msix_mmio.h > > > @@ -0,0 +1,25 @@ > > > +#ifndef __KVM_MSIX_MMIO_H__ > > > +#define __KVM_MSIX_MMIO_H__ > > > +/* > > > + * MSI-X MMIO emulation > > > + * > > > + * Copyright (c) 2010 Intel Corporation > > > + * > > > + * This work is licensed under the terms of the GNU GPL, version 2. See > > > + * the COPYING file in the top-level directory. > > > + * > > > + * Author: > > > + * Sheng Yang <sheng.yang@intel.com> > > > + */ > > > + > > > +#include <linux/pci.h> > > > + > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > + struct kvm_msix_mmio_user *mmio_user); > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > + struct kvm_msix_mmio_user *mmio_user); > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > > *mmio_user); + > > > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
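For illustration, with the r initialisation moved as Alex suggested, the lookup loop in kvm_free_msix_mmio() reduces to the sketch below. One more thing worth a look in the next spin: the original clears mmio[mmio_nr] before the mmio_nr-- , i.e. one slot past the live entries (out of bounds when mmio_nr == KVM_MSIX_MMIO_MAX); clearing after the decrement hits the intended stale slot.

	int r = -EINVAL, i, j;
	...
	for (i = 0; i < mmio_dev->mmio_nr; i++) {
		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
		    mmio_dev->mmio[i].type == mmio->type) {
			r = 0;
			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
			mmio_dev->mmio_nr--;
			memset(&mmio_dev->mmio[mmio_dev->mmio_nr], 0,
			       sizeof(mmio_dev->mmio[0]));
			break;
		}
	}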
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 8:45 ` Michael S. Tsirkin @ 2011-02-24 8:08 ` Sheng Yang 2011-02-24 10:11 ` Michael S. Tsirkin 2011-02-24 9:44 ` Sheng Yang 1 sibling, 1 reply; 20+ messages in thread From: Sheng Yang @ 2011-02-24 8:08 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > Then we can support mask bit operation of assigned devices now. > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > should be able to hook this into vfio with a small stub in kvm. We > > > just need to be able to communicate disabling and enabling of > > > individual msix vectors. For brute force, we could do this with a lot > > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X > > > vector. Not sure if there's something smaller that could do it. > > > Thanks, > > > > Alex, thanks for your comments. See my comments below: > > > Alex > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > --- > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > arch/x86/kvm/x86.c | 8 +- > > > > include/linux/kvm.h | 21 ++++ > > > > include/linux/kvm_host.h | 25 ++++ > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > virt/kvm/msix_mmio.c | 286 > > > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > > > > > > > | 25 ++++ > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > index f15501f..3a0d851 100644 > > > > --- a/arch/x86/kvm/Makefile > > > > +++ b/arch/x86/kvm/Makefile > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > - assigned-dev.o) > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > > > async_pf.o) > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > index fa708c9..89bf12c 100644 > > > > --- a/arch/x86/kvm/x86.c > > > > +++ b/arch/x86/kvm/x86.c > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > case KVM_CAP_XSAVE: > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > + case KVM_CAP_MSIX_MMIO: > > > > r = 1; > > > > break; > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > @@ -3807,6 +3808,7 @@ static int > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > { > > > > > > > > gpa_t gpa; > > > > > > > > + int r; > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > mmio: > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > /* > > > > > > > > * Is this MMIO handled locally? 
> > > > */ > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > + if (!r) > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > This use of -ENOTSYNC is IMO confusing. > How about we make vcpu_mmio_write return the positive exit reason? > Negative value will mean an error. Make sense. I would update it. > > > > > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > > > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > > > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > > > index ea2dc1a..ad9df4b 100644 > > > > --- a/include/linux/kvm.h > > > > +++ b/include/linux/kvm.h > > > > @@ -161,6 +161,7 @@ struct kvm_pit_config { > > > > > > > > #define KVM_EXIT_NMI 16 > > > > #define KVM_EXIT_INTERNAL_ERROR 17 > > > > #define KVM_EXIT_OSI 18 > > > > > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > > > > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > > > #define KVM_INTERNAL_ERROR_EMULATION 1 > > > > > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > > > > > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > > > #define KVM_CAP_ASYNC_PF 59 > > > > > > > > +#define KVM_CAP_MSIX_MMIO 60 > > > > > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > > > > > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > > > kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, > > > > 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > > > > > > > +/* Available with KVM_CAP_MSIX_MMIO */ > > > > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct > > > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, > > > > 0x7e, struct kvm_msix_mmio_user) > > > > > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, > > > > struct kvm_pit_state2) > > > > > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > > > > > > > > __u16 padding[3]; > > > > > > > > }; > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > > > > + > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > > > > + > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > > > > +struct kvm_msix_mmio_user { > > > > + __u32 dev_id; > > > > + __u16 type; > > > > + __u16 max_entries_nr; > > > > + __u64 base_addr; > > > > + __u64 base_va; > > > > + __u64 flags; > > > > + __u64 reserved[4]; > > > > +}; > > > > + > > > > > > > > #endif /* __LINUX_KVM_H */ > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > > > index 7d313e0..c10670c 100644 > > > > --- a/include/linux/kvm_host.h > > > > +++ b/include/linux/kvm_host.h > > > > @@ -233,6 +233,27 @@ struct kvm_memslots { > > > > > > > > KVM_PRIVATE_MEM_SLOTS]; > > > > > > > > }; > > > > > > > > +#define KVM_MSIX_MMIO_MAX 32 > > > > > > I assume this is based on 32 devices/bus, but we could exceed this is > > > we add more buses or make use of more functions. How hard is it to > > > make this dynamic? > > > > We can improve this later(maybe with balance tree?). 32 should be more > > than sufficient for now. 
And this patchset has been in the mailing list > > for months, I really want to get it in first... > > > > > > + > > > > +struct kvm_msix_mmio { > > > > + u32 dev_id; > > > > + u16 type; > > > > + u16 max_entries_nr; > > > > + u64 flags; > > > > + gpa_t table_base_addr; > > > > + hva_t table_base_va; > > > > + gpa_t pba_base_addr; > > > > + hva_t pba_base_va; > > > > +}; > > > > + > > > > +struct kvm_msix_m4mio_dev { > > > > + struct kvm *kvm; > > > > + struct kvm_io_device table_dev; > > > > + int mmio_nr; > > > > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > > > > + struct mutex lock; > > > > +}; > > > > + > > > > > > > > struct kvm { > > > > > > > > spinlock_t mmu_lock; > > > > raw_spinlock_t requests_lock; > > > > > > > > @@ -281,6 +302,7 @@ struct kvm { > > > > > > > > long mmu_notifier_count; > > > > > > > > #endif > > > > > > > > long tlbs_dirty; > > > > > > > > + struct kvm_msix_mmio_dev msix_mmio_dev; > > > > > > > > }; > > > > > > > > /* The guest did something we don't support. */ > > > > > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm > > > > *kvm, > > > > > > > > int kvm_request_irq_source_id(struct kvm *kvm); > > > > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > + int assigned_dev_id, int entry, bool mask); > > > > + > > > > > > > > /* For vcpu->arch.iommu_flags */ > > > > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > > > > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > > > index ae72ae6..d1598a6 100644 > > > > --- a/virt/kvm/assigned-dev.c > > > > +++ b/virt/kvm/assigned-dev.c > > > > @@ -18,6 +18,7 @@ > > > > > > > > #include <linux/interrupt.h> > > > > #include <linux/slab.h> > > > > #include "irq.h" > > > > > > > > +#include "msix_mmio.h" > > > > > > > > static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct > > > > list_head *head, > > > > > > > > int assigned_dev_id) > > > > > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm > > > > *kvm, > > > > > > > > kvm_deassign_irq(kvm, assigned_dev, > > > > assigned_dev->irq_requested_type); > > > > > > > > } > > > > > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > > > > + struct kvm_assigned_dev_kernel *adev) > > > > +{ > > > > + struct kvm_msix_mmio mmio; > > > > + > > > > + mmio.dev_id = adev->assigned_dev_id; > > > > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > + kvm_free_msix_mmio(kvm, &mmio); > > > > +} > > > > + > > > > > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > > > > > struct kvm_assigned_dev_kernel > > > > *assigned_dev) > > > > > > > > { > > > > > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > > > > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > > > > + > > > > > > > > __pci_reset_function(assigned_dev->dev); > > > > pci_restore_state(assigned_dev->dev); > > > > > > > > @@ -785,3 +799,33 @@ out: > > > > return r; > > > > > > > > } > > > > > > > > +/* The caller should hold kvm->lock */ > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > + int assigned_dev_id, int entry, bool mask) > > > > +{ > > > > + int r = -EFAULT; > > > > + struct kvm_assigned_dev_kernel *adev; > > > > + int i; > > > > + > > > > + if (!irqchip_in_kernel(kvm)) > > > > + return r; > > > > + > > > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > > > + assigned_dev_id); > > > > + if (!adev) > > > > 
+ goto out; > > > > + > > > > + /* For non-MSIX enabled devices, entries_nr == 0 */ > > > > + for (i = 0; i < adev->entries_nr; i++) > > > > + if (adev->host_msix_entries[i].entry == entry) { > > > > + if (mask) > > > > + disable_irq_nosync( > > > > + adev->host_msix_entries[i].vector); > > > > + else > > > > + enable_irq(adev->host_msix_entries[i].vector); > > > > + r = 0; > > > > + break; > > > > + } > > > > +out: > > > > + return r; > > > > +} > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > > index b1b6cbb..b7807c8 100644 > > > > --- a/virt/kvm/kvm_main.c > > > > +++ b/virt/kvm/kvm_main.c > > > > @@ -56,6 +56,7 @@ > > > > > > > > #include "coalesced_mmio.h" > > > > #include "async_pf.h" > > > > > > > > +#include "msix_mmio.h" > > > > > > > > #define CREATE_TRACE_POINTS > > > > #include <trace/events/kvm.h> > > > > > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > > > > > > > > struct mm_struct *mm = kvm->mm; > > > > > > > > kvm_arch_sync_events(kvm); > > > > > > > > + kvm_unregister_msix_mmio_dev(kvm); > > > > > > > > spin_lock(&kvm_lock); > > > > list_del(&kvm->vm_list); > > > > spin_unlock(&kvm_lock); > > > > > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, > > > > > > > > mutex_unlock(&kvm->lock); > > > > break; > > > > > > > > #endif > > > > > > > > + case KVM_REGISTER_MSIX_MMIO: { > > > > + struct kvm_msix_mmio_user mmio_user; > > > > + > > > > + r = -EFAULT; > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > + goto out; > > > > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > > > > + break; > > > > + } > > > > + case KVM_UNREGISTER_MSIX_MMIO: { > > > > + struct kvm_msix_mmio_user mmio_user; > > > > + > > > > + r = -EFAULT; > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > + goto out; > > > > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > > > > + break; > > > > + } > > > > > > > > default: > > > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > > > if (r == -ENOTTY) > > > > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > > > > > > > > return r; > > > > > > > > } > > > > > > > > #endif > > > > > > > > + r = kvm_register_msix_mmio_dev(kvm); > > > > + if (r < 0) { > > > > + kvm_put_kvm(kvm); > > > > + return r; > > > > + } > > > > + > > > > > > > > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > > > > if (r < 0) > > > > > > > > kvm_put_kvm(kvm); > > > > > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct > > > > kvm_io_bus *bus) > > > > > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t > > > > addr, > > > > > > > > int len, const void *val) > > > > > > > > { > > > > > > > > - int i; > > > > + int i, r = -EOPNOTSUPP; > > > > > > > > struct kvm_io_bus *bus; > > > > > > > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > > > > > > > > - for (i = 0; i < bus->dev_count; i++) > > > > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > > > > + for (i = 0; i < bus->dev_count; i++) { > > > > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > > > > + if (r == -ENOTSYNC) > > > > + break; > > > > + else if (!r) > > > > > > > > return 0; > > So the above will be > if (r >= 0) > return r; > > > > > - return -EOPNOTSUPP; > > > > + } > > > > + return r; > > > > > > > > } > > > > > > > > /* kvm_io_bus_read - called under kvm->slots_lock */ > > > > > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > > > > new file mode 100644 > > > > index 
0000000..dde9b62 > > > > --- /dev/null > > > > +++ b/virt/kvm/msix_mmio.c > > > > @@ -0,0 +1,286 @@ > > > > +/* > > > > + * MSI-X MMIO emulation > > > > + * > > > > + * Copyright (c) 2010 Intel Corporation > > > > + * > > > > + * This work is licensed under the terms of the GNU GPL, version 2. > > > > See + * the COPYING file in the top-level directory. > > > > + * > > > > + * Author: > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > + */ > > > > + > > > > +#include <linux/kvm_host.h> > > > > +#include <linux/kvm.h> > > > > + > > > > +#include "msix_mmio.h" > > > > +#include "iodev.h" > > > > + > > > > +static int update_msix_mask_bit(struct kvm *kvm, struct > > > > kvm_msix_mmio *mmio, + int entry, u32 flag) > > > > +{ > > > > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > + return kvm_assigned_device_update_msix_mask_bit(kvm, > > > > + mmio->dev_id, entry, flag); > > > > + return -EFAULT; > > > > +} > > > > + > > > > +/* Caller must hold dev->lock */ > > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > > > > + gpa_t addr, int len) > > > > +{ > > > > + gpa_t start, end; > > > > + int i, r = -EINVAL; > > > > + > > > > + for (i = 0; i < dev->mmio_nr; i++) { > > > > + start = dev->mmio[i].table_base_addr; > > > > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > > > > + dev->mmio[i].max_entries_nr; > > > > + if (addr >= start && addr + len <= end) { > > > > + r = i; > > > > + break; > > > > + } > > > > > > Yet another potential user of the weight balanced tree ;) > > > > Yeah, we can improve it later. > > > > > > + } > > > > + > > > > + return r; > > > > +} > > > > + > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t > > > > addr, int len, + void *val) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > + struct kvm_msix_mmio *mmio; > > > > + int idx, ret = 0, entry, offset, r; > > > > + > > > > + mutex_lock(&mmio_dev->lock); > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > + if (idx < 0) { > > > > + ret = -EOPNOTSUPP; > > > > + goto out; > > > > + } > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > + goto out; > > > > + > > > > + offset = addr & 0xf; > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > + goto out; > > This is not as strict as it could be. Also > see below (in write) for an idea how to make the code clearer. > > > > > + > > > > + mmio = &mmio_dev->mmio[idx]; > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > > + if (r) > > > > + goto out; > > Want to give the caller some indication of an error? > If not add a comment why. I don't think we have a proper way to indicate the error now. Also the table_base_va already passed the checking when MMIO registered. We can add more error info later. > > > > > +out: > > > > + mutex_unlock(&mmio_dev->lock); > > > > + return ret; > > > > +} > > > > + > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t > > > > addr, + int len, const void *val) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > + struct kvm_msix_mmio *mmio; > > > > + int idx, entry, offset, ret = 0, r = 0; > > > > + gpa_t entry_base; > > > > + u32 old_ctrl, new_ctrl; > > These are really __le32. 
Pls make them so and then > fix up the sparse errors. > > > > > + u32 *ctrl_pos; > > get_user should not be done on u32 *, it should be > __user. Does not sparse complain about this? > Haven't use sparse to check the patches. Would do it later. > > > > + > > > > + mutex_lock(&mmio_dev->kvm->lock); > > > > + mutex_lock(&mmio_dev->lock); > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > + if (idx < 0) { > > > > + ret = -EOPNOTSUPP; > > > > + goto out; > > > > + } > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > + goto out; > > > > + > > > > + offset = addr & 0xF; > > > > > > nit, this 0xf above. > > > > So you like another mask? > > Yes, this should be > > offset = addr % PCI_MSIX_ENTRY_SIZE; > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > + goto out; > > I think it's a good idea to be as strict as possible. > Also let's quote the spec. Finally using % will make > the logic more transparent. How about the below? > > /* For all accesses to MSI-X Table and MSI-X PBA fields, software must use > aligned full DWORD or aligned full QWORD transactions; otherwise, the > result is undefined. */ > if (len == 4) { > if (addr % 4) > goto out; > } else if (len == 8) { > if (addr % 8) > goto out; > } else > goto out; > > As read needs same logic it's likely a good idea > to add a function for this. > > > > > + > > > > + mmio = &mmio_dev->mmio[idx]; > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > > + > > > > + if (get_user(old_ctrl, ctrl_pos)) > > It's probably a good idea to do access_ok check for the whole > entry here, and then use __get_user etc below, these are cheaper. table_base_va already passed access_ok checking. I am not sure if we need to do it again. > > > > > + goto out; > > > > + > > > > + /* No allow writing to other fields when entry is unmasked */ > > Do not allow. > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > + goto out; > > > > + > > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > any such cast is suspect. > Is entry_base really gpa? If it was you can not cast it to __user. HVA. > I think it should be void __user *. > > > > > + goto out; > > > > + > > > > + if (get_user(new_ctrl, ctrl_pos)) > > > > + goto out; > > get_user assumes aligned address. But there's no guarantee ctrl_pos > is aligned because table_base_va might be misaligned. table_base_va already passed access_ok() check. > > > > > + > > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > > + ret = -ENOTSYNC; > > Better to use != and not <, effectively same. > Or in the above we can do: > bool control; > if (len == 4) { > if (addr % 4) > goto out; > control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL; > } else if (len == 8) { > if (addr % 8) > goto out; > control = offset == PCI_MSIX_ENTRY_VECTOR_DATA; > } else > goto out; > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > > first? I'd also prefer to see a direct goto out here since it's not > > > entirely obvious that this first 'if' always falls into the below 'if'. > > > I'm not a fan of using an error code as a special non-error return > > > either. 
> > > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check > > new_ctrl to see if they indeed modified ctrl bits. > > > > I would try to make the logic here more clear. > > > > > > + if (old_ctrl == new_ctrl) > > > > + goto out; > > > > + > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > > > > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); > > > > > > An xor would remove some redundancy here > > > > > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm, > > > mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)); > > > > Oh, yes, sure... > > Also why not set ret to correct value directly? > > > > > + if (r || ret) > > > > + ret = -ENOTSYNC; > > when is ret set? And why not to the correct value directly? > > > > > +out: > > > > + mutex_unlock(&mmio_dev->lock); > > > > + mutex_unlock(&mmio_dev->kvm->lock); > > > > + return ret; > > > > +} > > > > + > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > > > > + .read = msix_table_mmio_read, > > > > + .write = msix_table_mmio_write, > > > > +}; > > > > + > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > > > > +{ > > > > + int ret; > > > > + > > > > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, > > > > &msix_mmio_table_ops); + mutex_init(&kvm->msix_mmio_dev.lock); > > > > + kvm->msix_mmio_dev.kvm = kvm; > > > > + mutex_lock(&kvm->slots_lock); > > > > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > > > + &kvm->msix_mmio_dev.table_dev); > > > > + mutex_unlock(&kvm->slots_lock); > > > > + return ret; > > > > +} > > > > + > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > > > > +{ > > > > + int ret; > > > > + > > > > + mutex_lock(&kvm->slots_lock); > > > > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > > > + &kvm->msix_mmio_dev.table_dev); > > > > + mutex_unlock(&kvm->slots_lock); > > > > + return ret; > > > > +} > > > > + > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > + struct kvm_msix_mmio *mmio = NULL; > > > > + int r = 0, i; > > > > + > > > > + mutex_lock(&mmio_dev->lock); > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > > > + mmio = &mmio_dev->mmio[i]; > > > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + break; > > > > + } > > > > + } > > > > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + /* All reserved currently */ > > > > + if (mmio_user->flags) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > + r = -EINVAL; > > > > + goto out; > > > > 
+ } > > > > + > > > > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > > > > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + if (!mmio) { > > > > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > > > > + r = -ENOSPC; > > > > + goto out; > > > > + } > > > > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > > > > + } > > > > + > > > > + mmio_dev->mmio_nr++; > > > > + mmio->max_entries_nr = mmio_user->max_entries_nr; > > > > + mmio->dev_id = mmio_user->dev_id; > > > > + mmio->flags = mmio_user->flags; > > > > + > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > + mmio->table_base_addr = mmio_user->base_addr; > > > > + mmio->table_base_va = mmio_user->base_va; > > Also check alignment for these values. base_va should be > at least 4 byte aligned. Maybe more, need to think about it ... > Look in spec for table_base_addr alignment. OK. -- regards Yang, Sheng > > > > > + } > > > > +out: > > > > + mutex_unlock(&mmio_dev->lock); > > > > + return r; > > > > +} > > > > + > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > + int r = 0, i, j; > > > > > > set r = -EINVAL here > > > > > > > + bool found = 0; > > > > + > > > > + if (!mmio) > > > > + return 0; > > > > + > > > > + mutex_lock(&mmio_dev->lock); > > > > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > > > > + mmio_dev->mmio[i].type == mmio->type) { > > > > + found = true; > > > > > > set r = 0 here > > > > > > > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > > > > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > > > > + mmio_dev->mmio_nr--; > > > > + break; > > > > + } > > > > + } > > > > + if (!found) > > > > + r = -EINVAL; > > > > > > then get rid of found > > > > Sure. > > > > -- > > regards > > Yang, Sheng > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > + return r; > > > > +} > > > > + > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > +{ > > > > + struct kvm_msix_mmio mmio; > > > > + > > > > + mmio.dev_id = mmio_user->dev_id; > > > > + mmio.type = mmio_user->type; > > > > + > > > > + return kvm_free_msix_mmio(kvm, &mmio); > > > > +} > > > > + > > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > > > > new file mode 100644 > > > > index 0000000..01b6587 > > > > --- /dev/null > > > > +++ b/virt/kvm/msix_mmio.h > > > > @@ -0,0 +1,25 @@ > > > > +#ifndef __KVM_MSIX_MMIO_H__ > > > > +#define __KVM_MSIX_MMIO_H__ > > > > +/* > > > > + * MSI-X MMIO emulation > > > > + * > > > > + * Copyright (c) 2010 Intel Corporation > > > > + * > > > > + * This work is licensed under the terms of the GNU GPL, version 2. > > > > See + * the COPYING file in the top-level directory. 
> > > > + * > > > > + * Author: > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > + */ > > > > + > > > > +#include <linux/pci.h> > > > > + > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > > > *mmio_user); + > > > > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
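
An aside on the kvm_free_msix_mmio() exchange above: folding in Alex's three suggestions (start with r = -EINVAL, set r = 0 on a match, drop `found`) gives roughly the sketch below -- an editorial reconstruction, not the posted code. One more detail worth noting in passing: after the shift, the slot left stale is mmio[mmio_nr - 1]; the posted code clears mmio[mmio_nr], which is one slot too far and lands out of bounds once mmio_nr == KVM_MSIX_MMIO_MAX.

int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
{
	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
	int r = -EINVAL, i, j;

	if (!mmio)
		return 0;

	mutex_lock(&mmio_dev->lock);
	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
	for (i = 0; i < mmio_dev->mmio_nr; i++) {
		if (mmio_dev->mmio[i].dev_id != mmio->dev_id ||
		    mmio_dev->mmio[i].type != mmio->type)
			continue;
		r = 0;
		for (j = i; j < mmio_dev->mmio_nr - 1; j++)
			mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
		/* clear the slot the shift left stale; memset of the
		 * whole entry replaces the three per-field resets */
		memset(&mmio_dev->mmio[mmio_dev->mmio_nr - 1], 0,
		       sizeof(mmio_dev->mmio[0]));
		mmio_dev->mmio_nr--;
		break;
	}
	mutex_unlock(&mmio_dev->lock);
	return r;
}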
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-24 8:08 ` Sheng Yang @ 2011-02-24 10:11 ` Michael S. Tsirkin 2011-02-25 5:54 ` Sheng Yang 0 siblings, 1 reply; 20+ messages in thread From: Michael S. Tsirkin @ 2011-02-24 10:11 UTC (permalink / raw) To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Thu, Feb 24, 2011 at 04:08:22PM +0800, Sheng Yang wrote: > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > Then we can support mask bit operation of assigned devices now. > > > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > > should be able to hook this into vfio with a small stub in kvm. We > > > > just need to be able to communicate disabling and enabling of > > > > individual msix vectors. For brute force, we could do this with a lot > > > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X > > > > vector. Not sure if there's something smaller that could do it. > > > > Thanks, > > > > > > Alex, thanks for your comments. See my comments below: > > > > Alex > > > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > > --- > > > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > > arch/x86/kvm/x86.c | 8 +- > > > > > include/linux/kvm.h | 21 ++++ > > > > > include/linux/kvm_host.h | 25 ++++ > > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > > virt/kvm/msix_mmio.c | 286 > > > > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > > > > > > > > > | 25 ++++ > > > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > > index f15501f..3a0d851 100644 > > > > > --- a/arch/x86/kvm/Makefile > > > > > +++ b/arch/x86/kvm/Makefile > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
> > > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o > \ > > > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > > > - assigned-dev.o) > > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > > > > async_pf.o) > > > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > > index fa708c9..89bf12c 100644 > > > > > --- a/arch/x86/kvm/x86.c > > > > > +++ b/arch/x86/kvm/x86.c > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > > case KVM_CAP_XSAVE: > > > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > > + case KVM_CAP_MSIX_MMIO: > > > > > r = 1; > > > > > break; > > > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > > @@ -3807,6 +3808,7 @@ static int > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > > > { > > > > > > > > > > gpa_t gpa; > > > > > > > > > > + int r; > > > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > mmio: > > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > > > /* > > > > > > > > > > * Is this MMIO handled locally? > > > > > */ > > > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > > + if (!r) > > > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > This use of -ENOTSYNC is IMO confusing. > > How about we make vcpu_mmio_write return the positive exit reason? > > Negative value will mean an error. > > Make sense. I would update it. 
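
For concreteness, the convention agreed just above -- vcpu_mmio_write() returning 0 when the access was handled in kernel, a positive KVM_EXIT_* reason when userspace must take over, and a negative errno on failure -- might look like this in emulator_write_emulated_onepage(). A sketch of the direction of the agreement, not the posted code:

mmio:
	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
	if (!r)		/* handled entirely in kernel */
		return X86EMUL_CONTINUE;

	vcpu->mmio_needed = 1;
	/* r > 0 carries the exit reason, e.g. KVM_EXIT_MSIX_ROUTING_UPDATE;
	 * on a negative error fall back to an ordinary MMIO exit */
	vcpu->run->exit_reason = r > 0 ? r : KVM_EXIT_MMIO;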
> > > > > > > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > > > > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > > > > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > > > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > > > > index ea2dc1a..ad9df4b 100644 > > > > > --- a/include/linux/kvm.h > > > > > +++ b/include/linux/kvm.h > > > > > @@ -161,6 +161,7 @@ struct kvm_pit_config { > > > > > > > > > > #define KVM_EXIT_NMI 16 > > > > > #define KVM_EXIT_INTERNAL_ERROR 17 > > > > > #define KVM_EXIT_OSI 18 > > > > > > > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > > > > > > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > > > > #define KVM_INTERNAL_ERROR_EMULATION 1 > > > > > > > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > > > > > > > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > > > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > > > > #define KVM_CAP_ASYNC_PF 59 > > > > > > > > > > +#define KVM_CAP_MSIX_MMIO 60 > > > > > > > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > > > > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > > > > > > > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > > > > kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, > > > > > 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > > > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > > > > > > > > > +/* Available with KVM_CAP_MSIX_MMIO */ > > > > > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct > > > > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, > > > > > 0x7e, struct kvm_msix_mmio_user) > > > > > > > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > > > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > > > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, > > > > > struct kvm_pit_state2) > > > > > > > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > > > > > > > > > > __u16 padding[3]; > > > > > > > > > > }; > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > > > > > + > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > > > > > + > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > > > > > +struct kvm_msix_mmio_user { > > > > > + __u32 dev_id; > > > > > + __u16 type; > > > > > + __u16 max_entries_nr; > > > > > + __u64 base_addr; > > > > > + __u64 base_va; > > > > > + __u64 flags; > > > > > + __u64 reserved[4]; > > > > > +}; > > > > > + > > > > > > > > > > #endif /* __LINUX_KVM_H */ > > > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > > > > index 7d313e0..c10670c 100644 > > > > > --- a/include/linux/kvm_host.h > > > > > +++ b/include/linux/kvm_host.h > > > > > @@ -233,6 +233,27 @@ struct kvm_memslots { > > > > > > > > > > KVM_PRIVATE_MEM_SLOTS]; > > > > > > > > > > }; > > > > > > > > > > +#define KVM_MSIX_MMIO_MAX 32 > > > > > > > > I assume this is based on 32 devices/bus, but we could exceed this is > > > > we add more buses or make use of more functions. How hard is it to > > > > make this dynamic? > > > > > > We can improve this later(maybe with balance tree?). 32 should be more > > > than sufficient for now. And this patchset has been in the mailing list > > > for months, I really want to get it in first... 
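
On Alex's question about making the table dynamic: probably not much work, at the cost of an allocation on the registration path. Purely as an illustration (msix_mmio_expand() and the mmio_cap field are invented for this sketch and are not part of the series), a krealloc()-grown array could replace the fixed mmio[KVM_MSIX_MMIO_MAX]:

static int msix_mmio_expand(struct kvm_msix_mmio_dev *dev)
{
	/* mmio_cap tracks the allocated length of dev->mmio, which
	 * becomes a pointer instead of a fixed-size array */
	int cap = dev->mmio_cap ? dev->mmio_cap * 2 : 16;
	struct kvm_msix_mmio *p;

	p = krealloc(dev->mmio, cap * sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	dev->mmio = p;
	dev->mmio_cap = cap;
	return 0;
}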
> > > > > > > > + > > > > > +struct kvm_msix_mmio { > > > > > + u32 dev_id; > > > > > + u16 type; > > > > > + u16 max_entries_nr; > > > > > + u64 flags; > > > > > + gpa_t table_base_addr; > > > > > + hva_t table_base_va; > > > > > + gpa_t pba_base_addr; > > > > > + hva_t pba_base_va; > > > > > +}; > > > > > + > > > > > +struct kvm_msix_m4mio_dev { > > > > > + struct kvm *kvm; > > > > > + struct kvm_io_device table_dev; > > > > > + int mmio_nr; > > > > > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > > > > > + struct mutex lock; > > > > > +}; > > > > > + > > > > > > > > > > struct kvm { > > > > > > > > > > spinlock_t mmu_lock; > > > > > raw_spinlock_t requests_lock; > > > > > > > > > > @@ -281,6 +302,7 @@ struct kvm { > > > > > > > > > > long mmu_notifier_count; > > > > > > > > > > #endif > > > > > > > > > > long tlbs_dirty; > > > > > > > > > > + struct kvm_msix_mmio_dev msix_mmio_dev; > > > > > > > > > > }; > > > > > > > > > > /* The guest did something we don't support. */ > > > > > > > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm > > > > > *kvm, > > > > > > > > > > int kvm_request_irq_source_id(struct kvm *kvm); > > > > > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > > > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > > + int assigned_dev_id, int entry, bool mask); > > > > > + > > > > > > > > > > /* For vcpu->arch.iommu_flags */ > > > > > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > > > > > > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > > > > index ae72ae6..d1598a6 100644 > > > > > --- a/virt/kvm/assigned-dev.c > > > > > +++ b/virt/kvm/assigned-dev.c > > > > > @@ -18,6 +18,7 @@ > > > > > > > > > > #include <linux/interrupt.h> > > > > > #include <linux/slab.h> > > > > > #include "irq.h" > > > > > > > > > > +#include "msix_mmio.h" > > > > > > > > > > static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct > > > > > list_head *head, > > > > > > > > > > int assigned_dev_id) > > > > > > > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm > > > > > *kvm, > > > > > > > > > > kvm_deassign_irq(kvm, assigned_dev, > > > > > assigned_dev->irq_requested_type); > > > > > > > > > > } > > > > > > > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > > > > > + struct kvm_assigned_dev_kernel *adev) > > > > > +{ > > > > > + struct kvm_msix_mmio mmio; > > > > > + > > > > > + mmio.dev_id = adev->assigned_dev_id; > > > > > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > > + kvm_free_msix_mmio(kvm, &mmio); > > > > > +} > > > > > + > > > > > > > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > > > > > > > struct kvm_assigned_dev_kernel > > > > > *assigned_dev) > > > > > > > > > > { > > > > > > > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > > > > > > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > > > > > + > > > > > > > > > > __pci_reset_function(assigned_dev->dev); > > > > > pci_restore_state(assigned_dev->dev); > > > > > > > > > > @@ -785,3 +799,33 @@ out: > > > > > return r; > > > > > > > > > > } > > > > > > > > > > +/* The caller should hold kvm->lock */ > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > > + int assigned_dev_id, int entry, bool mask) > > > > > +{ > > > > > + int r = -EFAULT; > > > > > + struct kvm_assigned_dev_kernel *adev; > > > > > + int i; > > > > > + > > > > > + if (!irqchip_in_kernel(kvm)) > > > > > + 
return r; > > > > > + > > > > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > > > > + assigned_dev_id); > > > > > + if (!adev) > > > > > + goto out; > > > > > + > > > > > + /* For non-MSIX enabled devices, entries_nr == 0 */ > > > > > + for (i = 0; i < adev->entries_nr; i++) > > > > > + if (adev->host_msix_entries[i].entry == entry) { > > > > > + if (mask) > > > > > + disable_irq_nosync( > > > > > + adev->host_msix_entries[i].vector); > > > > > + else > > > > > + enable_irq(adev->host_msix_entries[i].vector); > > > > > + r = 0; > > > > > + break; > > > > > + } > > > > > +out: > > > > > + return r; > > > > > +} > > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > > > index b1b6cbb..b7807c8 100644 > > > > > --- a/virt/kvm/kvm_main.c > > > > > +++ b/virt/kvm/kvm_main.c > > > > > @@ -56,6 +56,7 @@ > > > > > > > > > > #include "coalesced_mmio.h" > > > > > #include "async_pf.h" > > > > > > > > > > +#include "msix_mmio.h" > > > > > > > > > > #define CREATE_TRACE_POINTS > > > > > #include <trace/events/kvm.h> > > > > > > > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > > > > > > > > > > struct mm_struct *mm = kvm->mm; > > > > > > > > > > kvm_arch_sync_events(kvm); > > > > > > > > > > + kvm_unregister_msix_mmio_dev(kvm); > > > > > > > > > > spin_lock(&kvm_lock); > > > > > list_del(&kvm->vm_list); > > > > > spin_unlock(&kvm_lock); > > > > > > > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, > > > > > > > > > > mutex_unlock(&kvm->lock); > > > > > break; > > > > > > > > > > #endif > > > > > > > > > > + case KVM_REGISTER_MSIX_MMIO: { > > > > > + struct kvm_msix_mmio_user mmio_user; > > > > > + > > > > > + r = -EFAULT; > > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > > + goto out; > > > > > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > > > > > + break; > > > > > + } > > > > > + case KVM_UNREGISTER_MSIX_MMIO: { > > > > > + struct kvm_msix_mmio_user mmio_user; > > > > > + > > > > > + r = -EFAULT; > > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > > + goto out; > > > > > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > > > > > + break; > > > > > + } > > > > > > > > > > default: > > > > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > > > > if (r == -ENOTTY) > > > > > > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > > > > > > > > > > return r; > > > > > > > > > > } > > > > > > > > > > #endif > > > > > > > > > > + r = kvm_register_msix_mmio_dev(kvm); > > > > > + if (r < 0) { > > > > > + kvm_put_kvm(kvm); > > > > > + return r; > > > > > + } > > > > > + > > > > > > > > > > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > > > > > if (r < 0) > > > > > > > > > > kvm_put_kvm(kvm); > > > > > > > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct > > > > > kvm_io_bus *bus) > > > > > > > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t > > > > > addr, > > > > > > > > > > int len, const void *val) > > > > > > > > > > { > > > > > > > > > > - int i; > > > > > + int i, r = -EOPNOTSUPP; > > > > > > > > > > struct kvm_io_bus *bus; > > > > > > > > > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > > > > > > > > > > - for (i = 0; i < bus->dev_count; i++) > > > > > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > > > > > + for (i = 0; i < bus->dev_count; i++) { > > > > > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > > > > > + if (r == -ENOTSYNC) > > > > > + 
break; > > > > > + else if (!r) > > > > > > > > > > return 0; > > > > So the above will be > > if (r >= 0) > > return r; > > > > > > > - return -EOPNOTSUPP; > > > > > + } > > > > > + return r; > > > > > > > > > > } > > > > > > > > > > /* kvm_io_bus_read - called under kvm->slots_lock */ > > > > > > > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > > > > > new file mode 100644 > > > > > index 0000000..dde9b62 > > > > > --- /dev/null > > > > > +++ b/virt/kvm/msix_mmio.c > > > > > @@ -0,0 +1,286 @@ > > > > > +/* > > > > > + * MSI-X MMIO emulation > > > > > + * > > > > > + * Copyright (c) 2010 Intel Corporation > > > > > + * > > > > > + * This work is licensed under the terms of the GNU GPL, version 2. > > > > > See + * the COPYING file in the top-level directory. > > > > > + * > > > > > + * Author: > > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > > + */ > > > > > + > > > > > +#include <linux/kvm_host.h> > > > > > +#include <linux/kvm.h> > > > > > + > > > > > +#include "msix_mmio.h" > > > > > +#include "iodev.h" > > > > > + > > > > > +static int update_msix_mask_bit(struct kvm *kvm, struct > > > > > kvm_msix_mmio *mmio, + int entry, u32 flag) > > > > > +{ > > > > > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > > + return kvm_assigned_device_update_msix_mask_bit(kvm, > > > > > + mmio->dev_id, entry, flag); > > > > > + return -EFAULT; > > > > > +} > > > > > + > > > > > +/* Caller must hold dev->lock */ > > > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > > > > > + gpa_t addr, int len) > > > > > +{ > > > > > + gpa_t start, end; > > > > > + int i, r = -EINVAL; > > > > > + > > > > > + for (i = 0; i < dev->mmio_nr; i++) { > > > > > + start = dev->mmio[i].table_base_addr; > > > > > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > > > > > + dev->mmio[i].max_entries_nr; > > > > > + if (addr >= start && addr + len <= end) { > > > > > + r = i; > > > > > + break; > > > > > + } > > > > > > > > Yet another potential user of the weight balanced tree ;) > > > > > > Yeah, we can improve it later. > > > > > > > > + } > > > > > + > > > > > + return r; > > > > > +} > > > > > + > > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t > > > > > addr, int len, + void *val) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > + struct kvm_msix_mmio *mmio; > > > > > + int idx, ret = 0, entry, offset, r; > > > > > + > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > + if (idx < 0) { > > > > > + ret = -EOPNOTSUPP; > > > > > + goto out; > > > > > + } > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > + goto out; > > > > > + > > > > > + offset = addr & 0xf; > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > + goto out; > > > > This is not as strict as it could be. Also > > see below (in write) for an idea how to make the code clearer. > > > > > > > + > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > > > + if (r) > > > > > + goto out; > > > > Want to give the caller some indication of an error? > > If not add a comment why. > > I don't think we have a proper way to indicate the error now. 
Then let's add a comment, and skip the goto - it's just confusing: /* Ignore return value, we don't have a way to indicate the * error. */ r = copy_from_user(val, (void __user *)(mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > Also the > table_base_va already passed the checking when MMIO registered. We can add more > error info later. > > > > > > > +out: > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > + return ret; > > > > > +} > > > > > + > > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t > > > > > addr, + int len, const void *val) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > + struct kvm_msix_mmio *mmio; > > > > > + int idx, entry, offset, ret = 0, r = 0; > > > > > + gpa_t entry_base; > > > > > + u32 old_ctrl, new_ctrl; > > > > These are really __le32. Pls make them so and then > > fix up the sparse errors. > > > > > > > + u32 *ctrl_pos; > > > > get_user should not be done on u32 *, it should be > > __user. Does not sparse complain about this? > > > > Haven't use sparse to check the patches. Would do it later. Please do. It's a lot of code with tricky pointer math, must make sure we are not accessing userspace pointers directly etc. > > > > > + > > > > > + mutex_lock(&mmio_dev->kvm->lock); > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > + if (idx < 0) { > > > > > + ret = -EOPNOTSUPP; > > > > > + goto out; > > > > > + } > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > + goto out; > > > > > + > > > > > + offset = addr & 0xF; > > > > > > > > nit, this 0xf above. > > > > > > So you like another mask? > > > > Yes, this should be > > > > offset = addr % PCI_MSIX_ENTRY_SIZE; > > > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > + goto out; > > > > I think it's a good idea to be as strict as possible. > > Also let's quote the spec. Finally using % will make > > the logic more transparent. How about the below? > > > > /* For all accesses to MSI-X Table and MSI-X PBA fields, software must use > > aligned full DWORD or aligned full QWORD transactions; otherwise, the > > result is undefined. */ > > if (len == 4) { > > if (addr % 4) > > goto out; > > } else if (len == 8) { > > if (addr % 8) > > goto out; > > } else > > goto out; > > > > As read needs same logic it's likely a good idea > > to add a function for this. > > > > > > > + > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > > > + > > > > > + if (get_user(old_ctrl, ctrl_pos)) > > > > It's probably a good idea to do access_ok check for the whole > > entry here, and then use __get_user etc below, these are cheaper. > > table_base_va already passed access_ok checking. I am not sure if we need to do it > again. Yes but the process invoking the ioctl can be different from the one invoking the mmio. > > > > > > > + goto out; > > > > > + > > > > > + /* No allow writing to other fields when entry is unmasked */ > > > > Do not allow. 
> > > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > > + goto out; > > > > > + > > > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > > > any such cast is suspect. > > Is entry_base really gpa? If it was you can not cast it to __user. > > HVA. Then don't declare it gpa_t > > I think it should be void __user *. > > > > > > > + goto out; > > > > > + > > > > > + if (get_user(new_ctrl, ctrl_pos)) > > > > > + goto out; > > > > get_user assumes aligned address. But there's no guarantee ctrl_pos > > is aligned because table_base_va might be misaligned. > > table_base_va already passed access_ok() check. access_ok done on table_base_va only checks that the address is not a kernel one. > > > > > > > + > > > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > > > + ret = -ENOTSYNC; > > > > Better to use != and not <, effectively same. > > Or in the above we can do: > > bool control; > > if (len == 4) { > > if (addr % 4) > > goto out; > > control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL; > > } else if (len == 8) { > > if (addr % 8) > > goto out; > > control = offset == PCI_MSIX_ENTRY_VECTOR_DATA; > > } else > > goto out; > > > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > > > first? I'd also prefer to see a direct goto out here since it's not > > > > entirely obvious that this first 'if' always falls into the below 'if'. > > > > I'm not a fan of using an error code as a special non-error return > > > > either. > > > > > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check > > > new_ctrl to see if they indeed modified ctrl bits. > > > > > > I would try to make the logic here more clear. > > > > > > > > + if (old_ctrl == new_ctrl) > > > > > + goto out; > > > > > + > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > > > > > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); > > > > > > > > An xor would remove some redundancy here > > > > > > > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm, > > > > mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)); > > > > > > Oh, yes, sure... > > > > Also why not set ret to correct value directly? > > > > > > > + if (r || ret) > > > > > + ret = -ENOTSYNC; > > > > when is ret set? And why not to the correct value directly? 
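
Putting Alex's xor together with Michael's point about setting ret directly: since (old_ctrl ^ new_ctrl) & MASK tests the same thing as (old_ctrl & MASK) ^ (new_ctrl & MASK), the tail of the write handler could collapse to the sketch below, with ret assigned its final value in one step (endianness handling per the __le32 comment is left out here; see the consolidated sketch at the end of this thread):

	if ((old_ctrl ^ new_ctrl) & PCI_MSIX_ENTRY_CTRL_MASKBIT) {
		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
		if (r)
			ret = -ENOTSYNC;	/* mask update failed, punt to userspace */
	}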
> > > > > > > +out: > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > + mutex_unlock(&mmio_dev->kvm->lock); > > > > > + return ret; > > > > > +} > > > > > + > > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > > > > > + .read = msix_table_mmio_read, > > > > > + .write = msix_table_mmio_write, > > > > > +}; > > > > > + > > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > > > > > +{ > > > > > + int ret; > > > > > + > > > > > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, > > > > > &msix_mmio_table_ops); + mutex_init(&kvm->msix_mmio_dev.lock); > > > > > + kvm->msix_mmio_dev.kvm = kvm; > > > > > + mutex_lock(&kvm->slots_lock); > > > > > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > > > > + &kvm->msix_mmio_dev.table_dev); > > > > > + mutex_unlock(&kvm->slots_lock); > > > > > + return ret; > > > > > +} > > > > > + > > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > > > > > +{ > > > > > + int ret; > > > > > + > > > > > + mutex_lock(&kvm->slots_lock); > > > > > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > > > > + &kvm->msix_mmio_dev.table_dev); > > > > > + mutex_unlock(&kvm->slots_lock); > > > > > + return ret; > > > > > +} > > > > > + > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > > + struct kvm_msix_mmio *mmio = NULL; > > > > > + int r = 0, i; > > > > > + > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > > > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > > > > + mmio = &mmio_dev->mmio[i]; > > > > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + break; > > > > > + } > > > > > + } > > > > > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + /* All reserved currently */ > > > > > + if (mmio_user->flags) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + > > > > > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > > > > > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + if (!mmio) { > > > > > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > > > > > + r = -ENOSPC; > > > > > + goto out; > > > > > + } > > > > > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > > > > > + } > > > > > + > > > > > + mmio_dev->mmio_nr++; > > > > > + mmio->max_entries_nr = mmio_user->max_entries_nr; > > > > > + mmio->dev_id = mmio_user->dev_id; > > > > > + mmio->flags = mmio_user->flags; > > > > > + > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > > + 
mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > > + mmio->table_base_addr = mmio_user->base_addr; > > > > > + mmio->table_base_va = mmio_user->base_va; > > > > Also check alignment for these values. base_va should be > > at least 4 byte aligned. Maybe more, need to think about it ... > > Look in spec for table_base_addr alignment. > > OK. > > -- > regards > Yang, Sheng > > > > > > > > + } > > > > > +out: > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > + return r; > > > > > +} > > > > > + > > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > > + int r = 0, i, j; > > > > > > > > set r = -EINVAL here > > > > > > > > > + bool found = 0; > > > > > + > > > > > + if (!mmio) > > > > > + return 0; > > > > > + > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > > > > > + mmio_dev->mmio[i].type == mmio->type) { > > > > > + found = true; > > > > > > > > set r = 0 here > > > > > > > > > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > > > > > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > > > > > + mmio_dev->mmio_nr--; > > > > > + break; > > > > > + } > > > > > + } > > > > > + if (!found) > > > > > + r = -EINVAL; > > > > > > > > then get rid of found > > > > > > Sure. > > > > > > -- > > > regards > > > Yang, Sheng > > > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > + return r; > > > > > +} > > > > > + > > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > > +{ > > > > > + struct kvm_msix_mmio mmio; > > > > > + > > > > > + mmio.dev_id = mmio_user->dev_id; > > > > > + mmio.type = mmio_user->type; > > > > > + > > > > > + return kvm_free_msix_mmio(kvm, &mmio); > > > > > +} > > > > > + > > > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > > > > > new file mode 100644 > > > > > index 0000000..01b6587 > > > > > --- /dev/null > > > > > +++ b/virt/kvm/msix_mmio.h > > > > > @@ -0,0 +1,25 @@ > > > > > +#ifndef __KVM_MSIX_MMIO_H__ > > > > > +#define __KVM_MSIX_MMIO_H__ > > > > > +/* > > > > > + * MSI-X MMIO emulation > > > > > + * > > > > > + * Copyright (c) 2010 Intel Corporation > > > > > + * > > > > > + * This work is licensed under the terms of the GNU GPL, version 2. > > > > > See + * the COPYING file in the top-level directory. > > > > > + * > > > > > + * Author: > > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > > + */ > > > > > + > > > > > +#include <linux/pci.h> > > > > > + > > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > > > > *mmio_user); + > > > > > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
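
On the alignment point in this message: the MSI-X Table Offset register keeps the BIR in its low three bits, so the table's guest-physical base is QWORD-aligned by construction; requiring the same of base_va keeps the get_user()/put_user() accesses on the control word naturally aligned. A sketch of the registration-time check (8-byte alignment for both values is assumed here as the conservative reading of the spec):

	if ((mmio_user->base_addr & 0x7) || (mmio_user->base_va & 0x7)) {
		r = -EINVAL;
		goto out;
	}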
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-24 10:11 ` Michael S. Tsirkin @ 2011-02-25 5:54 ` Sheng Yang 0 siblings, 0 replies; 20+ messages in thread From: Sheng Yang @ 2011-02-25 5:54 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Thursday 24 February 2011 18:11:44 Michael S. Tsirkin wrote: > On Thu, Feb 24, 2011 at 04:08:22PM +0800, Sheng Yang wrote: > > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > > Then we can support mask bit operation of assigned devices now. > > > > > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > > > should be able to hook this into vfio with a small stub in kvm. We > > > > > just need to be able to communicate disabling and enabling of > > > > > individual msix vectors. For brute force, we could do this with a > > > > > lot of eventfds, triggered by kvm and consumed by vfio, two per > > > > > MSI-X vector. Not sure if there's something smaller that could do > > > > > it. Thanks, > > > > > > > > Alex, thanks for your comments. See my comments below: > > > > > Alex > > > > > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > > > --- > > > > > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > > > arch/x86/kvm/x86.c | 8 +- > > > > > > include/linux/kvm.h | 21 ++++ > > > > > > include/linux/kvm_host.h | 25 ++++ > > > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > > > virt/kvm/msix_mmio.c | 286 > > > > > > ++++++++++++++++++++++++++++++++++++++++++++++ > > > > > > virt/kvm/msix_mmio.h > > > > > > > > > > > > | 25 ++++ > > > > > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > > > index f15501f..3a0d851 100644 > > > > > > --- a/arch/x86/kvm/Makefile > > > > > > +++ b/arch/x86/kvm/Makefile > > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
> > > > > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o > > > > \ > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > > > > > - assigned-dev.o) > > > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, > > > > > > iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix > > > > > > ../../../virt/kvm/, async_pf.o) > > > > > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > > > index fa708c9..89bf12c 100644 > > > > > > --- a/arch/x86/kvm/x86.c > > > > > > +++ b/arch/x86/kvm/x86.c > > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > > > case KVM_CAP_XSAVE: > > > > > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > > > + case KVM_CAP_MSIX_MMIO: > > > > > > r = 1; > > > > > > break; > > > > > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > > > @@ -3807,6 +3808,7 @@ static int > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > > > > > { > > > > > > > > > > > > gpa_t gpa; > > > > > > > > > > > > + int r; > > > > > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > mmio: > > > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > > > > > /* > > > > > > > > > > > > * Is this MMIO handled locally? > > > > > > */ > > > > > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > > > + if (!r) > > > > > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > > > This use of -ENOTSYNC is IMO confusing. > > > How about we make vcpu_mmio_write return the positive exit reason? > > > Negative value will mean an error. > > > > Make sense. I would update it. 
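
The positive-exit-reason convention propagates naturally through the bus layer as well; with Michael's "if (r >= 0) return r;" simplification from elsewhere in the thread, kvm_io_bus_write() could become (a sketch):

int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		     int len, const void *val)
{
	int i, r = -EOPNOTSUPP;
	struct kvm_io_bus *bus;

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	for (i = 0; i < bus->dev_count; i++) {
		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
		/* 0: handled in kernel; > 0: exit reason for userspace */
		if (r >= 0)
			return r;
	}
	return r;	/* last error; -EOPNOTSUPP if nobody claimed it */
}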
> > > > > > > > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > > > > > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > > > > > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > > > > > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > > > > > index ea2dc1a..ad9df4b 100644 > > > > > > --- a/include/linux/kvm.h > > > > > > +++ b/include/linux/kvm.h > > > > > > @@ -161,6 +161,7 @@ struct kvm_pit_config { > > > > > > > > > > > > #define KVM_EXIT_NMI 16 > > > > > > #define KVM_EXIT_INTERNAL_ERROR 17 > > > > > > #define KVM_EXIT_OSI 18 > > > > > > > > > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > > > > > > > > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > > > > > #define KVM_INTERNAL_ERROR_EMULATION 1 > > > > > > > > > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > > > > > > > > > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > > > > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > > > > > #define KVM_CAP_ASYNC_PF 59 > > > > > > > > > > > > +#define KVM_CAP_MSIX_MMIO 60 > > > > > > > > > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > > > > > > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > > > > > > > > > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > > > > > kvm_xen_hvm_config) #define KVM_SET_CLOCK > > > > > > _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > > > > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > > > > > > > > > > > +/* Available with KVM_CAP_MSIX_MMIO */ > > > > > > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct > > > > > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO > > > > > > _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user) > > > > > > > > > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > > > > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > > > > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, > > > > > > 0xa0, struct kvm_pit_state2) > > > > > > > > > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > > > > > > > > > > > > __u16 padding[3]; > > > > > > > > > > > > }; > > > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > > > > > > + > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > > > > > > + > > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > > > > > > +struct kvm_msix_mmio_user { > > > > > > + __u32 dev_id; > > > > > > + __u16 type; > > > > > > + __u16 max_entries_nr; > > > > > > + __u64 base_addr; > > > > > > + __u64 base_va; > > > > > > + __u64 flags; > > > > > > + __u64 reserved[4]; > > > > > > +}; > > > > > > + > > > > > > > > > > > > #endif /* __LINUX_KVM_H */ > > > > > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > > > > > index 7d313e0..c10670c 100644 > > > > > > --- a/include/linux/kvm_host.h > > > > > > +++ b/include/linux/kvm_host.h > > > > > > @@ -233,6 +233,27 @@ struct kvm_memslots { > > > > > > > > > > > > KVM_PRIVATE_MEM_SLOTS]; > > > > > > > > > > > > }; > > > > > > > > > > > > +#define KVM_MSIX_MMIO_MAX 32 > > > > > > > > > > I assume this is based on 32 devices/bus, but we could exceed this > > > > > is we add more buses or make use of more functions. How hard is > > > > > it to make this dynamic? > > > > > > > > We can improve this later(maybe with balance tree?). 32 should be > > > > more than sufficient for now. And this patchset has been in the > > > > mailing list for months, I really want to get it in first... 
> > > > > > > > > > + > > > > > > +struct kvm_msix_mmio { > > > > > > + u32 dev_id; > > > > > > + u16 type; > > > > > > + u16 max_entries_nr; > > > > > > + u64 flags; > > > > > > + gpa_t table_base_addr; > > > > > > + hva_t table_base_va; > > > > > > + gpa_t pba_base_addr; > > > > > > + hva_t pba_base_va; > > > > > > +}; > > > > > > + > > > > > > +struct kvm_msix_m4mio_dev { > > > > > > + struct kvm *kvm; > > > > > > + struct kvm_io_device table_dev; > > > > > > + int mmio_nr; > > > > > > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > > > > > > + struct mutex lock; > > > > > > +}; > > > > > > + > > > > > > > > > > > > struct kvm { > > > > > > > > > > > > spinlock_t mmu_lock; > > > > > > raw_spinlock_t requests_lock; > > > > > > > > > > > > @@ -281,6 +302,7 @@ struct kvm { > > > > > > > > > > > > long mmu_notifier_count; > > > > > > > > > > > > #endif > > > > > > > > > > > > long tlbs_dirty; > > > > > > > > > > > > + struct kvm_msix_mmio_dev msix_mmio_dev; > > > > > > > > > > > > }; > > > > > > > > > > > > /* The guest did something we don't support. */ > > > > > > > > > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct > > > > > > kvm *kvm, > > > > > > > > > > > > int kvm_request_irq_source_id(struct kvm *kvm); > > > > > > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > > > > > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > > > + int assigned_dev_id, int entry, bool mask); > > > > > > + > > > > > > > > > > > > /* For vcpu->arch.iommu_flags */ > > > > > > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > > > > > > > > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > > > > > index ae72ae6..d1598a6 100644 > > > > > > --- a/virt/kvm/assigned-dev.c > > > > > > +++ b/virt/kvm/assigned-dev.c > > > > > > @@ -18,6 +18,7 @@ > > > > > > > > > > > > #include <linux/interrupt.h> > > > > > > #include <linux/slab.h> > > > > > > #include "irq.h" > > > > > > > > > > > > +#include "msix_mmio.h" > > > > > > > > > > > > static struct kvm_assigned_dev_kernel > > > > > > *kvm_find_assigned_dev(struct list_head *head, > > > > > > > > > > > > int assigned_dev_id) > > > > > > > > > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct > > > > > > kvm *kvm, > > > > > > > > > > > > kvm_deassign_irq(kvm, assigned_dev, > > > > > > assigned_dev->irq_requested_type); > > > > > > > > > > > > } > > > > > > > > > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > > > > > > + struct kvm_assigned_dev_kernel *adev) > > > > > > +{ > > > > > > + struct kvm_msix_mmio mmio; > > > > > > + > > > > > > + mmio.dev_id = adev->assigned_dev_id; > > > > > > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > > > + kvm_free_msix_mmio(kvm, &mmio); > > > > > > +} > > > > > > + > > > > > > > > > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > > > > > > > > > struct kvm_assigned_dev_kernel > > > > > > *assigned_dev) > > > > > > > > > > > > { > > > > > > > > > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > > > > > > > > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > > > > > > + > > > > > > > > > > > > __pci_reset_function(assigned_dev->dev); > > > > > > pci_restore_state(assigned_dev->dev); > > > > > > > > > > > > @@ -785,3 +799,33 @@ out: > > > > > > return r; > > > > > > > > > > > > } > > > > > > > > > > > > +/* The caller should hold kvm->lock */ > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > 
> > + int assigned_dev_id, int entry, bool mask) > > > > > > +{ > > > > > > + int r = -EFAULT; > > > > > > + struct kvm_assigned_dev_kernel *adev; > > > > > > + int i; > > > > > > + > > > > > > + if (!irqchip_in_kernel(kvm)) > > > > > > + return r; > > > > > > + > > > > > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > > > > > + assigned_dev_id); > > > > > > + if (!adev) > > > > > > + goto out; > > > > > > + > > > > > > + /* For non-MSIX enabled devices, entries_nr == 0 */ > > > > > > + for (i = 0; i < adev->entries_nr; i++) > > > > > > + if (adev->host_msix_entries[i].entry == entry) { > > > > > > + if (mask) > > > > > > + disable_irq_nosync( > > > > > > + adev->host_msix_entries[i].vector); > > > > > > + else > > > > > > + enable_irq(adev->host_msix_entries[i].vector); > > > > > > + r = 0; > > > > > > + break; > > > > > > + } > > > > > > +out: > > > > > > + return r; > > > > > > +} > > > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > > > > index b1b6cbb..b7807c8 100644 > > > > > > --- a/virt/kvm/kvm_main.c > > > > > > +++ b/virt/kvm/kvm_main.c > > > > > > @@ -56,6 +56,7 @@ > > > > > > > > > > > > #include "coalesced_mmio.h" > > > > > > #include "async_pf.h" > > > > > > > > > > > > +#include "msix_mmio.h" > > > > > > > > > > > > #define CREATE_TRACE_POINTS > > > > > > #include <trace/events/kvm.h> > > > > > > > > > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > > > > > > > > > > > > struct mm_struct *mm = kvm->mm; > > > > > > > > > > > > kvm_arch_sync_events(kvm); > > > > > > > > > > > > + kvm_unregister_msix_mmio_dev(kvm); > > > > > > > > > > > > spin_lock(&kvm_lock); > > > > > > list_del(&kvm->vm_list); > > > > > > spin_unlock(&kvm_lock); > > > > > > > > > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file > > > > > > *filp, > > > > > > > > > > > > mutex_unlock(&kvm->lock); > > > > > > break; > > > > > > > > > > > > #endif > > > > > > > > > > > > + case KVM_REGISTER_MSIX_MMIO: { > > > > > > + struct kvm_msix_mmio_user mmio_user; > > > > > > + > > > > > > + r = -EFAULT; > > > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > > > + goto out; > > > > > > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > > > > > > + break; > > > > > > + } > > > > > > + case KVM_UNREGISTER_MSIX_MMIO: { > > > > > > + struct kvm_msix_mmio_user mmio_user; > > > > > > + > > > > > > + r = -EFAULT; > > > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > > > + goto out; > > > > > > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > > > > > > + break; > > > > > > + } > > > > > > > > > > > > default: > > > > > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > > > > > if (r == -ENOTTY) > > > > > > > > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > > > > > > > > > > > > return r; > > > > > > > > > > > > } > > > > > > > > > > > > #endif > > > > > > > > > > > > + r = kvm_register_msix_mmio_dev(kvm); > > > > > > + if (r < 0) { > > > > > > + kvm_put_kvm(kvm); > > > > > > + return r; > > > > > > + } > > > > > > + > > > > > > > > > > > > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > > > > > > if (r < 0) > > > > > > > > > > > > kvm_put_kvm(kvm); > > > > > > > > > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct > > > > > > kvm_io_bus *bus) > > > > > > > > > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, > > > > > > gpa_t addr, > > > > > > > > > > > > int len, const void *val) > > > > > > > > > > > > { > > > > > > 
> > > > > > - int i; > > > > > > + int i, r = -EOPNOTSUPP; > > > > > > > > > > > > struct kvm_io_bus *bus; > > > > > > > > > > > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > > > > > > > > > > > > - for (i = 0; i < bus->dev_count; i++) > > > > > > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > > > > > > + for (i = 0; i < bus->dev_count; i++) { > > > > > > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > > > > > > + if (r == -ENOTSYNC) > > > > > > + break; > > > > > > + else if (!r) > > > > > > > > > > > > return 0; > > > > > > So the above will be > > > > > > if (r >= 0) > > > > > > return r; > > > > > > > > > - return -EOPNOTSUPP; > > > > > > + } > > > > > > + return r; > > > > > > > > > > > > } > > > > > > > > > > > > /* kvm_io_bus_read - called under kvm->slots_lock */ > > > > > > > > > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > > > > > > new file mode 100644 > > > > > > index 0000000..dde9b62 > > > > > > --- /dev/null > > > > > > +++ b/virt/kvm/msix_mmio.c > > > > > > @@ -0,0 +1,286 @@ > > > > > > +/* > > > > > > + * MSI-X MMIO emulation > > > > > > + * > > > > > > + * Copyright (c) 2010 Intel Corporation > > > > > > + * > > > > > > + * This work is licensed under the terms of the GNU GPL, version > > > > > > 2. See + * the COPYING file in the top-level directory. > > > > > > + * > > > > > > + * Author: > > > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > > > + */ > > > > > > + > > > > > > +#include <linux/kvm_host.h> > > > > > > +#include <linux/kvm.h> > > > > > > + > > > > > > +#include "msix_mmio.h" > > > > > > +#include "iodev.h" > > > > > > + > > > > > > +static int update_msix_mask_bit(struct kvm *kvm, struct > > > > > > kvm_msix_mmio *mmio, + int entry, u32 flag) > > > > > > +{ > > > > > > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > > > + return kvm_assigned_device_update_msix_mask_bit(kvm, > > > > > > + mmio->dev_id, entry, flag); > > > > > > + return -EFAULT; > > > > > > +} > > > > > > + > > > > > > +/* Caller must hold dev->lock */ > > > > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > > > > > > + gpa_t addr, int len) > > > > > > +{ > > > > > > + gpa_t start, end; > > > > > > + int i, r = -EINVAL; > > > > > > + > > > > > > + for (i = 0; i < dev->mmio_nr; i++) { > > > > > > + start = dev->mmio[i].table_base_addr; > > > > > > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > > > > > > + dev->mmio[i].max_entries_nr; > > > > > > + if (addr >= start && addr + len <= end) { > > > > > > + r = i; > > > > > > + break; > > > > > > + } > > > > > > > > > > Yet another potential user of the weight balanced tree ;) > > > > > > > > Yeah, we can improve it later. 
> > > > > > > > > > + } > > > > > > + > > > > > > + return r; > > > > > > +} > > > > > > + > > > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, > > > > > > gpa_t addr, int len, + void *val) > > > > > > +{ > > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > > + struct kvm_msix_mmio *mmio; > > > > > > + int idx, ret = 0, entry, offset, r; > > > > > > + > > > > > > + mutex_lock(&mmio_dev->lock); > > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > > + if (idx < 0) { > > > > > > + ret = -EOPNOTSUPP; > > > > > > + goto out; > > > > > > + } > > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > > + goto out; > > > > > > + > > > > > > + offset = addr & 0xf; > > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > > + goto out; > > > > > > This is not as strict as it could be. Also > > > see below (in write) for an idea how to make the code clearer. > > > > > > > > > + > > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > > > > + if (r) > > > > > > + goto out; > > > > > > Want to give the caller some indication of an error? > > > If not add a comment why. > > > > I don't think we have a proper way to indicate the error now. > > Then let's add a comment, and skip the goto - it's just confusing: > > /* Ignore return value, we don't have a way to indicate the > * error. */ > r = copy_from_user(val, (void __user *)(mmio->table_base_va + > entry * PCI_MSIX_ENTRY_SIZE + offset), len); Um... I really think it's quite obvious in the code. > > > Also the > > table_base_va already passed the checking when MMIO registered. We can > > add more error info later. > > > > > > > > +out: > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > > + return ret; > > > > > > +} > > > > > > + > > > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, > > > > > > gpa_t addr, + int len, const void *val) > > > > > > +{ > > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > > + struct kvm_msix_mmio *mmio; > > > > > > + int idx, entry, offset, ret = 0, r = 0; > > > > > > + gpa_t entry_base; > > > > > > + u32 old_ctrl, new_ctrl; > > > > > > These are really __le32. Pls make them so and then > > > fix up the sparse errors. > > > > > > > > > + u32 *ctrl_pos; > > > > > > get_user should not be done on u32 *, it should be > > > __user. Does not sparse complain about this? > > > > Haven't use sparse to check the patches. Would do it later. > > Please do. It's a lot of code with tricky pointer math, must make sure > we are not accessing userspace pointers directly etc. All following patches has been checked. > > > > > > > + > > > > > > + mutex_lock(&mmio_dev->kvm->lock); > > > > > > + mutex_lock(&mmio_dev->lock); > > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > > + if (idx < 0) { > > > > > > + ret = -EOPNOTSUPP; > > > > > > + goto out; > > > > > > + } > > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > > + goto out; > > > > > > + > > > > > > + offset = addr & 0xF; > > > > > > > > > > nit, this 0xf above. > > > > > > > > So you like another mask? 
> > > > > > Yes, this should be > > > > > > offset = addr % PCI_MSIX_ENTRY_SIZE; > > > > > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > > + goto out; > > > I think it's a good idea to be as strict as possible. > > > Also let's quote the spec. Finally using % will make > > > the logic more transparent. How about the below? > > > > > > /* For all accesses to MSI-X Table and MSI-X PBA fields, software must > > > use > > > > > > aligned full DWORD or aligned full QWORD transactions; otherwise, > > > the result is undefined. */ > > > > > > if (len == 4) { > > > > > > if (addr % 4) > > > > > > goto out; > > > > > > } else if (len == 8) { > > > > > > if (addr % 8) > > > > > > goto out; > > > > > > } else > > > > > > goto out; > > > > > > As read needs same logic it's likely a good idea > > > to add a function for this. > > > > > > > > > + > > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > > > > + > > > > > > + if (get_user(old_ctrl, ctrl_pos)) > > > It's probably a good idea to do access_ok check for the whole > > > entry here, and then use __get_user etc below, these are cheaper. > > > > table_base_va already passed access_ok checking. I am not sure if we need > > to do it again. > > Yes but the process invoking the ioctl can be different from the one > invoking the mmio. I don't understand why it matters here... Oh, I found I missed your original point. You mean we can check the whole entry and call a cheaper __get_user instead of get_user. Well, we can leave this kind of tweak for later... > > > > > > > + goto out; > > > > > > + > > > > > > + /* No allow writing to other fields when entry is unmasked */ > > > Do not allow. > > > > > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > > > + goto out; > > > > > > + > > > > > > + if (copy_to_user((void __user *)(entry_base + offset), val, > > > > > > len)) > > > any such cast is suspect. > > > Is entry_base really gpa? If it was you can not cast it to __user. > > > > HVA. > > Then don't declare it gpa_t Oh, typo... Would update it later. > > > > I think it should be void __user *. > > > > > > > > > + goto out; > > > > > > + > > > > > > + if (get_user(new_ctrl, ctrl_pos)) > > > > > > + goto out; > > > get_user assumes aligned address. But there's no guarantee ctrl_pos > > > is aligned because table_base_va might be misaligned. > > > > table_base_va already passed access_ok() check. > > access_ok done on table_base_va only checks that the address > is not a kernel one. Now table_base_va's alignment has been checked (see the latest patchset), so I think we're fine here. -- regards Yang, Sheng > > > > > > > + > > > > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > > > > + ret = -ENOTSYNC; > > > > > > Better to use != and not <, effectively same. 
> > > > > > Or in the above we can do: > > > bool control; > > > if (len == 4) { > > > > > > if (addr % 4) > > > > > > goto out; > > > > > > control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL; > > > > > > } else if (len == 8) { > > > > > > if (addr % 8) > > > > > > goto out; > > > > > > control = offset == PCI_MSIX_ENTRY_DATA; > > > > > > } else > > > > > > goto out; > > > > > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested > > > > > this first? I'd also prefer to see a direct goto out here since > > > > > it's not entirely obvious that this first 'if' always falls into > > > > > the below 'if'. I'm not a fan of using an error code as a special > > > > > non-error return either. > > > > > > > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to > > > > check new_ctrl to see if they indeed modified ctrl bits. > > > > > > > > I would try to make the logic here more clear. > > > > > > > > > > + if (old_ctrl == new_ctrl) > > > > > > + goto out; > > > > > > + > > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > > > > > > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); > > > > > > > > > > An xor would remove some redundancy here > > > > > > > > > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = > > > > > update_msix_mask_bit(mmio_dev->kvm, mmio, entry, !!(new_ctrl & > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)); > > > > > > > > Oh, yes, sure... > > > Also why not set ret to correct value directly? > > > > > > > > > + if (r || ret) > > > > > > + ret = -ENOTSYNC; > > > > > > when is ret set? And why not to the correct value directly? 
> > > > > > > > > +out: > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > > + mutex_unlock(&mmio_dev->kvm->lock); > > > > > > + return ret; > > > > > > +} > > > > > > + > > > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > > > > > > + .read = msix_table_mmio_read, > > > > > > + .write = msix_table_mmio_write, > > > > > > +}; > > > > > > + > > > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > > > > > > +{ > > > > > > + int ret; > > > > > > + > > > > > > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, > > > > > > &msix_mmio_table_ops); + mutex_init(&kvm->msix_mmio_dev.lock); > > > > > > + kvm->msix_mmio_dev.kvm = kvm; > > > > > > + mutex_lock(&kvm->slots_lock); > > > > > > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > > > > > + &kvm->msix_mmio_dev.table_dev); > > > > > > + mutex_unlock(&kvm->slots_lock); > > > > > > + return ret; > > > > > > +} > > > > > > + > > > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > > > > > > +{ > > > > > > + int ret; > > > > > > + > > > > > > + mutex_lock(&kvm->slots_lock); > > > > > > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > > > > > + &kvm->msix_mmio_dev.table_dev); > > > > > > + mutex_unlock(&kvm->slots_lock); > > > > > > + return ret; > > > > > > +} > > > > > > + > > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > > > +{ > > > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > > > + struct kvm_msix_mmio *mmio = NULL; > > > > > > + int r = 0, i; > > > > > > + > > > > > > + mutex_lock(&mmio_dev->lock); > > > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > > > > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > > > > > + mmio = &mmio_dev->mmio[i]; > > > > > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + break; > > > > > > + } > > > > > > + } > > > > > > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + /* All reserved currently */ > > > > > > + if (mmio_user->flags) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + > > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > > > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + > > > > > > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > > > > > > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + if (!mmio) { > > > > > > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > > > > > > + r = -ENOSPC; > > > > > > + goto out; > > > > > > + } > > > > > > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > > > > > > + } > > > > > > + > > > > > > + mmio_dev->mmio_nr++; > > > > > > + mmio->max_entries_nr = mmio_user->max_entries_nr; > > > > > > + mmio->dev_id = mmio_user->dev_id; > > > > > > + mmio->flags = mmio_user->flags; > > > > > > + > > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > > > + 
KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > > > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == > > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > > > + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > > > + mmio->table_base_addr = mmio_user->base_addr; > > > > > > + mmio->table_base_va = mmio_user->base_va; > > > > > > Also check alignment for these values. base_va should be > > > at least 4 byte aligned. Maybe more, need to think about it ... > > > Look in spec for table_base_addr alignment. > > > > OK. > > > > -- > > regards > > Yang, Sheng > > > > > > > > + } > > > > > > +out: > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > > + return r; > > > > > > +} > > > > > > + > > > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > > > > > *mmio) +{ > > > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > > > + int r = 0, i, j; > > > > > > > > > > set r = -EINVAL here > > > > > > > > > > > + bool found = 0; > > > > > > + > > > > > > + if (!mmio) > > > > > > + return 0; > > > > > > + > > > > > > + mutex_lock(&mmio_dev->lock); > > > > > > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > > > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > > > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > > > > > > + mmio_dev->mmio[i].type == mmio->type) { > > > > > > + found = true; > > > > > > > > > > set r = 0 here > > > > > > > > > > > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > > > > > > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > > > > > > + mmio_dev->mmio_nr--; > > > > > > + break; > > > > > > + } > > > > > > + } > > > > > > + if (!found) > > > > > > + r = -EINVAL; > > > > > > > > > > then get rid of found > > > > > > > > Sure. > > > > > > > > -- > > > > regards > > > > Yang, Sheng > > > > > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > > + return r; > > > > > > +} > > > > > > + > > > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > > > +{ > > > > > > + struct kvm_msix_mmio mmio; > > > > > > + > > > > > > + mmio.dev_id = mmio_user->dev_id; > > > > > > + mmio.type = mmio_user->type; > > > > > > + > > > > > > + return kvm_free_msix_mmio(kvm, &mmio); > > > > > > +} > > > > > > + > > > > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > > > > > > new file mode 100644 > > > > > > index 0000000..01b6587 > > > > > > --- /dev/null > > > > > > +++ b/virt/kvm/msix_mmio.h > > > > > > @@ -0,0 +1,25 @@ > > > > > > +#ifndef __KVM_MSIX_MMIO_H__ > > > > > > +#define __KVM_MSIX_MMIO_H__ > > > > > > +/* > > > > > > + * MSI-X MMIO emulation > > > > > > + * > > > > > > + * Copyright (c) 2010 Intel Corporation > > > > > > + * > > > > > > + * This work is licensed under the terms of the GNU GPL, version > > > > > > 2. See + * the COPYING file in the top-level directory. 
> > > > > > + * > > > > > > + * Author: > > > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > > > + */ > > > > > > + > > > > > > +#include <linux/pci.h> > > > > > > + > > > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > > > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > > > > > *mmio_user); + > > > > > > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
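For illustration, the shared check suggested in the review above could be factored out roughly like this (a sketch of the reviewers' proposal only, not code from the series; it assumes the kernel's 16-byte PCI_MSIX_ENTRY_SIZE and the PCI_MSIX_ENTRY_VECTOR_CTRL offset of 12):

    /*
     * "For all accesses to MSI-X Table and MSI-X PBA fields, software must
     * use aligned full DWORD or aligned full QWORD transactions; otherwise,
     * the result is undefined." -- PCI spec, as quoted in the review.
     *
     * Returns 0 for a naturally aligned 4- or 8-byte access, -EINVAL
     * otherwise; *touches_ctrl is set when the access covers the Vector
     * Control DWORD, which is the last DWORD of the 16-byte entry.
     */
    static int msix_table_access_ok(gpa_t addr, int len, bool *touches_ctrl)
    {
            int offset = addr % PCI_MSIX_ENTRY_SIZE;

            if (len != 4 && len != 8)
                    return -EINVAL;
            if (addr % len)
                    return -EINVAL;
            *touches_ctrl = (offset + len == PCI_MSIX_ENTRY_VECTOR_CTRL + 4);
            return 0;
    }

Both msix_table_mmio_read() and msix_table_mmio_write() could then call this one helper, which is what the "add a function for this" remark is asking for.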
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 8:45 ` Michael S. Tsirkin 2011-02-24 8:08 ` Sheng Yang @ 2011-02-24 9:44 ` Sheng Yang 2011-02-24 10:17 ` Michael S. Tsirkin 1 sibling, 1 reply; 20+ messages in thread From: Sheng Yang @ 2011-02-24 9:44 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > Then we can support mask bit operation of assigned devices now. > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > should be able to hook this into vfio with a small stub in kvm. We > > > just need to be able to communicate disabling and enabling of > > > individual msix vectors. For brute force, we could do this with a lot > > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X > > > vector. Not sure if there's something smaller that could do it. > > > Thanks, > > > > Alex, thanks for your comments. See my comments below: > > > Alex > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > --- > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > arch/x86/kvm/x86.c | 8 +- > > > > include/linux/kvm.h | 21 ++++ > > > > include/linux/kvm_host.h | 25 ++++ > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > virt/kvm/msix_mmio.c | 286 > > > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > > > > > > > | 25 ++++ > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > index f15501f..3a0d851 100644 > > > > --- a/arch/x86/kvm/Makefile > > > > +++ b/arch/x86/kvm/Makefile > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > - assigned-dev.o) > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > > > async_pf.o) > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > index fa708c9..89bf12c 100644 > > > > --- a/arch/x86/kvm/x86.c > > > > +++ b/arch/x86/kvm/x86.c > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > case KVM_CAP_XSAVE: > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > + case KVM_CAP_MSIX_MMIO: > > > > r = 1; > > > > break; > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > @@ -3807,6 +3808,7 @@ static int > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > { > > > > > > > > gpa_t gpa; > > > > > > > > + int r; > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > mmio: > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > /* > > > > > > > > * Is this MMIO handled locally? 
> > > > */ > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > + if (!r) > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > This use of -ENOTSYNC is IMO confusing. > How about we make vcpu_mmio_write return the positive exit reason? > Negative value will mean an error. In fact currently a negative value means something more needs to be done, the same as an MMIO exit. Now I think we can keep it, or update them all later. -- regards Yang, Sheng ^ permalink raw reply [flat|nested] 20+ messages in thread
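To make the convention under discussion concrete, this is roughly how a guest write to the MSI-X table travels through this version of the series (condensed from the hunks quoted above, for illustration only):

    /* msix_table_mmio_write(): the access was consumed in the kernel, but
     * the MSI address/data changed, so userspace must refresh its routing. */
    ret = -ENOTSYNC;

    /* kvm_io_bus_write(): 0 means fully handled, -ENOTSYNC means handled
     * but userspace attention is needed, -EOPNOTSUPP means no device
     * claimed the access. */

    /* emulator_write_emulated_onepage(): map the result onto an exit. */
    r = vcpu_mmio_write(vcpu, gpa, bytes, val);
    if (!r)
            return X86EMUL_CONTINUE;
    vcpu->mmio_needed = 1;
    vcpu->run->exit_reason = (r == -ENOTSYNC) ?
            KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;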
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-24 9:44 ` Sheng Yang @ 2011-02-24 10:17 ` Michael S. Tsirkin 2011-02-25 6:12 ` Sheng Yang 0 siblings, 1 reply; 20+ messages in thread From: Michael S. Tsirkin @ 2011-02-24 10:17 UTC (permalink / raw) To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote: > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > Then we can support mask bit operation of assigned devices now. > > > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > > should be able to hook this into vfio with a small stub in kvm. We > > > > just need to be able to communicate disabling and enabling of > > > > individual msix vectors. For brute force, we could do this with a lot > > > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X > > > > vector. Not sure if there's something smaller that could do it. > > > > Thanks, > > > > > > Alex, thanks for your comments. See my comments below: > > > > Alex > > > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > > --- > > > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > > arch/x86/kvm/x86.c | 8 +- > > > > > include/linux/kvm.h | 21 ++++ > > > > > include/linux/kvm_host.h | 25 ++++ > > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > > virt/kvm/msix_mmio.c | 286 > > > > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > > > > > > > > > | 25 ++++ > > > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > > index f15501f..3a0d851 100644 > > > > > --- a/arch/x86/kvm/Makefile > > > > > +++ b/arch/x86/kvm/Makefile > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
> > > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o > \ > > > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > > > - assigned-dev.o) > > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > > > > async_pf.o) > > > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > > index fa708c9..89bf12c 100644 > > > > > --- a/arch/x86/kvm/x86.c > > > > > +++ b/arch/x86/kvm/x86.c > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > > case KVM_CAP_XSAVE: > > > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > > + case KVM_CAP_MSIX_MMIO: > > > > > r = 1; > > > > > break; > > > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > > @@ -3807,6 +3808,7 @@ static int > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > > > { > > > > > > > > > > gpa_t gpa; > > > > > > > > > > + int r; > > > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > mmio: > > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > > > /* > > > > > > > > > > * Is this MMIO handled locally? > > > > > */ > > > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > > + if (!r) > > > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > This use of -ENOTSYNC is IMO confusing. > > How about we make vcpu_mmio_write return the positive exit reason? > > Negative value will mean an error. > > In fact currently nagative value means something more need to be done, the same as > MMIO exit. So it would be if (!r) return X86EMUL_CONTINUE; vcpu->run->exit_reason = r; > Now I think we can keep it, or update them all later. The way to do this would be 1. patch to return KVM_EXIT_MMIO on mmio 2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top > > -- > regards > Yang, Sheng ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-24 10:17 ` Michael S. Tsirkin @ 2011-02-25 6:12 ` Sheng Yang 2011-02-25 8:46 ` Michael S. Tsirkin 0 siblings, 1 reply; 20+ messages in thread From: Sheng Yang @ 2011-02-25 6:12 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Thursday 24 February 2011 18:17:34 Michael S. Tsirkin wrote: > On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote: > > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > > Then we can support mask bit operation of assigned devices now. > > > > > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > > > should be able to hook this into vfio with a small stub in kvm. We > > > > > just need to be able to communicate disabling and enabling of > > > > > individual msix vectors. For brute force, we could do this with a > > > > > lot of eventfds, triggered by kvm and consumed by vfio, two per > > > > > MSI-X vector. Not sure if there's something smaller that could do > > > > > it. Thanks, > > > > > > > > Alex, thanks for your comments. See my comments below: > > > > > Alex > > > > > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > > > --- > > > > > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > > > arch/x86/kvm/x86.c | 8 +- > > > > > > include/linux/kvm.h | 21 ++++ > > > > > > include/linux/kvm_host.h | 25 ++++ > > > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > > > virt/kvm/msix_mmio.c | 286 > > > > > > ++++++++++++++++++++++++++++++++++++++++++++++ > > > > > > virt/kvm/msix_mmio.h > > > > > > > > > > > > | 25 ++++ > > > > > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > > > index f15501f..3a0d851 100644 > > > > > > --- a/arch/x86/kvm/Makefile > > > > > > +++ b/arch/x86/kvm/Makefile > > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
> > > > > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o > > > > \ > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > > > > > - assigned-dev.o) > > > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, > > > > > > iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix > > > > > > ../../../virt/kvm/, async_pf.o) > > > > > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > > > index fa708c9..89bf12c 100644 > > > > > > --- a/arch/x86/kvm/x86.c > > > > > > +++ b/arch/x86/kvm/x86.c > > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > > > case KVM_CAP_XSAVE: > > > > > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > > > + case KVM_CAP_MSIX_MMIO: > > > > > > r = 1; > > > > > > break; > > > > > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > > > @@ -3807,6 +3808,7 @@ static int > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > > > > > { > > > > > > > > > > > > gpa_t gpa; > > > > > > > > > > > > + int r; > > > > > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > mmio: > > > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > > > > > /* > > > > > > > > > > > > * Is this MMIO handled locally? > > > > > > */ > > > > > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > > > + if (!r) > > > > > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > > > This use of -ENOTSYNC is IMO confusing. > > > How about we make vcpu_mmio_write return the positive exit reason? > > > Negative value will mean an error. > > > > In fact currently nagative value means something more need to be done, > > the same as MMIO exit. > > So it would be > if (!r) > return X86EMUL_CONTINUE; > > vcpu->run->exit_reason = r; > > > Now I think we can keep it, or update them all later. > > The way to do this would be > 1. patch to return KVM_EXIT_MMIO on mmio > 2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top It's not that straightforward. In most conditions, the reason vcpu_mmio_write() < 0 is that KVM itself is unable to complete the request. That's quite straightforward. But each handler in the chain can't decide that it would be "KVM_EXIT_MMIO"; they can only know when all of them fail to handle the access. I am not sure we want every single handler to say "I want KVM_EXIT_MMIO" instead of returning an error. We can discuss this more, but since it's not an API/ABI change, I think we can get the patch in first. -- regards Yang, Sheng > > > -- > > regards > > Yang, Sheng ^ permalink raw reply [flat|nested] 20+ messages in thread
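Sheng's point about the handler chain is easiest to see in the kvm_io_bus_write() hunk from patch 2, reproduced here in simplified form (a sketch, not the literal patch):

    int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                         int len, const void *val)
    {
            int i, r = -EOPNOTSUPP;
            struct kvm_io_bus *bus = srcu_dereference(kvm->buses[bus_idx],
                                                      &kvm->srcu);

            for (i = 0; i < bus->dev_count; i++) {
                    r = kvm_iodevice_write(bus->devs[i], addr, len, val);
                    if (r == -ENOTSYNC)     /* handled, userspace must resync */
                            break;
                    else if (!r)            /* fully handled in the kernel */
                            return 0;
            }
            return r;       /* still -EOPNOTSUPP if every device declined */
    }

Each device only knows whether it handled the access itself; "nobody handled it, so exit to userspace with KVM_EXIT_MMIO" is a conclusion only the caller of this loop can draw.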
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-25 6:12 ` Sheng Yang @ 2011-02-25 8:46 ` Michael S. Tsirkin 0 siblings, 0 replies; 20+ messages in thread From: Michael S. Tsirkin @ 2011-02-25 8:46 UTC (permalink / raw) To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Fri, Feb 25, 2011 at 02:12:42PM +0800, Sheng Yang wrote: > On Thursday 24 February 2011 18:17:34 Michael S. Tsirkin wrote: > > On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote: > > > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > > > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > > > Then we can support mask bit operation of assigned devices now. > > > > > > > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > > > > should be able to hook this into vfio with a small stub in kvm. We > > > > > > just need to be able to communicate disabling and enabling of > > > > > > individual msix vectors. For brute force, we could do this with a > > > > > > lot of eventfds, triggered by kvm and consumed by vfio, two per > > > > > > MSI-X vector. Not sure if there's something smaller that could do > > > > > > it. Thanks, > > > > > > > > > > Alex, thanks for your comments. See my comments below: > > > > > > Alex > > > > > > > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > > > > --- > > > > > > > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > > > > arch/x86/kvm/x86.c | 8 +- > > > > > > > include/linux/kvm.h | 21 ++++ > > > > > > > include/linux/kvm_host.h | 25 ++++ > > > > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > > > > virt/kvm/msix_mmio.c | 286 > > > > > > > ++++++++++++++++++++++++++++++++++++++++++++++ > > > > > > > virt/kvm/msix_mmio.h > > > > > > > > > > > > > > | 25 ++++ > > > > > > > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > > > > index f15501f..3a0d851 100644 > > > > > > > --- a/arch/x86/kvm/Makefile > > > > > > > +++ b/arch/x86/kvm/Makefile > > > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
> > > > > > > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o > ioapic.o > > > > > > \ > > > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > > > > > > > - assigned-dev.o) > > > > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, > > > > > > > iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix > > > > > > > ../../../virt/kvm/, async_pf.o) > > > > > > > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > > > > index fa708c9..89bf12c 100644 > > > > > > > --- a/arch/x86/kvm/x86.c > > > > > > > +++ b/arch/x86/kvm/x86.c > > > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > > > > case KVM_CAP_XSAVE: > > > > > > > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > > > > + case KVM_CAP_MSIX_MMIO: > > > > > > > r = 1; > > > > > > > break; > > > > > > > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > > > > @@ -3807,6 +3808,7 @@ static int > > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > > > > > > > { > > > > > > > > > > > > > > gpa_t gpa; > > > > > > > > > > > > > > + int r; > > > > > > > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > > > mmio: > > > > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > > > > > > > /* > > > > > > > > > > > > > > * Is this MMIO handled locally? > > > > > > > */ > > > > > > > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > > > > + if (!r) > > > > > > > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > > > > > This use of -ENOTSYNC is IMO confusing. > > > > How about we make vcpu_mmio_write return the positive exit reason? > > > > Negative value will mean an error. > > > > > > In fact currently nagative value means something more need to be done, > > > the same as MMIO exit. > > > > So it would be > > if (!r) > > return X86EMUL_CONTINUE; > > > > vcpu->run->exit_reason = r; > > > > > Now I think we can keep it, or update them all later. > > > > The way to do this would be > > 1. patch to return KVM_EXIT_MMIO on mmio > > 2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top > > It's not that straightforward. In most condition, the reason vcpu_mmio_write() < 0 > because KVM itself unable to complete the request. That's quite straightforward. > But each handler in the chain can't decided it would be "KVM_EXIT_MMIO", they can > only know when all of them fail to handle the accessing. > Well, this just violates the standard API for return value. The standard semantics are: - negative - error - positive - not an error So you can just return a positive KVM_EXIT_MSIX_ROUTING_UPDATE instead of ENOTSYNC. This way you do not need to rework all code and you don't spread MSIX all over kvm. You just do: (r > 0) ? r : KVM_EXIT_MMIO > I am not sure if we like every single handler said "I want KVM_EXIT_MMIO" instead > of a error return. 
> We can discuss more on this, but since it's not API/ABI change, > I think we can get the patch in first. > -- > regards > Yang, Sheng > > > > > > -- > > > regards > > > Yang, Sheng ^ permalink raw reply [flat|nested] 20+ messages in thread
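A minimal sketch of the convention Michael is arguing for here, assuming handlers return 0 for "handled", a positive KVM_EXIT_* code for "handled but an exit is required", and a negative value only for real errors:

    r = vcpu_mmio_write(vcpu, gpa, bytes, val);
    if (!r)
            return X86EMUL_CONTINUE;

    vcpu->mmio_needed = 1;
    /* e.g. KVM_EXIT_MSIX_ROUTING_UPDATE straight from the MSI-X handler;
     * fall back to KVM_EXIT_MMIO when no in-kernel device claimed it. */
    vcpu->run->exit_reason = (r > 0) ? r : KVM_EXIT_MMIO;

This would keep the MSI-X-specific exit reason out of the generic emulator path, which is the "don't spread MSIX all over kvm" concern above.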
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 6:59 ` Sheng Yang 2011-02-23 8:45 ` Michael S. Tsirkin @ 2011-02-23 16:34 ` Alex Williamson 2011-02-23 18:39 ` Michael S. Tsirkin 1 sibling, 1 reply; 20+ messages in thread From: Alex Williamson @ 2011-02-23 16:34 UTC (permalink / raw) To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Michael S. Tsirkin, kvm On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote: > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, > > > int len, + void *val) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > + struct kvm_msix_mmio *mmio; > > > + int idx, ret = 0, entry, offset, r; > > > + > > > + mutex_lock(&mmio_dev->lock); > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > + if (idx < 0) { > > > + ret = -EOPNOTSUPP; > > > + goto out; > > > + } > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > + goto out; > > > + > > > + offset = addr & 0xf; > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > + goto out; > > > + > > > + mmio = &mmio_dev->mmio[idx]; > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > + if (r) > > > + goto out; > > > +out: > > > + mutex_unlock(&mmio_dev->lock); > > > + return ret; > > > +} > > > + > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > > > + int len, const void *val) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > + struct kvm_msix_mmio *mmio; > > > + int idx, entry, offset, ret = 0, r = 0; > > > + gpa_t entry_base; > > > + u32 old_ctrl, new_ctrl; > > > + u32 *ctrl_pos; > > > + > > > + mutex_lock(&mmio_dev->kvm->lock); > > > + mutex_lock(&mmio_dev->lock); > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > + if (idx < 0) { > > > + ret = -EOPNOTSUPP; > > > + goto out; > > > + } > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > + goto out; > > > + > > > + offset = addr & 0xF; > > > > nit, this 0xf above. > > So you like another mask? I'm just noting that above you used 0xf and here you used 0xF. > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > + goto out; > > > + > > > + mmio = &mmio_dev->mmio[idx]; > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > + > > > + if (get_user(old_ctrl, ctrl_pos)) > > > + goto out; > > > + > > > + /* No allow writing to other fields when entry is unmasked */ > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > + goto out; > > > + > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > > + goto out; > > > + > > > + if (get_user(new_ctrl, ctrl_pos)) > > > + goto out; > > > + > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > + ret = -ENOTSYNC; > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > first? 
I'd also prefer to see a direct goto out here since it's not > > entirely obvious that this first 'if' always falls into the below 'if'. > > I'm not a fan of using an error code as a special non-error return > > either. > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to > > see if they indeed modified ctrl bits. > > > > I would try to make the logic here more clear. Isn't that what you're testing for though? (offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) If this is true, we can only be writing address, upper address, or data, not control. (offset < PCI_MSIX_ENTRY_DATA && len == 8) If this is true, we're writing address and upper address, not the control vector. Additionally, I think the size and alignment test should be more restrictive. We really only want to allow naturally aligned 4 or 8 byte accesses. Maybe instead of: if ((addr & 0x3) || (len != 4 && len != 8)) we should do this: if (!(len == 4 || len == 8) || addr & (len - 1)) Then the test could become: if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) Thanks, Alex ^ permalink raw reply [flat|nested] 20+ messages in thread
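To see exactly which accesses Alex's two tests admit, a small standalone program can enumerate them (a hypothetical check harness, not part of the series; the offsets follow the PCI spec layout: address 0, upper address 4, data 8, vector control 12, 16 bytes per entry):

    #include <stdio.h>

    #define PCI_MSIX_ENTRY_SIZE         16
    #define PCI_MSIX_ENTRY_VECTOR_CTRL  12

    int main(void)
    {
            int len, addr;

            for (len = 1; len <= 8; len++)
                    for (addr = 0; addr < PCI_MSIX_ENTRY_SIZE; addr++) {
                            /* Alex's size/alignment test */
                            if (!(len == 4 || len == 8) || (addr & (len - 1)))
                                    continue;
                            /* Alex's "entirely below Vector Control" test */
                            printf("len=%d offset=%2d needs_routing_update=%d\n",
                                   len, addr,
                                   addr + len <= PCI_MSIX_ENTRY_VECTOR_CTRL);
                    }
            return 0;
    }

Only len=4 at offsets 0/4/8/12 and len=8 at offsets 0/8 pass the filter, and of those only (offset 12, len 4) and (offset 8, len 8) reach the Vector Control word, matching the case analysis above.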
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 16:34 ` Alex Williamson @ 2011-02-23 18:39 ` Michael S. Tsirkin 2011-02-23 19:02 ` Alex Williamson 0 siblings, 1 reply; 20+ messages in thread From: Michael S. Tsirkin @ 2011-02-23 18:39 UTC (permalink / raw) To: Alex Williamson; +Cc: Sheng Yang, Avi Kivity, Marcelo Tosatti, kvm On Wed, Feb 23, 2011 at 09:34:19AM -0700, Alex Williamson wrote: > On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote: > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, > > > > int len, + void *val) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > + struct kvm_msix_mmio *mmio; > > > > + int idx, ret = 0, entry, offset, r; > > > > + > > > > + mutex_lock(&mmio_dev->lock); > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > + if (idx < 0) { > > > > + ret = -EOPNOTSUPP; > > > > + goto out; > > > > + } > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > + goto out; > > > > + > > > > + offset = addr & 0xf; > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > + goto out; > > > > + > > > > + mmio = &mmio_dev->mmio[idx]; > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > > + if (r) > > > > + goto out; > > > > +out: > > > > + mutex_unlock(&mmio_dev->lock); > > > > + return ret; > > > > +} > > > > + > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > > > > + int len, const void *val) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > + struct kvm_msix_mmio *mmio; > > > > + int idx, entry, offset, ret = 0, r = 0; > > > > + gpa_t entry_base; > > > > + u32 old_ctrl, new_ctrl; > > > > + u32 *ctrl_pos; > > > > + > > > > + mutex_lock(&mmio_dev->kvm->lock); > > > > + mutex_lock(&mmio_dev->lock); > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > + if (idx < 0) { > > > > + ret = -EOPNOTSUPP; > > > > + goto out; > > > > + } > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > + goto out; > > > > + > > > > + offset = addr & 0xF; > > > > > > nit, this 0xf above. > > > > So you like another mask? > > I'm just noting that above you used 0xf and here you used 0xF. 
> > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > + goto out; > > > > + > > > > + mmio = &mmio_dev->mmio[idx]; > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > > + > > > > + if (get_user(old_ctrl, ctrl_pos)) > > > > + goto out; > > > > + > > > > + /* No allow writing to other fields when entry is unmasked */ > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > + goto out; > > > > + > > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > > > + goto out; > > > > + > > > > + if (get_user(new_ctrl, ctrl_pos)) > > > > + goto out; > > > > + > > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > > + ret = -ENOTSYNC; > > > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > > first? I'd also prefer to see a direct goto out here since it's not > > > entirely obvious that this first 'if' always falls into the below 'if'. > > > I'm not a fan of using an error code as a special non-error return > > > either. > > > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to > > see if they indeed modified ctrl bits. > > > > I would try to make the logic here more clear. > > Isn't that what you're testing for though? > > (offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) > > If this is true, we can only be writing address, upper address, or data, > not control. > > (offset < PCI_MSIX_ENTRY_DATA && len == 8) > > If this is true, we're writing address and upper address, not the > control vector. > > Additionally, I think the size an alignment test should be more > restrictive. We really only want to allow naturally aligned 4 or 8 byte > accesses. Maybe instead of: > > if ((addr & 0x3) || (len != 4 && len != 8)) > > we should do this: > > if (!(len == 4 || len == 8) || addr & (len - 1)) > > Then the test could become: > > if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) > > Thanks, > > Alex Or rather if (offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4) we are matching offset + length to control offset + control length, and avoid depending on the fact control is the last register. -- MST ^ permalink raw reply [flat|nested] 20+ messages in thread
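For completeness: on every access that survives the natural-alignment filter, Michael's predicate and Alex's give the same answer, precisely because Vector Control is the last DWORD of a 16-byte entry. A quick standalone check (illustrative only, not part of the series):

    #include <assert.h>

    #define PCI_MSIX_ENTRY_VECTOR_CTRL  12

    int main(void)
    {
            int len, offset;

            for (len = 4; len <= 8; len += 4)
                    for (offset = 0; offset < 16; offset += len) {
                            int below = offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL;
                            int not_ctrl = offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4;

                            assert(below == not_ctrl); /* holds for all six cases */
                    }
            return 0;
    }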
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 18:39 ` Michael S. Tsirkin @ 2011-02-23 19:02 ` Alex Williamson 0 siblings, 0 replies; 20+ messages in thread From: Alex Williamson @ 2011-02-23 19:02 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Sheng Yang, Avi Kivity, Marcelo Tosatti, kvm On Wed, 2011-02-23 at 20:39 +0200, Michael S. Tsirkin wrote: > On Wed, Feb 23, 2011 at 09:34:19AM -0700, Alex Williamson wrote: > > On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote: > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, > > > > > int len, + void *val) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > + struct kvm_msix_mmio *mmio; > > > > > + int idx, ret = 0, entry, offset, r; > > > > > + > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > + if (idx < 0) { > > > > > + ret = -EOPNOTSUPP; > > > > > + goto out; > > > > > + } > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > + goto out; > > > > > + > > > > > + offset = addr & 0xf; > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > + goto out; > > > > > + > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > > > + if (r) > > > > > + goto out; > > > > > +out: > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > + return ret; > > > > > +} > > > > > + > > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > > > > > + int len, const void *val) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > + struct kvm_msix_mmio *mmio; > > > > > + int idx, entry, offset, ret = 0, r = 0; > > > > > + gpa_t entry_base; > > > > > + u32 old_ctrl, new_ctrl; > > > > > + u32 *ctrl_pos; > > > > > + > > > > > + mutex_lock(&mmio_dev->kvm->lock); > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > + if (idx < 0) { > > > > > + ret = -EOPNOTSUPP; > > > > > + goto out; > > > > > + } > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > + goto out; > > > > > + > > > > > + offset = addr & 0xF; > > > > > > > > nit, this 0xf above. > > > > > > So you like another mask? > > > > I'm just noting that above you used 0xf and here you used 0xF. 
> > > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > + goto out; > > > > > + > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > > > + > > > > > + if (get_user(old_ctrl, ctrl_pos)) > > > > > + goto out; > > > > > + > > > > > + /* No allow writing to other fields when entry is unmasked */ > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > > + goto out; > > > > > + > > > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > > > > + goto out; > > > > > + > > > > > + if (get_user(new_ctrl, ctrl_pos)) > > > > > + goto out; > > > > > + > > > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > > > + ret = -ENOTSYNC; > > > > > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > > > first? I'd also prefer to see a direct goto out here since it's not > > > > entirely obvious that this first 'if' always falls into the below 'if'. > > > > I'm not a fan of using an error code as a special non-error return > > > > either. > > > > > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to > > > see if they indeed modified ctrl bits. > > > > > > I would try to make the logic here more clear. > > > > Isn't that what you're testing for though? > > > > (offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) > > > > If this is true, we can only be writing address, upper address, or data, > > not control. > > > > (offset < PCI_MSIX_ENTRY_DATA && len == 8) > > > > If this is true, we're writing address and upper address, not the > > control vector. > > > > Additionally, I think the size an alignment test should be more > > restrictive. We really only want to allow naturally aligned 4 or 8 byte > > accesses. Maybe instead of: > > > > if ((addr & 0x3) || (len != 4 && len != 8)) > > > > we should do this: > > > > if (!(len == 4 || len == 8) || addr & (len - 1)) > > > > Then the test could become: > > > > if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > Thanks, > > > > Alex > > Or rather > > if (offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4) > > we are matching offset + length > to control offset + control length, > and avoid depending on the fact control is the last register. That's just silly. We're writing code to match a hardware specification. The registers always land at the same place, are always the same size and always have the alignment requirements dictated in the spec. I see no problem depending on that. They both do the same thing, I prefer my version. Alex ^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 3/3] KVM: Add documents for MSI-X MMIO API 2011-01-30 5:11 [PATCH 0/3 v8] MSI-X MMIO support for KVM Sheng Yang 2011-01-30 5:11 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang 2011-01-30 5:11 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang @ 2011-01-30 5:11 ` Sheng Yang 2 siblings, 0 replies; 20+ messages in thread From: Sheng Yang @ 2011-01-30 5:11 UTC (permalink / raw) To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang Signed-off-by: Sheng Yang <sheng@linux.intel.com> --- Documentation/kvm/api.txt | 47 +++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 47 insertions(+), 0 deletions(-) diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index e1a9297..e6b7a1d 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt @@ -1263,6 +1263,53 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +4.54 KVM_REGISTER_MSIX_MMIO + +Capability: KVM_CAP_MSIX_MMIO +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_msix_mmio_user (in) +Returns: 0 on success, -1 on error + +This API registers the MSI-X MMIO address of a guest device. All MMIO +operations on it would then be handled by the kernel. When necessary (e.g. when the +MSI data/address changed), KVM would exit to userspace with KVM_EXIT_MSIX_ROUTING_UPDATE to +indicate the MMIO modification and require userspace to update the IRQ routing +table. + +NOTICE: Writing the MSI-X MMIO page after it was registered with this API may +be dangerous for the userspace program. Writing it while the VM is running may result +in synchronization issues, so the assigned device can't work properly. +Writing is allowed when the VM is not running, and can be used as a save/restore +mechanism. + +struct kvm_msix_mmio_user { + __u32 dev_id; + __u16 type; /* Device type and MMIO address type */ + __u16 max_entries_nr; /* Maximum entries supported */ + __u64 base_addr; /* Guest physical address of MMIO */ + __u64 base_va; /* Host virtual address of MMIO mapping */ + __u64 flags; /* Reserved for now */ + __u64 reserved[4]; +}; + +Current device type can be: +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) + +Current MMIO type can be: +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) + +4.55 KVM_UNREGISTER_MSIX_MMIO + +Capability: KVM_CAP_MSIX_MMIO +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_msix_mmio_user (in) +Returns: 0 on success, -1 on error + +This API would unregister the specific MSI-X MMIO, indicated by the dev_id and +type fields of struct kvm_msix_mmio_user. + 5. The kvm_run structure Application code obtains a pointer to the kvm_run structure by -- 1.7.0.1 ^ permalink raw reply related [flat|nested] 20+ messages in thread
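For readers trying to picture the proposed API in use, here is a rough userspace sketch (hypothetical usage based only on the documentation above; struct kvm_msix_mmio_user and KVM_REGISTER_MSIX_MMIO exist only with this series applied, and the device id and addresses are placeholders):

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>  /* headers with this series applied */

    static int register_msix_table(int vm_fd, uint32_t dev_id, uint16_t entries,
                                   uint64_t table_gpa, void *table_hva)
    {
            struct kvm_msix_mmio_user mmio;

            memset(&mmio, 0, sizeof(mmio));
            mmio.dev_id = dev_id;
            /* assigned device, MSI-X table MMIO */
            mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
                        KVM_MSIX_MMIO_TYPE_BASE_TABLE;
            mmio.max_entries_nr = entries;          /* <= KVM_MAX_MSIX_PER_DEV */
            mmio.base_addr = table_gpa;             /* guest physical address */
            mmio.base_va = (uintptr_t)table_hva;    /* host virtual mapping */
            /* mmio.flags is reserved and must stay zero */

            return ioctl(vm_fd, KVM_REGISTER_MSIX_MMIO, &mmio);
    }

On a KVM_EXIT_MSIX_ROUTING_UPDATE exit, userspace would re-read the affected entry through table_hva and refresh its IRQ routing before re-entering the guest; KVM_UNREGISTER_MSIX_MMIO with the same dev_id and type tears the registration down.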