public inbox for kvm@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2 v6] MSI-X mask bit support for KVM
@ 2010-12-22  8:44 Sheng Yang
  2010-12-22  8:44 ` [PATCH 1/2] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
                   ` (2 more replies)
  0 siblings, 3 replies; 32+ messages in thread
From: Sheng Yang @ 2010-12-22  8:44 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang

This patchset didn't include two PCI related patches which would be checked
in through the PCI subsystem.

Would add the API document soon.

Change from v5:
Completely rewritten according to Avi's comments.

Sheng Yang (2):
  KVM: Move struct kvm_io_device to kvm_host.h
  KVM: Emulate MSI-X table and PBA in kernel

 arch/x86/kvm/Makefile    |    2 +-
 arch/x86/kvm/x86.c       |    8 +-
 include/linux/kvm.h      |   22 ++++
 include/linux/kvm_host.h |   48 +++++++++
 virt/kvm/assigned-dev.c  |   30 ++++++
 virt/kvm/iodev.h         |   25 +-----
 virt/kvm/kvm_main.c      |   38 +++++++-
 virt/kvm/msix_mmio.c     |  244 ++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h     |   24 +++++
 9 files changed, 410 insertions(+), 31 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h


^ permalink raw reply	[flat|nested] 32+ messages in thread

* [PATCH 1/2] KVM: Move struct kvm_io_device to kvm_host.h
  2010-12-22  8:44 [PATCH 0/2 v6] MSI-X mask bit support for KVM Sheng Yang
@ 2010-12-22  8:44 ` Sheng Yang
  2010-12-22  8:44 ` [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel Sheng Yang
  2010-12-28  4:05 ` [PATCH 0/2 v6] MSI-X mask bit support for KVM Sheng Yang
  2 siblings, 0 replies; 32+ messages in thread
From: Sheng Yang @ 2010-12-22  8:44 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang

Then it can be used by other structs in kvm_host.h

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 include/linux/kvm_host.h |   23 +++++++++++++++++++++++
 virt/kvm/iodev.h         |   25 +------------------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ac4e83a..ac026ad 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
 int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
 #endif
 
+struct kvm_io_device;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+	int (*read)(struct kvm_io_device *this,
+		    gpa_t addr,
+		    int len,
+		    void *val);
+	int (*write)(struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     const void *val);
+	void (*destructor)(struct kvm_io_device *this);
+};
+
+struct kvm_io_device {
+	const struct kvm_io_device_ops *ops;
+};
+
 struct kvm_vcpu {
 	struct kvm *kvm;
 #ifdef CONFIG_PREEMPT_NOTIFIERS
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 12fd3ca..d1f5651 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -17,32 +17,9 @@
 #define __KVM_IODEV_H__
 
 #include <linux/kvm_types.h>
+#include <linux/kvm_host.h>
 #include <asm/errno.h>
 
-struct kvm_io_device;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-	int (*read)(struct kvm_io_device *this,
-		    gpa_t addr,
-		    int len,
-		    void *val);
-	int (*write)(struct kvm_io_device *this,
-		     gpa_t addr,
-		     int len,
-		     const void *val);
-	void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-	const struct kvm_io_device_ops *ops;
-};
-
 static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 				     const struct kvm_io_device_ops *ops)
 {
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-22  8:44 [PATCH 0/2 v6] MSI-X mask bit support for KVM Sheng Yang
  2010-12-22  8:44 ` [PATCH 1/2] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
@ 2010-12-22  8:44 ` Sheng Yang
  2010-12-28 12:26   ` Avi Kivity
  2010-12-28  4:05 ` [PATCH 0/2 v6] MSI-X mask bit support for KVM Sheng Yang
  2 siblings, 1 reply; 32+ messages in thread
From: Sheng Yang @ 2010-12-22  8:44 UTC (permalink / raw)
  To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang

Then we can support mask bit operation of assigned devices now.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
---
 arch/x86/kvm/Makefile    |    2 +-
 arch/x86/kvm/x86.c       |    8 +-
 include/linux/kvm.h      |   22 ++++
 include/linux/kvm_host.h |   25 +++++
 virt/kvm/assigned-dev.c  |   30 ++++++
 virt/kvm/kvm_main.c      |   38 +++++++-
 virt/kvm/msix_mmio.c     |  244 ++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/msix_mmio.h     |   24 +++++
 8 files changed, 386 insertions(+), 7 deletions(-)
 create mode 100644 virt/kvm/msix_mmio.c
 create mode 100644 virt/kvm/msix_mmio.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index f15501f..3a0d851 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I.
 
 kvm-y			+= $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
 				coalesced_mmio.o irq_comm.o eventfd.o \
-				assigned-dev.o)
+				assigned-dev.o msix_mmio.o)
 kvm-$(CONFIG_IOMMU_API)	+= $(addprefix ../../../virt/kvm/, iommu.o)
 kvm-$(CONFIG_KVM_ASYNC_PF)	+= $(addprefix ../../../virt/kvm/, async_pf.o)
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ed373ba..0be5837 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1965,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_XSAVE:
 	case KVM_CAP_ASYNC_PF:
+	case KVM_CAP_MSIX_MMIO:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
@@ -3802,6 +3803,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 					   struct kvm_vcpu *vcpu)
 {
 	gpa_t                 gpa;
+	int r;
 
 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
 
@@ -3817,14 +3819,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
 
 mmio:
 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
+	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
 	/*
 	 * Is this MMIO handled locally?
 	 */
-	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
+	if (!r)
 		return X86EMUL_CONTINUE;
 
 	vcpu->mmio_needed = 1;
-	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
+		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
 	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
 	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
 	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index ea2dc1a..44838fe 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -161,6 +161,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_NMI              16
 #define KVM_EXIT_INTERNAL_ERROR   17
 #define KVM_EXIT_OSI              18
+#define KVM_EXIT_MSIX_ROUTING_UPDATE 19
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 #define KVM_INTERNAL_ERROR_EMULATION 1
@@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo {
 #define KVM_CAP_PPC_GET_PVINFO 57
 #define KVM_CAP_PPC_IRQ_LEVEL 58
 #define KVM_CAP_ASYNC_PF 59
+#define KVM_CAP_MSIX_MMIO 60
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -672,6 +674,9 @@ struct kvm_clock_data {
 #define KVM_XEN_HVM_CONFIG        _IOW(KVMIO,  0x7a, struct kvm_xen_hvm_config)
 #define KVM_SET_CLOCK             _IOW(KVMIO,  0x7b, struct kvm_clock_data)
 #define KVM_GET_CLOCK             _IOR(KVMIO,  0x7c, struct kvm_clock_data)
+/* Available with KVM_CAP_MSIX_MMIO */
+#define KVM_REGISTER_MSIX_MMIO    _IOW(KVMIO,  0x7d, struct kvm_msix_mmio_user)
+#define KVM_UNREGISTER_MSIX_MMIO  _IOW(KVMIO,  0x7e, struct kvm_msix_mmio_user)
 /* Available with KVM_CAP_PIT_STATE2 */
 #define KVM_GET_PIT2              _IOR(KVMIO,  0x9f, struct kvm_pit_state2)
 #define KVM_SET_PIT2              _IOW(KVMIO,  0xa0, struct kvm_pit_state2)
@@ -795,4 +800,21 @@ struct kvm_assigned_msix_entry {
 	__u16 padding[3];
 };
 
+#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1 << 0)
+
+#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1 << 8)
+#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1 << 9)
+
+#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
+#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
+struct kvm_msix_mmio_user {
+	__u32 dev_id;
+	__u16 type;
+	__u16 max_entries_nr;
+	__u64 base_addr;
+	__u64 base_va;
+	__u64 flags;
+	__u64 reserved[4];
+};
+
 #endif /* __LINUX_KVM_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ac026ad..15fdd0d 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -231,6 +231,27 @@ struct kvm_memslots {
 					KVM_PRIVATE_MEM_SLOTS];
 };
 
+#define KVM_MSIX_MMIO_MAX    32
+
+struct kvm_msix_mmio {
+	u32 dev_id;
+	u16 type;
+	u16 max_entries_nr;
+	u64 flags;
+	gpa_t table_base_addr;
+	hva_t table_base_va;
+	gpa_t pba_base_addr;
+	hva_t pba_base_va;
+};
+
+struct kvm_msix_mmio_dev {
+	struct kvm *kvm;
+	struct kvm_io_device table_dev;
+	int mmio_nr;
+	struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX];
+	struct mutex lock;
+};
+
 struct kvm {
 	spinlock_t mmu_lock;
 	raw_spinlock_t requests_lock;
@@ -279,6 +300,7 @@ struct kvm {
 	long mmu_notifier_count;
 	long tlbs_dirty;
 #endif
+	struct kvm_msix_mmio_dev msix_mmio_dev;
 };
 
 /* The guest did something we don't support. */
@@ -551,6 +573,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+				int assigned_dev_id, int entry, u32 flag);
+
 /* For vcpu->arch.iommu_flags */
 #define KVM_IOMMU_CACHE_COHERENCY	0x1
 
diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c
index ae72ae6..ec48bfe 100644
--- a/virt/kvm/assigned-dev.c
+++ b/virt/kvm/assigned-dev.c
@@ -785,3 +785,33 @@ out:
 	return r;
 }
 
+int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
+				int assigned_dev_id, int entry, u32 flag)
+{
+	int r = -EFAULT;
+	struct kvm_assigned_dev_kernel *adev;
+	int i;
+
+	if (!irqchip_in_kernel(kvm))
+		return r;
+
+	mutex_lock(&kvm->lock);
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_dev_id);
+	if (!adev)
+		goto out;
+
+	for (i = 0; i < adev->entries_nr; i++)
+		if (adev->host_msix_entries[i].entry == entry) {
+			if (flag)
+				disable_irq_nosync(
+					adev->host_msix_entries[i].vector);
+			else
+				enable_irq(adev->host_msix_entries[i].vector);
+			r = 0;
+			break;
+		}
+out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 83f5bf6..4be7cfe 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -56,6 +56,7 @@
 
 #include "coalesced_mmio.h"
 #include "async_pf.h"
+#include "msix_mmio.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kvm.h>
@@ -521,6 +522,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #else
 	kvm_arch_flush_shadow(kvm);
 #endif
+	kvm_unregister_msix_mmio_dev(kvm);
 	kvm_arch_destroy_vm(kvm);
 	kvm_free_physmem(kvm);
 	cleanup_srcu_struct(&kvm->srcu);
@@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp,
 		mutex_unlock(&kvm->lock);
 		break;
 #endif
+	case KVM_REGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user);
+		break;
+	}
+	case KVM_UNREGISTER_MSIX_MMIO: {
+		struct kvm_msix_mmio_user mmio_user;
+
+		r = -EFAULT;
+		if (copy_from_user(&mmio_user, argp, sizeof mmio_user))
+			goto out;
+		r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user);
+		break;
+	}
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
@@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
 		return r;
 	}
 #endif
+	r = kvm_register_msix_mmio_dev(kvm);
+	if (r < 0) {
+		kvm_put_kvm(kvm);
+		return r;
+	}
+
 	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
 	if (r < 0)
 		kvm_put_kvm(kvm);
@@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
 int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
-	int i;
+	int i, r = -EOPNOTSUPP;
 	struct kvm_io_bus *bus;
 
 	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	for (i = 0; i < bus->dev_count; i++)
-		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
+	for (i = 0; i < bus->dev_count; i++) {
+		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
+		if (r == -ENOTSYNC)
+			break;
+		else if (!r)
 			return 0;
-	return -EOPNOTSUPP;
+	}
+	return r;
 }
 
 /* kvm_io_bus_read - called under kvm->slots_lock */
diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c
new file mode 100644
index 0000000..e6064e0
--- /dev/null
+++ b/virt/kvm/msix_mmio.c
@@ -0,0 +1,244 @@
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+
+#include "msix_mmio.h"
+#include "iodev.h"
+
+static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio,
+				int entry, u32 flag)
+{
+	if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		return kvm_assigned_device_update_msix_mask_bit(kvm,
+				mmio->dev_id, entry, flag);
+	return -EFAULT;
+}
+
+/* Caller must hold dev->lock */
+static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev,
+				gpa_t addr, int len)
+{
+	gpa_t start, end;
+	int i, r = -EINVAL;
+
+	for (i = 0; i < dev->mmio_nr; i++) {
+		start = dev->mmio[i].table_base_addr;
+		end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE *
+			dev->mmio[i].max_entries_nr;
+		if (addr >= start && addr + len <= end) {
+			r = i;
+			break;
+		}
+	}
+
+	return r;
+}
+
+static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+				void *val)
+{
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, ret = 0, entry, offset, r;
+
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if ((addr & 0x3) || (len != 4 && len != 8))
+		goto out;
+	mmio = &mmio_dev->mmio[idx];
+
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	offset = addr & 0xf;
+	r = copy_from_user(val, (void *)(mmio->table_base_va +
+			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
+	if (r)
+		goto out;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return ret;
+}
+
+static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
+				int len, const void *val)
+{
+	struct kvm_msix_mmio_dev *mmio_dev =
+		container_of(this, struct kvm_msix_mmio_dev, table_dev);
+	struct kvm_msix_mmio *mmio;
+	int idx, entry, offset, ret = 0, r = 0;
+	gpa_t entry_base;
+	u32 old_ctrl, new_ctrl;
+
+	mutex_lock(&mmio_dev->lock);
+	idx = get_mmio_table_index(mmio_dev, addr, len);
+	if (idx < 0) {
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+	if ((addr & 0x3) || (len != 4 && len != 8))
+		goto out;
+	mmio = &mmio_dev->mmio[idx];
+	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
+	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
+	offset = addr & 0xF;
+
+	if (copy_from_user(&old_ctrl,
+			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
+			sizeof old_ctrl))
+		goto out;
+
+	/* Do not allow writing to other fields when the entry is unmasked */
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
+		goto out;
+
+	if (copy_to_user(entry_base + offset, val, len))
+		goto out;
+
+	if (copy_from_user(&new_ctrl,
+			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
+			sizeof new_ctrl))
+		goto out;
+
+	if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) ||
+	    (offset < PCI_MSIX_ENTRY_DATA && len == 8))
+		ret = -ENOTSYNC;
+	if (old_ctrl == new_ctrl)
+		goto out;
+	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+			(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
+	else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
+			!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT))
+		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
+	if (r || ret)
+		ret = -ENOTSYNC;
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return ret;
+}
+static const struct kvm_io_device_ops msix_mmio_table_ops = {
+	.read     = msix_table_mmio_read,
+	.write    = msix_table_mmio_write,
+};
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops);
+	mutex_init(&kvm->msix_mmio_dev.lock);
+	kvm->msix_mmio_dev.kvm = kvm;
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm)
+{
+	int ret;
+
+	mutex_lock(&kvm->slots_lock);
+	ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
+				      &kvm->msix_mmio_dev.table_dev);
+	mutex_unlock(&kvm->slots_lock);
+	return ret;
+}
+
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio_user)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	struct kvm_msix_mmio *mmio = NULL;
+	int r = 0, i;
+
+	mutex_lock(&mmio_dev->lock);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id &&
+		    (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+		    (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
+			mmio = &mmio_dev->mmio[i];
+			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
+				r = -EINVAL;
+				goto out;
+			}
+			break;
+		}
+	}
+	if (!mmio) {
+		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
+			r = -ENOSPC;
+			goto out;
+		}
+		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr];
+		mmio_dev->mmio_nr++;
+	}
+	mmio->max_entries_nr = mmio_user->max_entries_nr;
+	mmio->dev_id = mmio_user->dev_id;
+	mmio->flags = mmio_user->flags;
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
+			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
+		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
+
+	if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
+			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
+		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
+		mmio->table_base_addr = mmio_user->base_addr;
+		mmio->table_base_va = mmio_user->base_va;
+	} else if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
+			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
+		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
+		mmio->pba_base_addr = mmio_user->base_addr;
+		mmio->pba_base_va = mmio_user->base_va;
+	}
+out:
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio)
+{
+	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
+	int r = 0, i, j;
+	bool found = 0;
+
+	mutex_lock(&mmio_dev->lock);
+	BUG_ON(mmio_dev->mmio_nr >= KVM_MSIX_MMIO_MAX);
+	for (i = 0; i < mmio_dev->mmio_nr; i++) {
+		if (mmio_dev->mmio[i].max_entries_nr != 0 &&
+		    mmio_dev->mmio[i].dev_id == mmio->dev_id &&
+		    mmio_dev->mmio[i].type == mmio->type) {
+			found = true;
+			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
+				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
+			mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0;
+			mmio_dev->mmio[mmio_dev->mmio_nr].type = 0;
+			mmio_dev->mmio_nr--;
+			break;
+		}
+	}
+	if (!found)
+		r = -EINVAL;
+	mutex_unlock(&mmio_dev->lock);
+	return r;
+}
+
diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h
new file mode 100644
index 0000000..87caa29
--- /dev/null
+++ b/virt/kvm/msix_mmio.h
@@ -0,0 +1,24 @@
+#ifndef __KVM_MSIX_MMIO_H__
+#define __KVM_MSIX_MMIO_H__
+/*
+ * MSI-X MMIO emulation
+ *
+ * Copyright (c) 2010 Intel Corporation
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Author:
+ *   Sheng Yang <sheng.yang@intel.com>
+ */
+
+#include <linux/pci.h>
+
+int kvm_register_msix_mmio_dev(struct kvm *kvm);
+int kvm_unregister_msix_mmio_dev(struct kvm *kvm);
+int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
+				    struct kvm_msix_mmio_user *mmio);
+int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm,
+				      struct kvm_msix_mmio_user *mmio);
+
+#endif
-- 
1.7.0.1


^ permalink raw reply related	[flat|nested] 32+ messages in thread

* Re: [PATCH 0/2 v6] MSI-X mask bit support for KVM
  2010-12-22  8:44 [PATCH 0/2 v6] MSI-X mask bit support for KVM Sheng Yang
  2010-12-22  8:44 ` [PATCH 1/2] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
  2010-12-22  8:44 ` [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel Sheng Yang
@ 2010-12-28  4:05 ` Sheng Yang
  2 siblings, 0 replies; 32+ messages in thread
From: Sheng Yang @ 2010-12-28  4:05 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm

On Wednesday 22 December 2010 16:44:53 Sheng Yang wrote:
> This patchset didn't include two PCI related patches which would be checked
> in through PCI subsystem.
> 
> Would add the API document soon.

Avi?

BTW, there is one compiling issue for the second patch, due to last minute clean 
up...

Would update it along with other comments.

--
regards
Yang, Sheng

> 
> Change from v5:
> Complete rewrote according to Avi's comments.
> 
> Sheng Yang (2):
>   KVM: Move struct kvm_io_device to kvm_host.h
>   KVM: Emulate MSI-X table and PBA in kernel
> 
>  arch/x86/kvm/Makefile    |    2 +-
>  arch/x86/kvm/x86.c       |    8 +-
>  include/linux/kvm.h      |   22 ++++
>  include/linux/kvm_host.h |   48 +++++++++
>  virt/kvm/assigned-dev.c  |   30 ++++++
>  virt/kvm/iodev.h         |   25 +-----
>  virt/kvm/kvm_main.c      |   38 +++++++-
>  virt/kvm/msix_mmio.c     |  244
> ++++++++++++++++++++++++++++++++++++++++++++++ 
virt/kvm/msix_mmio.h     | 
>  24 +++++
>  9 files changed, 410 insertions(+), 31 deletions(-)
>  create mode 100644 virt/kvm/msix_mmio.c
>  create mode 100644 virt/kvm/msix_mmio.h

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-22  8:44 ` [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel Sheng Yang
@ 2010-12-28 12:26   ` Avi Kivity
  2010-12-29  7:18     ` Sheng Yang
  0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-12-28 12:26 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm, Alex Williamson

On 12/22/2010 10:44 AM, Sheng Yang wrote:
> Then we can support mask bit operation of assigned devices now.
>
>
> @@ -3817,14 +3819,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
>
>   mmio:
>   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
>   	/*
>   	 * Is this MMIO handled locally?
>   	 */
> -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> +	if (!r)
>   		return X86EMUL_CONTINUE;
>
>   	vcpu->mmio_needed = 1;
> -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;

This isn't very pretty, exit_reason should be written in 
vcpu_mmio_write().  I guess we can refactor it later.

>
> +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> +
> +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> +
> +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00

Any explanation of these?

> +struct kvm_msix_mmio_user {
> +	__u32 dev_id;
> +	__u16 type;
> +	__u16 max_entries_nr;
> +	__u64 base_addr;
> +	__u64 base_va;
> +	__u64 flags;
> +	__u64 reserved[4];
> +};
> +
>
>
> +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> +				int assigned_dev_id, int entry, u32 flag)
> +{

Need a better name for 'flag' (and make it a bool).

> +	int r = -EFAULT;
> +	struct kvm_assigned_dev_kernel *adev;
> +	int i;
> +
> +	if (!irqchip_in_kernel(kvm))
> +		return r;
> +
> +	mutex_lock(&kvm->lock);
> +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> +				      assigned_dev_id);
> +	if (!adev)
> +		goto out;
> +
> +	for (i = 0; i<  adev->entries_nr; i++)
> +		if (adev->host_msix_entries[i].entry == entry) {
> +			if (flag)
> +				disable_irq_nosync(
> +					adev->host_msix_entries[i].vector);
> +			else
> +				enable_irq(adev->host_msix_entries[i].vector);
> +			r = 0;
> +			break;
> +		}
> +out:
> +	mutex_unlock(&kvm->lock);
> +	return r;
> +}
>
> @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
>   		return r;
>   	}
>   #endif
> +	r = kvm_register_msix_mmio_dev(kvm);
> +	if (r<  0) {
> +		kvm_put_kvm(kvm);
> +		return r;
> +	}

Shouldn't this be part of individual KVM_REGISTER_MSIX_MMIO calls?

> +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
> +				void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, ret = 0, entry, offset, r;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx<  0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr&  0x3) || (len != 4&&  len != 8))
> +		goto out;

What about (addr & 4) && (len == 8)? Is it supported? It may cross entry 
boundaries.

> +	mmio =&mmio_dev->mmio[idx];
> +
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	offset = addr&  0xf;
> +	r = copy_from_user(val, (void *)(mmio->table_base_va +
> +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);


> +	if (r)
> +		goto out;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}
> +
> +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> +				int len, const void *val)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =
> +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> +	struct kvm_msix_mmio *mmio;
> +	int idx, entry, offset, ret = 0, r = 0;
> +	gpa_t entry_base;
> +	u32 old_ctrl, new_ctrl;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	idx = get_mmio_table_index(mmio_dev, addr, len);
> +	if (idx<  0) {
> +		ret = -EOPNOTSUPP;
> +		goto out;
> +	}
> +	if ((addr&  0x3) || (len != 4&&  len != 8))
> +		goto out;
> +	mmio =&mmio_dev->mmio[idx];
> +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> +	offset = addr&  0xF;
> +
> +	if (copy_from_user(&old_ctrl,
> +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> +			sizeof old_ctrl))
> +		goto out;

get_user() is easier.

> +
> +	/* No allow writing to other fields when entry is unmasked */
> +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> +		goto out;
> +
> +	if (copy_to_user(entry_base + offset, val, len))
> +		goto out;

> +
> +	if (copy_from_user(&new_ctrl,
> +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> +			sizeof new_ctrl))
> +		goto out;

put_user()

> +
> +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> +		ret = -ENOTSYNC;
> +	if (old_ctrl == new_ctrl)
> +		goto out;
> +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> +	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> +			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> +	if (r || ret)
> +		ret = -ENOTSYNC;
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return ret;
> +}

blank line...

> +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> +	.read     = msix_table_mmio_read,
> +	.write    = msix_table_mmio_write,
> +};
> +
> ++
> +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> +				    struct kvm_msix_mmio_user *mmio_user)
> +{
> +	struct kvm_msix_mmio_dev *mmio_dev =&kvm->msix_mmio_dev;
> +	struct kvm_msix_mmio *mmio = NULL;
> +	int r = 0, i;
> +
> +	mutex_lock(&mmio_dev->lock);
> +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id&&
> +		    (mmio_dev->mmio[i].type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +		    (mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> +			mmio =&mmio_dev->mmio[i];
> +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> +				r = -EINVAL;
> +				goto out;
> +			}
> +			break;
> +		}
> +	}
> +	if (!mmio) {
> +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> +			r = -ENOSPC;
> +			goto out;
> +		}
> +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> +		mmio_dev->mmio_nr++;
> +	}
> +	mmio->max_entries_nr = mmio_user->max_entries_nr;

Sanity check to avoid overflow.

> +	mmio->dev_id = mmio_user->dev_id;
> +	mmio->flags = mmio_user->flags;

Check for unsupported bits (all of them at present?)

> +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> +
> +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> +		mmio->table_base_addr = mmio_user->base_addr;
> +		mmio->table_base_va = mmio_user->base_va;

Check for va in kernel space.

> +	} else if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> +		mmio->pba_base_addr = mmio_user->base_addr;
> +		mmio->pba_base_va = mmio_user->base_va;
> +	}
> +out:
> +	mutex_unlock(&mmio_dev->lock);
> +	return r;
> +}
> +
> +

In all, looks reasonable.  I'd like to see documentation for it, and 
review from the pci people.  Alex, mst?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-28 12:26   ` Avi Kivity
@ 2010-12-29  7:18     ` Sheng Yang
  2010-12-29  8:31       ` Michael S. Tsirkin
  0 siblings, 1 reply; 32+ messages in thread
From: Sheng Yang @ 2010-12-29  7:18 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Marcelo Tosatti, Michael S. Tsirkin, kvm, Alex Williamson

On Tuesday 28 December 2010 20:26:13 Avi Kivity wrote:
> On 12/22/2010 10:44 AM, Sheng Yang wrote:
> > Then we can support mask bit operation of assigned devices now.
> > 
> > 
> > @@ -3817,14 +3819,16 @@ static int
> > emulator_write_emulated_onepage(unsigned long addr,
> > 
> >   mmio:
> >   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > 
> > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > 
> >   	/*
> >   	
> >   	 * Is this MMIO handled locally?
> >   	 */
> > 
> > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > +	if (!r)
> > 
> >   		return X86EMUL_CONTINUE;
> >   	
> >   	vcpu->mmio_needed = 1;
> > 
> > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> 
> This isn't very pretty, exit_reason should be written in
> vcpu_mmio_write().  I guess we can refactor it later.

Sure.
> 
> > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > +
> > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> > +
> > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> 
> Any explanation of these?

I chose to use the assigned device id instead of a specific table id, because every 
device should get at most one MSI MMIO (the same should apply to vfio devices as 
well), and if we used a specific table ID, we would need a way to associate it with the 
device anyway, to perform mask/unmask or other operations. So I think it's better to 
use the device ID here directly. 

And as for the table and PBA addresses, it's because the mapping in userspace may learn 
the guest MSI-X table address and PBA address at different times (due to different 
BARs; refer to the code in assigned_dev_iomem_map() of qemu). So I proposed this 
API to allow each of them to be passed to kernel space individually.
> 
> > +struct kvm_msix_mmio_user {
> > +	__u32 dev_id;
> > +	__u16 type;
> > +	__u16 max_entries_nr;
> > +	__u64 base_addr;
> > +	__u64 base_va;
> > +	__u64 flags;
> > +	__u64 reserved[4];
> > +};
> > +
> > 
> > 
> > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > +				int assigned_dev_id, int entry, u32 flag)
> > +{
> 
> Need a better name for 'flag' (and make it a bool).
> 
> > +	int r = -EFAULT;
> > +	struct kvm_assigned_dev_kernel *adev;
> > +	int i;
> > +
> > +	if (!irqchip_in_kernel(kvm))
> > +		return r;
> > +
> > +	mutex_lock(&kvm->lock);
> > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > +				      assigned_dev_id);
> > +	if (!adev)
> > +		goto out;
> > +
> > +	for (i = 0; i<  adev->entries_nr; i++)
> > +		if (adev->host_msix_entries[i].entry == entry) {
> > +			if (flag)
> > +				disable_irq_nosync(
> > +					adev->host_msix_entries[i].vector);
> > +			else
> > +				enable_irq(adev->host_msix_entries[i].vector);
> > +			r = 0;
> > +			break;
> > +		}
> > +out:
> > +	mutex_unlock(&kvm->lock);
> > +	return r;
> > +}
> > 
> > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > 
> >   		return r;
> >   	
> >   	}
> >   
> >   #endif
> > 
> > +	r = kvm_register_msix_mmio_dev(kvm);
> > +	if (r<  0) {
> > +		kvm_put_kvm(kvm);
> > +		return r;
> > +	}
> 
> Shouldn't this be part of individual KVM_REGISTER_MSIX_MMIO calls?

In fact this MMIO device is more like a global one for the VM, not one for every 
device. It should handle all MMIO from all MSI-X enabled devices, so I put it in 
the VM init/destroy process.

> > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr,
> > int len, +				void *val)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, ret = 0, entry, offset, r;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx<  0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > +		goto out;
> 
> What about (addr & 4) && (len == 8)? Is it supported? It may cross entry
> boundaries.

It should not be supported. But I haven't found wording in the PCI spec for it, so I 
didn't add this check.
> 
> > +	mmio =&mmio_dev->mmio[idx];
> > +
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	offset = addr&  0xf;
> > +	r = copy_from_user(val, (void *)(mmio->table_base_va +
> > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > 
> > 
> > +	if (r)
> > +		goto out;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return ret;
> > +}
> > +
> > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > +				int len, const void *val)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev =
> > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > +	struct kvm_msix_mmio *mmio;
> > +	int idx, entry, offset, ret = 0, r = 0;
> > +	gpa_t entry_base;
> > +	u32 old_ctrl, new_ctrl;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > +	if (idx<  0) {
> > +		ret = -EOPNOTSUPP;
> > +		goto out;
> > +	}
> > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > +		goto out;
> > +	mmio =&mmio_dev->mmio[idx];
> > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > +	offset = addr&  0xF;
> > +
> > +	if (copy_from_user(&old_ctrl,
> > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > +			sizeof old_ctrl))
> > +		goto out;
> 
> get_user() is easier.
> 
> > +
> > +	/* No allow writing to other fields when entry is unmasked */
> > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > +		goto out;
> > +
> > +	if (copy_to_user(entry_base + offset, val, len))
> > +		goto out;
> > 
> > +
> > +	if (copy_from_user(&new_ctrl,
> > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > +			sizeof new_ctrl))
> > +		goto out;
> 
> put_user()
> 
> > +
> > +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> > +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> > +		ret = -ENOTSYNC;
> > +	if (old_ctrl == new_ctrl)
> > +		goto out;
> > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > +	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > +			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > +	if (r || ret)
> > +		ret = -ENOTSYNC;
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return ret;
> > +}
> 
> blank line...
> 
> > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > +	.read     = msix_table_mmio_read,
> > +	.write    = msix_table_mmio_write,
> > +};
> > +
> > ++
> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > +				    struct kvm_msix_mmio_user *mmio_user)
> > +{
> > +	struct kvm_msix_mmio_dev *mmio_dev =&kvm->msix_mmio_dev;
> > +	struct kvm_msix_mmio *mmio = NULL;
> > +	int r = 0, i;
> > +
> > +	mutex_lock(&mmio_dev->lock);
> > +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id&&
> > +		    (mmio_dev->mmio[i].type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +		    (mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > +			mmio =&mmio_dev->mmio[i];
> > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > +				r = -EINVAL;
> > +				goto out;
> > +			}
> > +			break;
> > +		}
> > +	}
> > +	if (!mmio) {
> > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > +			r = -ENOSPC;
> > +			goto out;
> > +		}
> > +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> > +		mmio_dev->mmio_nr++;
> > +	}
> > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> 
> Sanity check to avoid overflow.
> 
> > +	mmio->dev_id = mmio_user->dev_id;
> > +	mmio->flags = mmio_user->flags;
> 
> Check for unsupported bits (all of them at present?)
> 
> > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > +
> > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > +		mmio->table_base_addr = mmio_user->base_addr;
> > +		mmio->table_base_va = mmio_user->base_va;
> 
> Check for va in kernel space.
> 
> > +	} else if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> > +		mmio->pba_base_addr = mmio_user->base_addr;
> > +		mmio->pba_base_va = mmio_user->base_va;
> > +	}
> > +out:
> > +	mutex_unlock(&mmio_dev->lock);
> > +	return r;
> > +}
> > +
> > +
> 
> In all, looks reasonable.  I'd like to see documentation for it, and
> review from the pci people.  Alex, mst?

Would add the API document soon.

--
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-29  7:18     ` Sheng Yang
@ 2010-12-29  8:31       ` Michael S. Tsirkin
  2010-12-29  8:55         ` Sheng Yang
  0 siblings, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2010-12-29  8:31 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Wed, Dec 29, 2010 at 03:18:13PM +0800, Sheng Yang wrote:
> On Tuesday 28 December 2010 20:26:13 Avi Kivity wrote:
> > On 12/22/2010 10:44 AM, Sheng Yang wrote:
> > > Then we can support mask bit operation of assigned devices now.
> > > 
> > > 
> > > @@ -3817,14 +3819,16 @@ static int
> > > emulator_write_emulated_onepage(unsigned long addr,
> > > 
> > >   mmio:
> > >   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > 
> > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > 
> > >   	/*
> > >   	
> > >   	 * Is this MMIO handled locally?
> > >   	 */
> > > 
> > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > +	if (!r)
> > > 
> > >   		return X86EMUL_CONTINUE;
> > >   	
> > >   	vcpu->mmio_needed = 1;
> > > 
> > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > 
> > This isn't very pretty, exit_reason should be written in
> > vcpu_mmio_write().  I guess we can refactor it later.
> 
> Sure.
> > 
> > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > > +
> > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > > +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> > > +
> > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > 
> > Any explanation of these?
> 
> I chose to use assigned device id instead of one specific table id, because every 
> device should got at most one MSI MMIO(the same should applied to vfio device as 
> well), and if we use specific table ID, we need way to associate with the device 
> anyway, to perform mask/unmask or other operation. So I think it's better to use 
> device ID here directly. 

Table id will be needed to make things work for emulated devices.

My idea was this: we have the device id in kvm_assigned_msix_entry already.
Just put table id and entry number in kvm_irq_routing_entry (create
a new gsi type for this).
The result will also work for irqfd because these are mapped to gsi.


> And for the table and pba address, it's due to the mapping in userspace may know 
> the guest MSI-X table address and PBA address at different time(due to different 
> BAR, refer to the code in assigned_dev_iomem_map() of qemu). So I purposed this 
> API to allow each of them can be passed to kernel space individually.
> > 
> > > +struct kvm_msix_mmio_user {
> > > +	__u32 dev_id;
> > > +	__u16 type;
> > > +	__u16 max_entries_nr;
> > > +	__u64 base_addr;
> > > +	__u64 base_va;
> > > +	__u64 flags;
> > > +	__u64 reserved[4];
> > > +};
> > > +
> > > 
> > > 
> > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > +				int assigned_dev_id, int entry, u32 flag)
> > > +{
> > 
> > Need a better name for 'flag' (and make it a bool).
> > 
> > > +	int r = -EFAULT;
> > > +	struct kvm_assigned_dev_kernel *adev;
> > > +	int i;
> > > +
> > > +	if (!irqchip_in_kernel(kvm))
> > > +		return r;
> > > +
> > > +	mutex_lock(&kvm->lock);
> > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > +				      assigned_dev_id);
> > > +	if (!adev)
> > > +		goto out;
> > > +
> > > +	for (i = 0; i<  adev->entries_nr; i++)
> > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > +			if (flag)
> > > +				disable_irq_nosync(
> > > +					adev->host_msix_entries[i].vector);
> > > +			else
> > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > +			r = 0;
> > > +			break;
> > > +		}
> > > +out:
> > > +	mutex_unlock(&kvm->lock);
> > > +	return r;
> > > +}
> > > 
> > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > 
> > >   		return r;
> > >   	
> > >   	}
> > >   
> > >   #endif
> > > 
> > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > +	if (r<  0) {
> > > +		kvm_put_kvm(kvm);
> > > +		return r;
> > > +	}
> > 
> > Shouldn't this be part of individual KVM_REGISTER_MSIX_MMIO calls?
> 
> In fact this MMIO device is more like global one for the VM, not for every 
> devices. It should handle all MMIO from all MSI-X enabled devices, so I put it in 
> the VM init/destroy process.
> 
> > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr,
> > > int len, +				void *val)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > +	struct kvm_msix_mmio *mmio;
> > > +	int idx, ret = 0, entry, offset, r;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > +	if (idx<  0) {
> > > +		ret = -EOPNOTSUPP;
> > > +		goto out;
> > > +	}
> > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > +		goto out;
> > 
> > What about (addr & 4) && (len == 8)? Is it supported? It may cross entry
> > boundaries.
> 
> Should not supported. But I haven't found words on the PCI spec for it. So I 
> didn't add this check.

IMPLEMENTATION NOTE
MSI-X Memory Space Structures in Read/Write Memory

....

For all accesses to MSI-X Table and MSI-X PBA fields, software must use
aligned full
DWORD or aligned full QWORD transactions; otherwise, the result is
undefined.


> > 
> > > +	mmio =&mmio_dev->mmio[idx];
> > > +
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	offset = addr&  0xf;
> > > +	r = copy_from_user(val, (void *)(mmio->table_base_va +
> > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > 
> > > 
> > > +	if (r)
> > > +		goto out;
> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return ret;
> > > +}
> > > +
> > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr,
> > > +				int len, const void *val)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > +	struct kvm_msix_mmio *mmio;
> > > +	int idx, entry, offset, ret = 0, r = 0;
> > > +	gpa_t entry_base;
> > > +	u32 old_ctrl, new_ctrl;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > +	if (idx<  0) {
> > > +		ret = -EOPNOTSUPP;
> > > +		goto out;
> > > +	}
> > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > +		goto out;
> > > +	mmio =&mmio_dev->mmio[idx];
> > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > +	offset = addr&  0xF;
> > > +
> > > +	if (copy_from_user(&old_ctrl,
> > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > +			sizeof old_ctrl))
> > > +		goto out;
> > 
> > get_user() is easier.
> > 
> > > +
> > > +	/* No allow writing to other fields when entry is unmasked */
> > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > +		goto out;
> > > +
> > > +	if (copy_to_user(entry_base + offset, val, len))
> > > +		goto out;
> > > 
> > > +
> > > +	if (copy_from_user(&new_ctrl,
> > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > +			sizeof new_ctrl))
> > > +		goto out;
> > 
> > put_user()
> > 
> > > +
> > > +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> > > +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> > > +		ret = -ENOTSYNC;
> > > +	if (old_ctrl == new_ctrl)
> > > +		goto out;
> > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > +	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > +			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > +	if (r || ret)
> > > +		ret = -ENOTSYNC;
> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return ret;
> > > +}
> > 
> > blank line...
> > 
> > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > +	.read     = msix_table_mmio_read,
> > > +	.write    = msix_table_mmio_write,
> > > +};
> > > +
> > > ++
> > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > +{
> > > +	struct kvm_msix_mmio_dev *mmio_dev =&kvm->msix_mmio_dev;
> > > +	struct kvm_msix_mmio *mmio = NULL;
> > > +	int r = 0, i;
> > > +
> > > +	mutex_lock(&mmio_dev->lock);
> > > +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id&&
> > > +		    (mmio_dev->mmio[i].type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > +		    (mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > +			mmio =&mmio_dev->mmio[i];
> > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > > +				r = -EINVAL;
> > > +				goto out;
> > > +			}
> > > +			break;
> > > +		}
> > > +	}
> > > +	if (!mmio) {
> > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > +			r = -ENOSPC;
> > > +			goto out;
> > > +		}
> > > +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> > > +		mmio_dev->mmio_nr++;
> > > +	}
> > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > 
> > Sanity check to avoid overflow.
> > 
> > > +	mmio->dev_id = mmio_user->dev_id;
> > > +	mmio->flags = mmio_user->flags;
> > 
> > Check for unsupported bits (all of them at present?)
> > 
> > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > +
> > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > +		mmio->table_base_va = mmio_user->base_va;
> > 
> > Check for va in kernel space.
> > 
> > > +	} else if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> > > +		mmio->pba_base_addr = mmio_user->base_addr;
> > > +		mmio->pba_base_va = mmio_user->base_va;
> > > +	}
> > > +out:
> > > +	mutex_unlock(&mmio_dev->lock);
> > > +	return r;
> > > +}
> > > +
> > > +
> > 
> > In all, looks reasonable.  I'd like to see documentation for it, and
> > review from the pci people.  Alex, mst?

Some general comments:
PBA isn't supported in this version, which is OK, but let's not add a
capability until it is, and let's not try to guess what
the interface will look like. I think keeping PBA in userspace will be hard
because it needs to be modified from interrupt context.
Removing the PBA stub will make the interface simpler.

> Would add the API document soon.
> 
> --
> regards
> Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-29  8:31       ` Michael S. Tsirkin
@ 2010-12-29  8:55         ` Sheng Yang
  2010-12-29  9:28           ` Michael S. Tsirkin
  0 siblings, 1 reply; 32+ messages in thread
From: Sheng Yang @ 2010-12-29  8:55 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Wednesday 29 December 2010 16:31:35 Michael S. Tsirkin wrote:
> On Wed, Dec 29, 2010 at 03:18:13PM +0800, Sheng Yang wrote:
> > On Tuesday 28 December 2010 20:26:13 Avi Kivity wrote:
> > > On 12/22/2010 10:44 AM, Sheng Yang wrote:
> > > > Then we can support mask bit operation of assigned devices now.
> > > > 
> > > > 
> > > > @@ -3817,14 +3819,16 @@ static int
> > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > 
> > > >   mmio:
> > > >   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > 
> > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > 
> > > >   	/*
> > > >   	
> > > >   	 * Is this MMIO handled locally?
> > > >   	 */
> > > > 
> > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > +	if (!r)
> > > > 
> > > >   		return X86EMUL_CONTINUE;
> > > >   	
> > > >   	vcpu->mmio_needed = 1;
> > > > 
> > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > 
> > > This isn't very pretty, exit_reason should be written in
> > > vcpu_mmio_write().  I guess we can refactor it later.
> > 
> > Sure.
> > 
> > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > > > +
> > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > > > +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> > > > +
> > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > 
> > > Any explanation of these?
> > 
> > I chose to use assigned device id instead of one specific table id,
> > because every device should got at most one MSI MMIO(the same should
> > applied to vfio device as well), and if we use specific table ID, we
> > need way to associate with the device anyway, to perform mask/unmask or
> > other operation. So I think it's better to use device ID here directly.
> 
> Table id will be needed to make things work for emulated devices.

I suppose even an emulated device should have some kind of id (BDF)? I think that is 
enough for identification, and it is already there, so we don't need to allocate 
another ID for the device — because one device would have at most one MSI-X MMIO, 
using the BDF or another device-specific ID should be quite straightforward.
> 
> My idea was this: we have the device id in kvm_assigned_msix_entry already.
> Just put table id and entry number in kvm_irq_routing_entry (create
> a new gsi type for this).
> The result will also work for irqfd because these are mapped to gsi.
> 
> > And for the table and pba address, it's due to the mapping in userspace
> > may know the guest MSI-X table address and PBA address at different
> > time(due to different BAR, refer to the code in assigned_dev_iomem_map()
> > of qemu). So I purposed this API to allow each of them can be passed to
> > kernel space individually.
> > 
> > > > +struct kvm_msix_mmio_user {
> > > > +	__u32 dev_id;
> > > > +	__u16 type;
> > > > +	__u16 max_entries_nr;
> > > > +	__u64 base_addr;
> > > > +	__u64 base_va;
> > > > +	__u64 flags;
> > > > +	__u64 reserved[4];
> > > > +};
> > > > +
> > > > 
> > > > 
> > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > +				int assigned_dev_id, int entry, u32 flag)
> > > > +{
> > > 
> > > Need a better name for 'flag' (and make it a bool).
> > > 
> > > > +	int r = -EFAULT;
> > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > +	int i;
> > > > +
> > > > +	if (!irqchip_in_kernel(kvm))
> > > > +		return r;
> > > > +
> > > > +	mutex_lock(&kvm->lock);
> > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > +				      assigned_dev_id);
> > > > +	if (!adev)
> > > > +		goto out;
> > > > +
> > > > +	for (i = 0; i<  adev->entries_nr; i++)
> > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > +			if (flag)
> > > > +				disable_irq_nosync(
> > > > +					adev->host_msix_entries[i].vector);
> > > > +			else
> > > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > > +			r = 0;
> > > > +			break;
> > > > +		}
> > > > +out:
> > > > +	mutex_unlock(&kvm->lock);
> > > > +	return r;
> > > > +}
> > > > 
> > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > > 
> > > >   		return r;
> > > >   	
> > > >   	}
> > > >   
> > > >   #endif
> > > > 
> > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > +	if (r<  0) {
> > > > +		kvm_put_kvm(kvm);
> > > > +		return r;
> > > > +	}
> > > 
> > > Shouldn't this be part of individual KVM_REGISTER_MSIX_MMIO calls?
> > 
> > In fact this MMIO device is more like global one for the VM, not for
> > every devices. It should handle all MMIO from all MSI-X enabled devices,
> > so I put it in the VM init/destroy process.
> > 
> > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t
> > > > addr, int len, +				void *val)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > +	struct kvm_msix_mmio *mmio;
> > > > +	int idx, ret = 0, entry, offset, r;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > +	if (idx<  0) {
> > > > +		ret = -EOPNOTSUPP;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > +		goto out;
> > > 
> > > What about (addr & 4) && (len == 8)? Is it supported? It may cross
> > > entry boundaries.
> > 
> > Should not supported. But I haven't found words on the PCI spec for it.
> > So I didn't add this check.
> 
> IMPLEMENTATION NOTE
> MSI-X Memory Space Structures in Read/Write Memory
> 
> ....
> 
> For all accesses to MSI-X Table and MSI-X PBA fields, software must use
> aligned full
> DWORD or aligned full QWORD transactions; otherwise, the result is
> undefined.

Yes, this one is enough, I would add the checking.
> 
> > > > +	mmio =&mmio_dev->mmio[idx];
> > > > +
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	offset = addr&  0xf;
> > > > +	r = copy_from_user(val, (void *)(mmio->table_base_va +
> > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > 
> > > > 
> > > > +	if (r)
> > > > +		goto out;
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return ret;
> > > > +}
> > > > +
> > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t
> > > > addr, +				int len, const void *val)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > +	struct kvm_msix_mmio *mmio;
> > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > +	gpa_t entry_base;
> > > > +	u32 old_ctrl, new_ctrl;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > +	if (idx<  0) {
> > > > +		ret = -EOPNOTSUPP;
> > > > +		goto out;
> > > > +	}
> > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > +		goto out;
> > > > +	mmio =&mmio_dev->mmio[idx];
> > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > +	offset = addr&  0xF;
> > > > +
> > > > +	if (copy_from_user(&old_ctrl,
> > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > +			sizeof old_ctrl))
> > > > +		goto out;
> > > 
> > > get_user() is easier.
> > > 
> > > > +
> > > > +	/* No allow writing to other fields when entry is unmasked */
> > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > +		goto out;
> > > > +
> > > > +	if (copy_to_user(entry_base + offset, val, len))
> > > > +		goto out;
> > > > 
> > > > +
> > > > +	if (copy_from_user(&new_ctrl,
> > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > +			sizeof new_ctrl))
> > > > +		goto out;
> > > 
> > > put_user()
> > > 
> > > > +
> > > > +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> > > > +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> > > > +		ret = -ENOTSYNC;
> > > > +	if (old_ctrl == new_ctrl)
> > > > +		goto out;
> > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > > +	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > +			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > > +	if (r || ret)
> > > > +		ret = -ENOTSYNC;
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return ret;
> > > > +}
> > > 
> > > blank line...
> > > 
> > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > > +	.read     = msix_table_mmio_read,
> > > > +	.write    = msix_table_mmio_write,
> > > > +};
> > > > +
> > > > ++
> > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > +{
> > > > +	struct kvm_msix_mmio_dev *mmio_dev =&kvm->msix_mmio_dev;
> > > > +	struct kvm_msix_mmio *mmio = NULL;
> > > > +	int r = 0, i;
> > > > +
> > > > +	mutex_lock(&mmio_dev->lock);
> > > > +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id&&
> > > > +		    (mmio_dev->mmio[i].type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > +		    (mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > +			mmio =&mmio_dev->mmio[i];
> > > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > > > +				r = -EINVAL;
> > > > +				goto out;
> > > > +			}
> > > > +			break;
> > > > +		}
> > > > +	}
> > > > +	if (!mmio) {
> > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > +			r = -ENOSPC;
> > > > +			goto out;
> > > > +		}
> > > > +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > +		mmio_dev->mmio_nr++;
> > > > +	}
> > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > 
> > > Sanity check to avoid overflow.
> > > 
> > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > +	mmio->flags = mmio_user->flags;
> > > 
> > > Check for unsupported bits (all of them at present?)
> > > 
> > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > +
> > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > +		mmio->table_base_va = mmio_user->base_va;
> > > 
> > > Check for va in kernel space.
> > > 
> > > > +	} else if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> > > > +		mmio->pba_base_addr = mmio_user->base_addr;
> > > > +		mmio->pba_base_va = mmio_user->base_va;
> > > > +	}
> > > > +out:
> > > > +	mutex_unlock(&mmio_dev->lock);
> > > > +	return r;
> > > > +}
> > > > +
> > > > +
> > > 
> > > In all, looks reasonable.  I'd like to see documentation for it, and
> > > review from the pci people.  Alex, mst?
> 
> Some general comments:
> PBA isn't supported in this version, which is OK, but let's not add a
> capability until it is, and let's not try to guess what
> the interface will look like. I think keeping PBA in userspace will be hard
> because it needs to be modified from interrupt context.
> Removing the PBA stub will make the interface simpler.

The API only gets the PBA address now, which should be fine. And we still have 
threaded irqs and tasklets for accessing userspace from the interrupt handler...

--
regards
Yang, Sheng
 
> > Would add the API document soon.
> > 
> > --
> > regards
> > Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-29  8:55         ` Sheng Yang
@ 2010-12-29  9:28           ` Michael S. Tsirkin
  2010-12-30  7:32             ` Sheng Yang
  0 siblings, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2010-12-29  9:28 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Wed, Dec 29, 2010 at 04:55:19PM +0800, Sheng Yang wrote:
> On Wednesday 29 December 2010 16:31:35 Michael S. Tsirkin wrote:
> > On Wed, Dec 29, 2010 at 03:18:13PM +0800, Sheng Yang wrote:
> > > On Tuesday 28 December 2010 20:26:13 Avi Kivity wrote:
> > > > On 12/22/2010 10:44 AM, Sheng Yang wrote:
> > > > > Then we can support mask bit operation of assigned devices now.
> > > > > 
> > > > > 
> > > > > @@ -3817,14 +3819,16 @@ static int
> > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > 
> > > > >   mmio:
> > > > >   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > > 
> > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > 
> > > > >   	/*
> > > > >   	
> > > > >   	 * Is this MMIO handled locally?
> > > > >   	 */
> > > > > 
> > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > +	if (!r)
> > > > > 
> > > > >   		return X86EMUL_CONTINUE;
> > > > >   	
> > > > >   	vcpu->mmio_needed = 1;
> > > > > 
> > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > > 
> > > > This isn't very pretty, exit_reason should be written in
> > > > vcpu_mmio_write().  I guess we can refactor it later.
> > > 
> > > Sure.
> > > 
> > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > > > > +
> > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> > > > > +
> > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > 
> > > > Any explanation of these?
> > > 
> > > I chose to use assigned device id instead of one specific table id,
> > > because every device should got at most one MSI MMIO(the same should
> > > applied to vfio device as well), and if we use specific table ID, we
> > > need way to associate with the device anyway, to perform mask/unmask or
> > > other operation. So I think it's better to use device ID here directly.
> > 
> > Table id will be needed to make things work for emulated devices.
> 
> I suppose even emulated device should got some kind of id(BDF)?

Not that I know. Look at how irqfd is defined for example,
or how interrupts are sent through a gsi.
I would like to make the interface be able to support that.

> I think that is 
> enough for identification, which is already there, so we don't need to allocate 
> another ID for the device - because one device would got at most one MSI-X MMIO, 
> then use BDF or other device specific ID should be quite straightforward.

So you propose allocating ids for emulated devices?
OK. How will we map e.g. irqfds to these?

> > 
> > My idea was this: we have the device id in kvm_assigned_msix_entry already.
> > Just put table id and entry number in kvm_irq_routing_entry (create
> > a new gsi type for this).
> > The result will also work for irqfd because these are mapped to gsi.
> > 
> > > And for the table and pba address, it's due to the mapping in userspace
> > > may know the guest MSI-X table address and PBA address at different
> > > time(due to different BAR, refer to the code in assigned_dev_iomem_map()
> > > of qemu). So I purposed this API to allow each of them can be passed to
> > > kernel space individually.
> > > 
> > > > > +struct kvm_msix_mmio_user {
> > > > > +	__u32 dev_id;
> > > > > +	__u16 type;
> > > > > +	__u16 max_entries_nr;
> > > > > +	__u64 base_addr;
> > > > > +	__u64 base_va;
> > > > > +	__u64 flags;
> > > > > +	__u64 reserved[4];
> > > > > +};
> > > > > +
> > > > > 
> > > > > 
> > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > > +				int assigned_dev_id, int entry, u32 flag)
> > > > > +{
> > > > 
> > > > Need a better name for 'flag' (and make it a bool).
> > > > 
> > > > > +	int r = -EFAULT;
> > > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > > +	int i;
> > > > > +
> > > > > +	if (!irqchip_in_kernel(kvm))
> > > > > +		return r;
> > > > > +
> > > > > +	mutex_lock(&kvm->lock);
> > > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > > +				      assigned_dev_id);
> > > > > +	if (!adev)
> > > > > +		goto out;
> > > > > +
> > > > > +	for (i = 0; i<  adev->entries_nr; i++)
> > > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > > +			if (flag)
> > > > > +				disable_irq_nosync(
> > > > > +					adev->host_msix_entries[i].vector);
> > > > > +			else
> > > > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > > > +			r = 0;
> > > > > +			break;
> > > > > +		}
> > > > > +out:
> > > > > +	mutex_unlock(&kvm->lock);
> > > > > +	return r;
> > > > > +}
> > > > > 
> > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > > > 
> > > > >   		return r;
> > > > >   	
> > > > >   	}
> > > > >   
> > > > >   #endif
> > > > > 
> > > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > > +	if (r<  0) {
> > > > > +		kvm_put_kvm(kvm);
> > > > > +		return r;
> > > > > +	}
> > > > 
> > > > Shouldn't this be part of individual KVM_REGISTER_MSIX_MMIO calls?
> > > 
> > > In fact this MMIO device is more like global one for the VM, not for
> > > every devices. It should handle all MMIO from all MSI-X enabled devices,
> > > so I put it in the VM init/destroy process.
> > > 
> > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t
> > > > > addr, int len, +				void *val)
> > > > > +{
> > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > +	struct kvm_msix_mmio *mmio;
> > > > > +	int idx, ret = 0, entry, offset, r;
> > > > > +
> > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > +	if (idx<  0) {
> > > > > +		ret = -EOPNOTSUPP;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > +		goto out;
> > > > 
> > > > What about (addr & 4) && (len == 8)? Is it supported? It may cross
> > > > entry boundaries.
> > > 
> > > Should not supported. But I haven't found words on the PCI spec for it.
> > > So I didn't add this check.
> > 
> > IMPLEMENTATION NOTE
> > MSI-X Memory Space Structures in Read/Write Memory
> > 
> > ....
> > 
> > For all accesses to MSI-X Table and MSI-X PBA fields, software must use
> > aligned full
> > DWORD or aligned full QWORD transactions; otherwise, the result is
> > undefined.
> 
> Yes, this one is enough, I would add the checking.
> > 
> > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > +
> > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > +	offset = addr&  0xf;
> > > > > +	r = copy_from_user(val, (void *)(mmio->table_base_va +
> > > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > > 
> > > > > 
> > > > > +	if (r)
> > > > > +		goto out;
> > > > > +out:
> > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > +	return ret;
> > > > > +}
> > > > > +
> > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t
> > > > > addr, +				int len, const void *val)
> > > > > +{
> > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > +	struct kvm_msix_mmio *mmio;
> > > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > > +	gpa_t entry_base;
> > > > > +	u32 old_ctrl, new_ctrl;
> > > > > +
> > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > +	if (idx<  0) {
> > > > > +		ret = -EOPNOTSUPP;
> > > > > +		goto out;
> > > > > +	}
> > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > +		goto out;
> > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > > +	offset = addr&  0xF;
> > > > > +
> > > > > +	if (copy_from_user(&old_ctrl,
> > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > +			sizeof old_ctrl))
> > > > > +		goto out;
> > > > 
> > > > get_user() is easier.
> > > > 
> > > > > +
> > > > > +	/* No allow writing to other fields when entry is unmasked */
> > > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > +		goto out;
> > > > > +
> > > > > +	if (copy_to_user(entry_base + offset, val, len))
> > > > > +		goto out;
> > > > > 
> > > > > +
> > > > > +	if (copy_from_user(&new_ctrl,
> > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > +			sizeof new_ctrl))
> > > > > +		goto out;
> > > > 
> > > > put_user()
> > > > 
> > > > > +
> > > > > +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> > > > > +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> > > > > +		ret = -ENOTSYNC;
> > > > > +	if (old_ctrl == new_ctrl)
> > > > > +		goto out;
> > > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > > > +	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > +			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > > > +	if (r || ret)
> > > > > +		ret = -ENOTSYNC;
> > > > > +out:
> > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > +	return ret;
> > > > > +}
> > > > 
> > > > blank line...
> > > > 
> > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > > > +	.read     = msix_table_mmio_read,
> > > > > +	.write    = msix_table_mmio_write,
> > > > > +};
> > > > > +
> > > > > ++
> > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > > +{
> > > > > +	struct kvm_msix_mmio_dev *mmio_dev =&kvm->msix_mmio_dev;
> > > > > +	struct kvm_msix_mmio *mmio = NULL;
> > > > > +	int r = 0, i;
> > > > > +
> > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> > > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id&&
> > > > > +		    (mmio_dev->mmio[i].type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > +		    (mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > > +			mmio =&mmio_dev->mmio[i];
> > > > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) {
> > > > > +				r = -EINVAL;
> > > > > +				goto out;
> > > > > +			}
> > > > > +			break;
> > > > > +		}
> > > > > +	}
> > > > > +	if (!mmio) {
> > > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > > +			r = -ENOSPC;
> > > > > +			goto out;
> > > > > +		}
> > > > > +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > > +		mmio_dev->mmio_nr++;
> > > > > +	}
> > > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > 
> > > > Sanity check to avoid overflow.
> > > > 
> > > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > > +	mmio->flags = mmio_user->flags;
> > > > 
> > > > Check for unsupported bits (all of them at present?)
> > > > 
> > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > > +
> > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > > +		mmio->table_base_va = mmio_user->base_va;
> > > > 
> > > > Check for va in kernel space.
> > > > 
> > > > > +	} else if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> > > > > +		mmio->pba_base_addr = mmio_user->base_addr;
> > > > > +		mmio->pba_base_va = mmio_user->base_va;
> > > > > +	}
> > > > > +out:
> > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > +	return r;
> > > > > +}
> > > > > +
> > > > > +
> > > > 
> > > > In all, looks reasonable.  I'd like to see documentation for it, and
> > > > review from the pci people.  Alex, mst?
> > 
> > Some general comments:
> > PBA isn't supported in this version, which is OK, but let's not add a
> > capability until it is, and let's not try to guess what
> > the interface will look like. I think keeping PBA in userspace will be hard
> > because it needs to be modified from interrupt context.
> > Removing the PBA stub will make the interface simpler.
> 
> The API only get the PBA address now which should be fine. And we still have 
> threaded irq and tasklet for accessing the userspace for interrupt handler...

I don't think it's going to work: we are not
in the context of the right process. Further
I think we should keep the option of
reading the PBA status from the device or host kernel open.
And generally having an interface
for functionality we don't implement is not a good idea:
you don't know whether you really can support the interface you promised.

> --
> regards
> Yang, Sheng
>  
> > > Would add the API document soon.
> > > 
> > > --
> > > regards
> > > Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-29  9:28           ` Michael S. Tsirkin
@ 2010-12-30  7:32             ` Sheng Yang
  2010-12-30  7:47               ` Michael S. Tsirkin
  2010-12-30  9:28               ` Avi Kivity
  0 siblings, 2 replies; 32+ messages in thread
From: Sheng Yang @ 2010-12-30  7:32 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Wednesday 29 December 2010 17:28:24 Michael S. Tsirkin wrote:
> On Wed, Dec 29, 2010 at 04:55:19PM +0800, Sheng Yang wrote:
> > On Wednesday 29 December 2010 16:31:35 Michael S. Tsirkin wrote:
> > > On Wed, Dec 29, 2010 at 03:18:13PM +0800, Sheng Yang wrote:
> > > > On Tuesday 28 December 2010 20:26:13 Avi Kivity wrote:
> > > > > On 12/22/2010 10:44 AM, Sheng Yang wrote:
> > > > > > Then we can support mask bit operation of assigned devices now.
> > > > > > 
> > > > > > 
> > > > > > @@ -3817,14 +3819,16 @@ static int
> > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > 
> > > > > >   mmio:
> > > > > >   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > > > 
> > > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > > 
> > > > > >   	/*
> > > > > >   	
> > > > > >   	 * Is this MMIO handled locally?
> > > > > >   	 */
> > > > > > 
> > > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > > +	if (!r)
> > > > > > 
> > > > > >   		return X86EMUL_CONTINUE;
> > > > > >   	
> > > > > >   	vcpu->mmio_needed = 1;
> > > > > > 
> > > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > > > 
> > > > > This isn't very pretty, exit_reason should be written in
> > > > > vcpu_mmio_write().  I guess we can refactor it later.
> > > > 
> > > > Sure.
> > > > 
> > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > > > > > +
> > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> > > > > > +
> > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > > 
> > > > > Any explanation of these?
> > > > 
> > > > I chose to use assigned device id instead of one specific table id,
> > > > because every device should got at most one MSI MMIO(the same should
> > > > applied to vfio device as well), and if we use specific table ID, we
> > > > need way to associate with the device anyway, to perform mask/unmask
> > > > or other operation. So I think it's better to use device ID here
> > > > directly.
> > > 
> > > Table id will be needed to make things work for emulated devices.
> > 
> > I suppose even emulated device should got some kind of id(BDF)?
> 
> Not that I know. Look at how irqfd is defined for example,
> or how interrupts are sent through a gsi.
> I would like to make the interface be able to support that.
>
> > I think that is
> > enough for identification, which is already there, so we don't need to
> > allocate another ID for the device - because one device would got at
> > most one MSI-X MMIO, then use BDF or other device specific ID should be
> > quite straightforward.
> 
> So you propose allocating ids for emulated devices?
> OK. How will we map e.g. irqfds to these?

I don't understand. I've checked virtio-pci.c, which is using irqfd, and it's still 
a PCI device, and it still has a BDF, right? 

Also, what we want is a way to identify the MSI-X MMIO. For assigned devices, we 
use the BDF, so we can easily identify the MMIO as well as the device. For others, 
even if they don't have a BDF (which I doubt, because MSI-X is part of PCI, and 
every PCI device has a BDF), what you need is an ID, no matter what it is or how it is 
defined. QEMU can handle the allocation, and the type field in this API can still 
tell which kind of ID/device it is, and then determine how to deal with it.

> 
> > > My idea was this: we have the device id in kvm_assigned_msix_entry
> > > already. Just put table id and entry number in kvm_irq_routing_entry
> > > (create a new gsi type for this).
> > > The result will also work for irqfd because these are mapped to gsi.
> > > 
> > > > And for the table and pba address, it's due to the mapping in
> > > > userspace may know the guest MSI-X table address and PBA address at
> > > > different time(due to different BAR, refer to the code in
> > > > assigned_dev_iomem_map() of qemu). So I purposed this API to allow
> > > > each of them can be passed to kernel space individually.
> > > > 
> > > > > > +struct kvm_msix_mmio_user {
> > > > > > +	__u32 dev_id;
> > > > > > +	__u16 type;
> > > > > > +	__u16 max_entries_nr;
> > > > > > +	__u64 base_addr;
> > > > > > +	__u64 base_va;
> > > > > > +	__u64 flags;
> > > > > > +	__u64 reserved[4];
> > > > > > +};
> > > > > > +
> > > > > > 
> > > > > > 
> > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > > > +				int assigned_dev_id, int entry, u32 flag)
> > > > > > +{
> > > > > 
> > > > > Need a better name for 'flag' (and make it a bool).
> > > > > 
> > > > > > +	int r = -EFAULT;
> > > > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > > > +	int i;
> > > > > > +
> > > > > > +	if (!irqchip_in_kernel(kvm))
> > > > > > +		return r;
> > > > > > +
> > > > > > +	mutex_lock(&kvm->lock);
> > > > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > > > +				      assigned_dev_id);
> > > > > > +	if (!adev)
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	for (i = 0; i<  adev->entries_nr; i++)
> > > > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > > > +			if (flag)
> > > > > > +				disable_irq_nosync(
> > > > > > +					adev->host_msix_entries[i].vector);
> > > > > > +			else
> > > > > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > > > > +			r = 0;
> > > > > > +			break;
> > > > > > +		}
> > > > > > +out:
> > > > > > +	mutex_unlock(&kvm->lock);
> > > > > > +	return r;
> > > > > > +}
> > > > > > 
> > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > > > > 
> > > > > >   		return r;
> > > > > >   	
> > > > > >   	}
> > > > > >   
> > > > > >   #endif
> > > > > > 
> > > > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > > > +	if (r<  0) {
> > > > > > +		kvm_put_kvm(kvm);
> > > > > > +		return r;
> > > > > > +	}
> > > > > 
> > > > > Shouldn't this be part of individual KVM_REGISTER_MSIX_MMIO calls?
> > > > 
> > > > In fact this MMIO device is more like global one for the VM, not for
> > > > every devices. It should handle all MMIO from all MSI-X enabled
> > > > devices, so I put it in the VM init/destroy process.
> > > > 
> > > > > > +static int msix_table_mmio_read(struct kvm_io_device *this,
> > > > > > gpa_t addr, int len, +				void *val)
> > > > > > +{
> > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > > +	struct kvm_msix_mmio *mmio;
> > > > > > +	int idx, ret = 0, entry, offset, r;
> > > > > > +
> > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > +	if (idx<  0) {
> > > > > > +		ret = -EOPNOTSUPP;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > +		goto out;
> > > > > 
> > > > > What about (addr & 4) && (len == 8)? Is it supported? It may cross
> > > > > entry boundaries.
> > > > 
> > > > Should not supported. But I haven't found words on the PCI spec for
> > > > it. So I didn't add this check.
> > > 
> > > IMPLEMENTATION NOTE
> > > MSI-X Memory Space Structures in Read/Write Memory
> > > 
> > > ....
> > > 
> > > For all accesses to MSI-X Table and MSI-X PBA fields, software must use
> > > aligned full
> > > DWORD or aligned full QWORD transactions; otherwise, the result is
> > > undefined.
> > 
> > Yes, this one is enough, I would add the checking.
> > 
> > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > +
> > > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > > +	offset = addr&  0xf;
> > > > > > +	r = copy_from_user(val, (void *)(mmio->table_base_va +
> > > > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > > > 
> > > > > > 
> > > > > > +	if (r)
> > > > > > +		goto out;
> > > > > > +out:
> > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > +	return ret;
> > > > > > +}
> > > > > > +
> > > > > > +static int msix_table_mmio_write(struct kvm_io_device *this,
> > > > > > gpa_t addr, +				int len, const void *val)
> > > > > > +{
> > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > > +	struct kvm_msix_mmio *mmio;
> > > > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > > > +	gpa_t entry_base;
> > > > > > +	u32 old_ctrl, new_ctrl;
> > > > > > +
> > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > +	if (idx<  0) {
> > > > > > +		ret = -EOPNOTSUPP;
> > > > > > +		goto out;
> > > > > > +	}
> > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > +		goto out;
> > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > > > +	offset = addr&  0xF;
> > > > > > +
> > > > > > +	if (copy_from_user(&old_ctrl,
> > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > +			sizeof old_ctrl))
> > > > > > +		goto out;
> > > > > 
> > > > > get_user() is easier.
> > > > > 
> > > > > > +
> > > > > > +	/* No allow writing to other fields when entry is unmasked */
> > > > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > > +		goto out;
> > > > > > +
> > > > > > +	if (copy_to_user(entry_base + offset, val, len))
> > > > > > +		goto out;
> > > > > > 
> > > > > > +
> > > > > > +	if (copy_from_user(&new_ctrl,
> > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > +			sizeof new_ctrl))
> > > > > > +		goto out;
> > > > > 
> > > > > put_user()
> > > > > 
> > > > > > +
> > > > > > +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> > > > > > +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> > > > > > +		ret = -ENOTSYNC;
> > > > > > +	if (old_ctrl == new_ctrl)
> > > > > > +		goto out;
> > > > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > > > > +	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > +			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > > > > +	if (r || ret)
> > > > > > +		ret = -ENOTSYNC;
> > > > > > +out:
> > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > +	return ret;
> > > > > > +}
> > > > > 
> > > > > blank line...
> > > > > 
> > > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > > > > +	.read     = msix_table_mmio_read,
> > > > > > +	.write    = msix_table_mmio_write,
> > > > > > +};
> > > > > > +
> > > > > > ++
> > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > > > +{
> > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =&kvm->msix_mmio_dev;
> > > > > > +	struct kvm_msix_mmio *mmio = NULL;
> > > > > > +	int r = 0, i;
> > > > > > +
> > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> > > > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id&&
> > > > > > +		    (mmio_dev->mmio[i].type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) 
==
> > > > > > +		    (mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > > > +			mmio =&mmio_dev->mmio[i];
> > > > > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) 
{
> > > > > > +				r = -EINVAL;
> > > > > > +				goto out;
> > > > > > +			}
> > > > > > +			break;
> > > > > > +		}
> > > > > > +	}
> > > > > > +	if (!mmio) {
> > > > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > > > +			r = -ENOSPC;
> > > > > > +			goto out;
> > > > > > +		}
> > > > > > +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > > > +		mmio_dev->mmio_nr++;
> > > > > > +	}
> > > > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > > 
> > > > > Sanity check to avoid overflow.
> > > > > 
> > > > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > > > +	mmio->flags = mmio_user->flags;
> > > > > 
> > > > > Check for unsupported bits (all of them at present?)
> > > > > 
> > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > > > +
> > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > > > +		mmio->table_base_va = mmio_user->base_va;
> > > > > 
> > > > > Check for va in kernel space.
> > > > > 
> > > > > > +	} else if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> > > > > > +		mmio->pba_base_addr = mmio_user->base_addr;
> > > > > > +		mmio->pba_base_va = mmio_user->base_va;
> > > > > > +	}
> > > > > > +out:
> > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > +	return r;
> > > > > > +}
> > > > > > +
> > > > > > +
> > > > > 
> > > > > In all, looks reasonable.  I'd like to see documentation for it,
> > > > > and review from the pci people.  Alex, mst?
> > > 
> > > Some general comments:
> > > PBA isn't supported in this version, which is OK, but let's not add a
> > > capability until it is, and let's not try to guess what
> > > the interface will look like. I think keeping PBA in userspace will be
> > > hard because it needs to be modified from interrupt context.
> > > Removing the PBA stub will make the interface simpler.
> > 
> > The API only get the PBA address now which should be fine. And we still
> > have threaded irq and tasklet for accessing the userspace for interrupt
> > handler...
> 
> I don't think it's going to work: we are not
> in the context of the right process. Further
> I think we should keep the option of
> reading the PBA status from the device or host kernel open.
> And generally having an interface
> for functionality we don't implement is not a good idea:
> you don't know whether you really can support the interface you promised.

Well, I don't know if we want to read the PBA from the device directly. To me it's not a 
good idea, because the real device has nothing to do with the one we show to the 
guest. At the very least, directly accessing the mask bits of the real device would be very 
dangerous. Avi?

--
regards
Yang, Sheng

> 
> > --
> > regards
> > Yang, Sheng
> > 
> > > > Would add the API document soon.
> > > > 
> > > > --
> > > > regards
> > > > Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30  7:32             ` Sheng Yang
@ 2010-12-30  7:47               ` Michael S. Tsirkin
  2010-12-30  7:55                 ` Sheng Yang
  2010-12-30  9:30                 ` Avi Kivity
  2010-12-30  9:28               ` Avi Kivity
  1 sibling, 2 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2010-12-30  7:47 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Thu, Dec 30, 2010 at 03:32:42PM +0800, Sheng Yang wrote:
> On Wednesday 29 December 2010 17:28:24 Michael S. Tsirkin wrote:
> > On Wed, Dec 29, 2010 at 04:55:19PM +0800, Sheng Yang wrote:
> > > On Wednesday 29 December 2010 16:31:35 Michael S. Tsirkin wrote:
> > > > On Wed, Dec 29, 2010 at 03:18:13PM +0800, Sheng Yang wrote:
> > > > > On Tuesday 28 December 2010 20:26:13 Avi Kivity wrote:
> > > > > > On 12/22/2010 10:44 AM, Sheng Yang wrote:
> > > > > > > Then we can support mask bit operation of assigned devices now.
> > > > > > > 
> > > > > > > 
> > > > > > > @@ -3817,14 +3819,16 @@ static int
> > > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > > 
> > > > > > >   mmio:
> > > > > > >   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
> > > > > > > 
> > > > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > > > 
> > > > > > >   	/*
> > > > > > >   	
> > > > > > >   	 * Is this MMIO handled locally?
> > > > > > >   	 */
> > > > > > > 
> > > > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > > > +	if (!r)
> > > > > > > 
> > > > > > >   		return X86EMUL_CONTINUE;
> > > > > > >   	
> > > > > > >   	vcpu->mmio_needed = 1;
> > > > > > > 
> > > > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > > > > 
> > > > > > This isn't very pretty, exit_reason should be written in
> > > > > > vcpu_mmio_write().  I guess we can refactor it later.
> > > > > 
> > > > > Sure.
> > > > > 
> > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > > > > > > +
> > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> > > > > > > +
> > > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > > > 
> > > > > > Any explanation of these?
> > > > > 
> > > > > I chose to use assigned device id instead of one specific table id,
> > > > > because every device should got at most one MSI MMIO(the same should
> > > > > applied to vfio device as well), and if we use specific table ID, we
> > > > > need way to associate with the device anyway, to perform mask/unmask
> > > > > or other operation. So I think it's better to use device ID here
> > > > > directly.
> > > > 
> > > > Table id will be needed to make things work for emulated devices.
> > > 
> > > I suppose even emulated device should got some kind of id(BDF)?
> > 
> > Not that I know. Look at how irqfd is defined for example,
> > or how interrupts are sent through a gsi.
> > I would like to make the interface be able to support that.
> >
> > > I think that is
> > > enough for identification, which is already there, so we don't need to
> > > allocate another ID for the device - because one device would got at
> > > most one MSI-X MMIO, then use BDF or other device specific ID should be
> > > quite straightforward.
> > 
> > So you propose allocating ids for emulated devices?
> > OK. How will we map e.g. irqfds to these?
> 
> I don't understand. I've checked virtio-pci.c which is using irqfd, and it's still 
> a PCI device, and still have BDF, right? 
> 
> Also, what we want is a way to identify the MSI-X MMIO. For assigned device, we 
> use BDF, then we can easily identify the MMIO as well as the device. For others, 
> even they don't have BDF(I don't think so, because MSI-X is a part of PCI, and 
> every PCI device has BDF), what you need is an ID, no matter what it is and how it 
> defined. QEmu can get the allocation done, and the type field in this API can still 
> tell which kind of ID/devices they are, then determine how to deal with them.


Yes, the PCI device can be identified with e.g. BFD
(won't work for multi-domain but then we can write an allocator maybe).
But how will we inject these interrupts?
We can do this now with GSI ioctl or map GSI to irqfd
and inject with irqfd write.

> > 
> > > > My idea was this: we have the device id in kvm_assigned_msix_entry
> > > > already. Just put table id and entry number in kvm_irq_routing_entry
> > > > (create a new gsi type for this).
> > > > The result will also work for irqfd because these are mapped to gsi.
> > > > 
> > > > > And for the table and pba address, it's due to the mapping in
> > > > > userspace may know the guest MSI-X table address and PBA address at
> > > > > different time(due to different BAR, refer to the code in
> > > > > assigned_dev_iomem_map() of qemu). So I purposed this API to allow
> > > > > each of them can be passed to kernel space individually.
> > > > > 
> > > > > > > +struct kvm_msix_mmio_user {
> > > > > > > +	__u32 dev_id;
> > > > > > > +	__u16 type;
> > > > > > > +	__u16 max_entries_nr;
> > > > > > > +	__u64 base_addr;
> > > > > > > +	__u64 base_va;
> > > > > > > +	__u64 flags;
> > > > > > > +	__u64 reserved[4];
> > > > > > > +};
> > > > > > > +
> > > > > > > 
> > > > > > > 
> > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm,
> > > > > > > +				int assigned_dev_id, int entry, u32 flag)
> > > > > > > +{
> > > > > > 
> > > > > > Need a better name for 'flag' (and make it a bool).
> > > > > > 
> > > > > > > +	int r = -EFAULT;
> > > > > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > > > > +	int i;
> > > > > > > +
> > > > > > > +	if (!irqchip_in_kernel(kvm))
> > > > > > > +		return r;
> > > > > > > +
> > > > > > > +	mutex_lock(&kvm->lock);
> > > > > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > > > > +				      assigned_dev_id);
> > > > > > > +	if (!adev)
> > > > > > > +		goto out;
> > > > > > > +
> > > > > > > +	for (i = 0; i<  adev->entries_nr; i++)
> > > > > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > > > > +			if (flag)
> > > > > > > +				disable_irq_nosync(
> > > > > > > +					adev->host_msix_entries[i].vector);
> > > > > > > +			else
> > > > > > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > > > > > +			r = 0;
> > > > > > > +			break;
> > > > > > > +		}
> > > > > > > +out:
> > > > > > > +	mutex_unlock(&kvm->lock);
> > > > > > > +	return r;
> > > > > > > +}
> > > > > > > 
> > > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void)
> > > > > > > 
> > > > > > >   		return r;
> > > > > > >   	
> > > > > > >   	}
> > > > > > >   
> > > > > > >   #endif
> > > > > > > 
> > > > > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > > > > +	if (r<  0) {
> > > > > > > +		kvm_put_kvm(kvm);
> > > > > > > +		return r;
> > > > > > > +	}
> > > > > > 
> > > > > > Shouldn't this be part of individual KVM_REGISTER_MSIX_MMIO calls?
> > > > > 
> > > > > In fact this MMIO device is more like global one for the VM, not for
> > > > > every devices. It should handle all MMIO from all MSI-X enabled
> > > > > devices, so I put it in the VM init/destroy process.
> > > > > 
> > > > > > > +static int msix_table_mmio_read(struct kvm_io_device *this,
> > > > > > > gpa_t addr, int len, +				void *val)
> > > > > > > +{
> > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > > > +	struct kvm_msix_mmio *mmio;
> > > > > > > +	int idx, ret = 0, entry, offset, r;
> > > > > > > +
> > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > +	if (idx<  0) {
> > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > +		goto out;
> > > > > > > +	}
> > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > +		goto out;
> > > > > > 
> > > > > > What about (addr & 4) && (len == 8)? Is it supported? It may cross
> > > > > > entry boundaries.
> > > > > 
> > > > > Should not supported. But I haven't found words on the PCI spec for
> > > > > it. So I didn't add this check.
> > > > 
> > > > IMPLEMENTATION NOTE
> > > > MSI-X Memory Space Structures in Read/Write Memory
> > > > 
> > > > ....
> > > > 
> > > > For all accesses to MSI-X Table and MSI-X PBA fields, software must use
> > > > aligned full
> > > > DWORD or aligned full QWORD transactions; otherwise, the result is
> > > > undefined.
> > > 
> > > Yes, this one is enough, I would add the checking.
> > > 
> > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > +
> > > > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > > > +	offset = addr&  0xf;
> > > > > > > +	r = copy_from_user(val, (void *)(mmio->table_base_va +
> > > > > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > > > > 
> > > > > > > 
> > > > > > > +	if (r)
> > > > > > > +		goto out;
> > > > > > > +out:
> > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > +	return ret;
> > > > > > > +}
> > > > > > > +
> > > > > > > +static int msix_table_mmio_write(struct kvm_io_device *this,
> > > > > > > gpa_t addr, +				int len, const void *val)
> > > > > > > +{
> > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > > > +	struct kvm_msix_mmio *mmio;
> > > > > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > > > > +	gpa_t entry_base;
> > > > > > > +	u32 old_ctrl, new_ctrl;
> > > > > > > +
> > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > +	if (idx<  0) {
> > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > +		goto out;
> > > > > > > +	}
> > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > +		goto out;
> > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > +	entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE;
> > > > > > > +	entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > > > > +	offset = addr&  0xF;
> > > > > > > +
> > > > > > > +	if (copy_from_user(&old_ctrl,
> > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > +			sizeof old_ctrl))
> > > > > > > +		goto out;
> > > > > > 
> > > > > > get_user() is easier.
> > > > > > 
> > > > > > > +
> > > > > > > +	/* No allow writing to other fields when entry is unmasked */
> > > > > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > > > +		goto out;
> > > > > > > +
> > > > > > > +	if (copy_to_user(entry_base + offset, val, len))
> > > > > > > +		goto out;
> > > > > > > 
> > > > > > > +
> > > > > > > +	if (copy_from_user(&new_ctrl,
> > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > +			sizeof new_ctrl))
> > > > > > > +		goto out;
> > > > > > 
> > > > > > put_user()
> > > > > > 
> > > > > > > +
> > > > > > > +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> > > > > > > +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> > > > > > > +		ret = -ENOTSYNC;
> > > > > > > +	if (old_ctrl == new_ctrl)
> > > > > > > +		goto out;
> > > > > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > > > > > +	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > +			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > > > > > +	if (r || ret)
> > > > > > > +		ret = -ENOTSYNC;
> > > > > > > +out:
> > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > +	return ret;
> > > > > > > +}
> > > > > > 
> > > > > > blank line...
> > > > > > 
> > > > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = {
> > > > > > > +	.read     = msix_table_mmio_read,
> > > > > > > +	.write    = msix_table_mmio_write,
> > > > > > > +};
> > > > > > > +
> > > > > > > ++
> > > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > > > > +{
> > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =&kvm->msix_mmio_dev;
> > > > > > > +	struct kvm_msix_mmio *mmio = NULL;
> > > > > > > +	int r = 0, i;
> > > > > > > +
> > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> > > > > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id&&
> > > > > > > +		    (mmio_dev->mmio[i].type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) 
> ==
> > > > > > > +		    (mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > > > > +			mmio =&mmio_dev->mmio[i];
> > > > > > > +			if (mmio->max_entries_nr != mmio_user->max_entries_nr) 
> {
> > > > > > > +				r = -EINVAL;
> > > > > > > +				goto out;
> > > > > > > +			}
> > > > > > > +			break;
> > > > > > > +		}
> > > > > > > +	}
> > > > > > > +	if (!mmio) {
> > > > > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > > > > +			r = -ENOSPC;
> > > > > > > +			goto out;
> > > > > > > +		}
> > > > > > > +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > > > > +		mmio_dev->mmio_nr++;
> > > > > > > +	}
> > > > > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > > > 
> > > > > > Sanity check to avoid overflow.
> > > > > > 
> > > > > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > > > > +	mmio->flags = mmio_user->flags;
> > > > > > 
> > > > > > Check for unsupported bits (all of them at present?)
> > > > > > 
> > > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > > > > +
> > > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > > > > +		mmio->table_base_va = mmio_user->base_va;
> > > > > > 
> > > > > > Check for va in kernel space.
> > > > > > 
> > > > > > > +	} else if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> > > > > > > +		mmio->pba_base_addr = mmio_user->base_addr;
> > > > > > > +		mmio->pba_base_va = mmio_user->base_va;
> > > > > > > +	}
> > > > > > > +out:
> > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > +	return r;
> > > > > > > +}
> > > > > > > +
> > > > > > > +
> > > > > > 
> > > > > > In all, looks reasonable.  I'd like to see documentation for it,
> > > > > > and review from the pci people.  Alex, mst?
> > > > 
> > > > Some general comments:
> > > > PBA isn't supported in this version, which is OK, but let's not add a
> > > > capability until it is, and let's not try to guess what
> > > > the interface will look like. I think keeping PBA in userspace will be
> > > > hard because it needs to be modified from interrupt context.
> > > > Removing the PBA stub will make the interface simpler.
> > > 
> > > The API only get the PBA address now which should be fine. And we still
> > > have threaded irq and tasklet for accessing the userspace for interrupt
> > > handler...
> > 
> > I don't think it's going to work: we are not
> > in the context of the right process. Further
> > I think we should keep the option of
> > reading the PBA status from the device or host kernel open.
> > And generally having an interface
> > for functionality we don't implement is not a good idea:
> > you don't know whether you really can support the interface you promised.
> 
> Well, I don't know if we want to read PBA from device directly. To me it's not a 
> good idea because the real device has nothing to do with the one we show to the 
> guest. At least direct accessing the mask bits of real device would be very 
> dangerous. Avi?
> 
> --
> regards
> Yang, Sheng

I am not really suggesting this. What I am saying is that the PBA is unimplemented,
so let us not commit to an interface yet.

> > 
> > > --
> > > regards
> > > Yang, Sheng
> > > 
> > > > > Would add the API document soon.
> > > > > 
> > > > > --
> > > > > regards
> > > > > Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30  7:47               ` Michael S. Tsirkin
@ 2010-12-30  7:55                 ` Sheng Yang
  2010-12-30  8:15                   ` Michael S. Tsirkin
  2010-12-30  9:30                 ` Avi Kivity
  1 sibling, 1 reply; 32+ messages in thread
From: Sheng Yang @ 2010-12-30  7:55 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Thursday 30 December 2010 15:47:48 Michael S. Tsirkin wrote:
> On Thu, Dec 30, 2010 at 03:32:42PM +0800, Sheng Yang wrote:
> > On Wednesday 29 December 2010 17:28:24 Michael S. Tsirkin wrote:
> > > On Wed, Dec 29, 2010 at 04:55:19PM +0800, Sheng Yang wrote:
> > > > On Wednesday 29 December 2010 16:31:35 Michael S. Tsirkin wrote:
> > > > > On Wed, Dec 29, 2010 at 03:18:13PM +0800, Sheng Yang wrote:
> > > > > > On Tuesday 28 December 2010 20:26:13 Avi Kivity wrote:
> > > > > > > On 12/22/2010 10:44 AM, Sheng Yang wrote:
> > > > > > > > Then we can support mask bit operation of assigned devices
> > > > > > > > now.
> > > > > > > > 
> > > > > > > > 
> > > > > > > > @@ -3817,14 +3819,16 @@ static int
> > > > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > > > 
> > > > > > > >   mmio:
> > > > > > > >   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64
> > > > > > > >   	*)val);
> > > > > > > > 
> > > > > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > > > > 
> > > > > > > >   	/*
> > > > > > > >   	
> > > > > > > >   	 * Is this MMIO handled locally?
> > > > > > > >   	 */
> > > > > > > > 
> > > > > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > > > > +	if (!r)
> > > > > > > > 
> > > > > > > >   		return X86EMUL_CONTINUE;
> > > > > > > >   	
> > > > > > > >   	vcpu->mmio_needed = 1;
> > > > > > > > 
> > > > > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > > > > > 
> > > > > > > This isn't very pretty, exit_reason should be written in
> > > > > > > vcpu_mmio_write().  I guess we can refactor it later.
> > > > > > 
> > > > > > Sure.
> > > > > > 
> > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > > > > > > > +
> > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> > > > > > > > +
> > > > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > > > > 
> > > > > > > Any explanation of these?
> > > > > > 
> > > > > > I chose to use assigned device id instead of one specific table
> > > > > > id, because every device should got at most one MSI MMIO(the
> > > > > > same should applied to vfio device as well), and if we use
> > > > > > specific table ID, we need way to associate with the device
> > > > > > anyway, to perform mask/unmask or other operation. So I think
> > > > > > it's better to use device ID here directly.
> > > > > 
> > > > > Table id will be needed to make things work for emulated devices.
> > > > 
> > > > I suppose even emulated device should got some kind of id(BDF)?
> > > 
> > > Not that I know. Look at how irqfd is defined for example,
> > > or how interrupts are sent through a gsi.
> > > I would like to make the interface be able to support that.
> > > 
> > > > I think that is
> > > > enough for identification, which is already there, so we don't need
> > > > to allocate another ID for the device - because one device would got
> > > > at most one MSI-X MMIO, then use BDF or other device specific ID
> > > > should be quite straightforward.
> > > 
> > > So you propose allocating ids for emulated devices?
> > > OK. How will we map e.g. irqfds to these?
> > 
> > I don't understand. I've checked virtio-pci.c which is using irqfd, and
> > it's still a PCI device, and still have BDF, right?
> > 
> > Also, what we want is a way to identify the MSI-X MMIO. For assigned
> > device, we use BDF, then we can easily identify the MMIO as well as the
> > device. For others, even they don't have BDF(I don't think so, because
> > MSI-X is a part of PCI, and every PCI device has BDF), what you need is
> > an ID, no matter what it is and how it defined. QEmu can get the
> > allocation done, and the type field in this API can still tell which
> > kind of ID/devices they are, then determine how to deal with them.
> 
> Yes, the PCI device can be identified with e.g. BFD
> (won't work for multi-domain but then we can write an allocator maybe).
> But how will we inject these interrupts?
> We can do this now with GSI ioctl or map GSI to irqfd
> and inject with irqfd write.

I suppose it's not in the scope of this patch... But I think you can still do 
this; everything is the same as before. QEmu can read from the table to get the 
data/address pair, then program the routing table, etc.

--
regards
Yang, Sheng
 
> > > > > My idea was this: we have the device id in kvm_assigned_msix_entry
> > > > > already. Just put table id and entry number in
> > > > > kvm_irq_routing_entry (create a new gsi type for this).
> > > > > The result will also work for irqfd because these are mapped to
> > > > > gsi.
> > > > > 
> > > > > > And for the table and pba address, it's due to the mapping in
> > > > > > userspace may know the guest MSI-X table address and PBA address
> > > > > > at different time(due to different BAR, refer to the code in
> > > > > > assigned_dev_iomem_map() of qemu). So I purposed this API to
> > > > > > allow each of them can be passed to kernel space individually.
> > > > > > 
> > > > > > > > +struct kvm_msix_mmio_user {
> > > > > > > > +	__u32 dev_id;
> > > > > > > > +	__u16 type;
> > > > > > > > +	__u16 max_entries_nr;
> > > > > > > > +	__u64 base_addr;
> > > > > > > > +	__u64 base_va;
> > > > > > > > +	__u64 flags;
> > > > > > > > +	__u64 reserved[4];
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > 
> > > > > > > > 
> > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm
> > > > > > > > *kvm, +				int assigned_dev_id, int entry, u32 flag)
> > > > > > > > +{
> > > > > > > 
> > > > > > > Need a better name for 'flag' (and make it a bool).
> > > > > > > 
> > > > > > > > +	int r = -EFAULT;
> > > > > > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > > > > > +	int i;
> > > > > > > > +
> > > > > > > > +	if (!irqchip_in_kernel(kvm))
> > > > > > > > +		return r;
> > > > > > > > +
> > > > > > > > +	mutex_lock(&kvm->lock);
> > > > > > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > > > > > +				      assigned_dev_id);
> > > > > > > > +	if (!adev)
> > > > > > > > +		goto out;
> > > > > > > > +
> > > > > > > > +	for (i = 0; i<  adev->entries_nr; i++)
> > > > > > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > > > > > +			if (flag)
> > > > > > > > +				disable_irq_nosync(
> > > > > > > > +					adev->host_msix_entries[i].vector);
> > > > > > > > +			else
> > > > > > > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > > > > > > +			r = 0;
> > > > > > > > +			break;
> > > > > > > > +		}
> > > > > > > > +out:
> > > > > > > > +	mutex_unlock(&kvm->lock);
> > > > > > > > +	return r;
> > > > > > > > +}
> > > > > > > > 
> > > > > > > > @@ -1988,6 +2008,12 @@ static int
> > > > > > > > kvm_dev_ioctl_create_vm(void)
> > > > > > > > 
> > > > > > > >   		return r;
> > > > > > > >   	
> > > > > > > >   	}
> > > > > > > >   
> > > > > > > >   #endif
> > > > > > > > 
> > > > > > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > > > > > +	if (r<  0) {
> > > > > > > > +		kvm_put_kvm(kvm);
> > > > > > > > +		return r;
> > > > > > > > +	}
> > > > > > > 
> > > > > > > Shouldn't this be part of individual KVM_REGISTER_MSIX_MMIO
> > > > > > > calls?
> > > > > > 
> > > > > > In fact this MMIO device is more like global one for the VM, not
> > > > > > for every devices. It should handle all MMIO from all MSI-X
> > > > > > enabled devices, so I put it in the VM init/destroy process.
> > > > > > 
> > > > > > > > +static int msix_table_mmio_read(struct kvm_io_device *this,
> > > > > > > > gpa_t addr, int len, +				void *val)
> > > > > > > > +{
> > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > > > > +	struct kvm_msix_mmio *mmio;
> > > > > > > > +	int idx, ret = 0, entry, offset, r;
> > > > > > > > +
> > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > > +	if (idx<  0) {
> > > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > > +		goto out;
> > > > > > > > +	}
> > > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > > +		goto out;
> > > > > > > 
> > > > > > > What about (addr & 4) && (len == 8)? Is it supported? It may
> > > > > > > cross entry boundaries.
> > > > > > 
> > > > > > Should not supported. But I haven't found words on the PCI spec
> > > > > > for it. So I didn't add this check.
> > > > > 
> > > > > IMPLEMENTATION NOTE
> > > > > MSI-X Memory Space Structures in Read/Write Memory
> > > > > 
> > > > > ....
> > > > > 
> > > > > For all accesses to MSI-X Table and MSI-X PBA fields, software must
> > > > > use aligned full
> > > > > DWORD or aligned full QWORD transactions; otherwise, the result is
> > > > > undefined.
> > > > 
> > > > Yes, this one is enough, I would add the checking.
> > > > 
> > > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > > +
> > > > > > > > +	entry = (addr - mmio->table_base_addr) /
> > > > > > > > PCI_MSIX_ENTRY_SIZE; +	offset = addr&  0xf;
> > > > > > > > +	r = copy_from_user(val, (void *)(mmio->table_base_va +
> > > > > > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > > > > > 
> > > > > > > > 
> > > > > > > > +	if (r)
> > > > > > > > +		goto out;
> > > > > > > > +out:
> > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > +	return ret;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +static int msix_table_mmio_write(struct kvm_io_device *this,
> > > > > > > > gpa_t addr, +				int len, const void *val)
> > > > > > > > +{
> > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > > > > +	struct kvm_msix_mmio *mmio;
> > > > > > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > > > > > +	gpa_t entry_base;
> > > > > > > > +	u32 old_ctrl, new_ctrl;
> > > > > > > > +
> > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > > +	if (idx<  0) {
> > > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > > +		goto out;
> > > > > > > > +	}
> > > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > > +		goto out;
> > > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > > +	entry = (addr - mmio->table_base_addr) /
> > > > > > > > PCI_MSIX_ENTRY_SIZE; +	entry_base = mmio->table_base_va +
> > > > > > > > entry * PCI_MSIX_ENTRY_SIZE; +	offset = addr&  0xF;
> > > > > > > > +
> > > > > > > > +	if (copy_from_user(&old_ctrl,
> > > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > > +			sizeof old_ctrl))
> > > > > > > > +		goto out;
> > > > > > > 
> > > > > > > get_user() is easier.
> > > > > > > 
> > > > > > > > +
> > > > > > > > +	/* No allow writing to other fields when entry is unmasked
> > > > > > > > */ +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > > > > +		goto out;
> > > > > > > > +
> > > > > > > > +	if (copy_to_user(entry_base + offset, val, len))
> > > > > > > > +		goto out;
> > > > > > > > 
> > > > > > > > +
> > > > > > > > +	if (copy_from_user(&new_ctrl,
> > > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > > +			sizeof new_ctrl))
> > > > > > > > +		goto out;
> > > > > > > 
> > > > > > > put_user()
> > > > > > > 
> > > > > > > > +
> > > > > > > > +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> > > > > > > > +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> > > > > > > > +		ret = -ENOTSYNC;
> > > > > > > > +	if (old_ctrl == new_ctrl)
> > > > > > > > +		goto out;
> > > > > > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > > +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > > > > > > +	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > > +			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > > > > > > +	if (r || ret)
> > > > > > > > +		ret = -ENOTSYNC;
> > > > > > > > +out:
> > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > +	return ret;
> > > > > > > > +}
> > > > > > > 
> > > > > > > blank line...
> > > > > > > 
> > > > > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops =
> > > > > > > > { +	.read     = msix_table_mmio_read,
> > > > > > > > +	.write    = msix_table_mmio_write,
> > > > > > > > +};
> > > > > > > > +
> > > > > > > > ++
> > > > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > > > > > +{
> > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =&kvm->msix_mmio_dev;
> > > > > > > > +	struct kvm_msix_mmio *mmio = NULL;
> > > > > > > > +	int r = 0, i;
> > > > > > > > +
> > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> > > > > > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id&&
> > > > > > > > +		    (mmio_dev->mmio[i].type&  
KVM_MSIX_MMIO_TYPE_DEV_MASK)
> > 
> > ==
> > 
> > > > > > > > +		    (mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > > > > > +			mmio =&mmio_dev->mmio[i];
> > > > > > > > +			if (mmio->max_entries_nr != mmio_user-
>max_entries_nr)
> > 
> > {
> > 
> > > > > > > > +				r = -EINVAL;
> > > > > > > > +				goto out;
> > > > > > > > +			}
> > > > > > > > +			break;
> > > > > > > > +		}
> > > > > > > > +	}
> > > > > > > > +	if (!mmio) {
> > > > > > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > > > > > +			r = -ENOSPC;
> > > > > > > > +			goto out;
> > > > > > > > +		}
> > > > > > > > +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > > > > > +		mmio_dev->mmio_nr++;
> > > > > > > > +	}
> > > > > > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > > > > 
> > > > > > > Sanity check to avoid overflow.
> > > > > > > 
> > > > > > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > > > > > +	mmio->flags = mmio_user->flags;
> > > > > > > 
> > > > > > > Check for unsupported bits (all of them at present?)
> > > > > > > 
> > > > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > > > > > +
> > > > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > > > > > +		mmio->table_base_va = mmio_user->base_va;
> > > > > > > 
> > > > > > > Check for va in kernel space.
> > > > > > > 
> > > > > > > > +	} else if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK)
> > > > > > > > == +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> > > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> > > > > > > > +		mmio->pba_base_addr = mmio_user->base_addr;
> > > > > > > > +		mmio->pba_base_va = mmio_user->base_va;
> > > > > > > > +	}
> > > > > > > > +out:
> > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > +	return r;
> > > > > > > > +}
> > > > > > > > +
> > > > > > > > +
> > > > > > > 
> > > > > > > In all, looks reasonable.  I'd like to see documentation for
> > > > > > > it, and review from the pci people.  Alex, mst?
> > > > > 
> > > > > Some general comments:
> > > > > PBA isn't supported in this version, which is OK, but let's not add
> > > > > a capability until it is, and let's not try to guess what
> > > > > the interface will look like. I think keeping PBA in userspace will
> > > > > be hard because it needs to be modified from interrupt context.
> > > > > Removing the PBA stub will make the interface simpler.
> > > > 
> > > > The API only get the PBA address now which should be fine. And we
> > > > still have threaded irq and tasklet for accessing the userspace for
> > > > interrupt handler...
> > > 
> > > I don't think it's going to work: we are not
> > > in the context of the right process. Further
> > > I think we should keep the option of
> > > reading the PBA status from the device or host kernel open.
> > > And generally having an interface
> > > for functionality we don't implement is not a good idea:
> > > you don't know whether you really can support the interface you
> > > promised.
> > 
> > Well, I don't know if we want to read PBA from device directly. To me
> > it's not a good idea because the real device has nothing to do with the
> > one we show to the guest. At least direct accessing the mask bits of
> > real device would be very dangerous. Avi?
> > 
> > --
> > regards
> > Yang, Sheng
> 
> I am not really suggesting this. What I say is PBA is unimplemented
> let us not commit to an interface yet.
> 
> > > > --
> > > > regards
> > > > Yang, Sheng
> > > > 
> > > > > > Would add the API document soon.
> > > > > > 
> > > > > > --
> > > > > > regards
> > > > > > Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30  7:55                 ` Sheng Yang
@ 2010-12-30  8:15                   ` Michael S. Tsirkin
  2010-12-30  8:24                     ` Sheng Yang
  0 siblings, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2010-12-30  8:15 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Thu, Dec 30, 2010 at 03:55:10PM +0800, Sheng Yang wrote:
> On Thursday 30 December 2010 15:47:48 Michael S. Tsirkin wrote:
> > On Thu, Dec 30, 2010 at 03:32:42PM +0800, Sheng Yang wrote:
> > > On Wednesday 29 December 2010 17:28:24 Michael S. Tsirkin wrote:
> > > > On Wed, Dec 29, 2010 at 04:55:19PM +0800, Sheng Yang wrote:
> > > > > On Wednesday 29 December 2010 16:31:35 Michael S. Tsirkin wrote:
> > > > > > On Wed, Dec 29, 2010 at 03:18:13PM +0800, Sheng Yang wrote:
> > > > > > > On Tuesday 28 December 2010 20:26:13 Avi Kivity wrote:
> > > > > > > > On 12/22/2010 10:44 AM, Sheng Yang wrote:
> > > > > > > > > Then we can support mask bit operation of assigned devices
> > > > > > > > > now.
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > @@ -3817,14 +3819,16 @@ static int
> > > > > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > > > > 
> > > > > > > > >   mmio:
> > > > > > > > >   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64
> > > > > > > > >   	*)val);
> > > > > > > > > 
> > > > > > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > > > > > 
> > > > > > > > >   	/*
> > > > > > > > >   	
> > > > > > > > >   	 * Is this MMIO handled locally?
> > > > > > > > >   	 */
> > > > > > > > > 
> > > > > > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > > > > > +	if (!r)
> > > > > > > > > 
> > > > > > > > >   		return X86EMUL_CONTINUE;
> > > > > > > > >   	
> > > > > > > > >   	vcpu->mmio_needed = 1;
> > > > > > > > > 
> > > > > > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > > > > > > 
> > > > > > > > This isn't very pretty, exit_reason should be written in
> > > > > > > > vcpu_mmio_write().  I guess we can refactor it later.
> > > > > > > 
> > > > > > > Sure.
> > > > > > > 
> > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > > > > > > > > +
> > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> > > > > > > > > +
> > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > > > > > 
> > > > > > > > Any explanation of these?
> > > > > > > 
> > > > > > > I chose to use assigned device id instead of one specific table
> > > > > > > id, because every device should got at most one MSI MMIO(the
> > > > > > > same should applied to vfio device as well), and if we use
> > > > > > > specific table ID, we need way to associate with the device
> > > > > > > anyway, to perform mask/unmask or other operation. So I think
> > > > > > > it's better to use device ID here directly.
> > > > > > 
> > > > > > Table id will be needed to make things work for emulated devices.
> > > > > 
> > > > > I suppose even emulated device should got some kind of id(BDF)?
> > > > 
> > > > Not that I know. Look at how irqfd is defined for example,
> > > > or how interrupts are sent through a gsi.
> > > > I would like to make the interface be able to support that.
> > > > 
> > > > > I think that is
> > > > > enough for identification, which is already there, so we don't need
> > > > > to allocate another ID for the device - because one device would got
> > > > > at most one MSI-X MMIO, then use BDF or other device specific ID
> > > > > should be quite straightforward.
> > > > 
> > > > So you propose allocating ids for emulated devices?
> > > > OK. How will we map e.g. irqfds to these?
> > > 
> > > I don't understand. I've checked virtio-pci.c which is using irqfd, and
> > > it's still a PCI device, and still have BDF, right?
> > > 
> > > Also, what we want is a way to identify the MSI-X MMIO. For assigned
> > > device, we use BDF, then we can easily identify the MMIO as well as the
> > > device. For others, even they don't have BDF(I don't think so, because
> > > MSI-X is a part of PCI, and every PCI device has BDF), what you need is
> > > an ID, no matter what it is and how it defined. QEmu can get the
> > > allocation done, and the type field in this API can still tell which
> > > kind of ID/devices they are, then determine how to deal with them.
> > 
> > Yes, the PCI device can be identified with e.g. BFD
> > (won't work for multi-domain but then we can write an allocator maybe).
> > But how will we inject these interrupts?
> > We can do this now with GSI ioctl or map GSI to irqfd
> > and inject with irqfd write.
> 
> I suppose it's not in the scope of this patch...

This is why I suggested mapping GSI to msix.

> But I think you can still do 
> this, everything is the same as before. QEmu can read from table to get 
> data/address pair, then program the routing table, etc.

Yes, fine, but mask is the problem :)
When qemu/irqfd injects an interrupt and it's masked,
guest should not be interrupted.



> --
> regards
> Yang, Sheng
>  
> > > > > > My idea was this: we have the device id in kvm_assigned_msix_entry
> > > > > > already. Just put table id and entry number in
> > > > > > kvm_irq_routing_entry (create a new gsi type for this).
> > > > > > The result will also work for irqfd because these are mapped to
> > > > > > gsi.
> > > > > > 
> > > > > > > And for the table and pba address, it's due to the mapping in
> > > > > > > userspace may know the guest MSI-X table address and PBA address
> > > > > > > at different time(due to different BAR, refer to the code in
> > > > > > > assigned_dev_iomem_map() of qemu). So I purposed this API to
> > > > > > > allow each of them can be passed to kernel space individually.
> > > > > > > 
> > > > > > > > > +struct kvm_msix_mmio_user {
> > > > > > > > > +	__u32 dev_id;
> > > > > > > > > +	__u16 type;
> > > > > > > > > +	__u16 max_entries_nr;
> > > > > > > > > +	__u64 base_addr;
> > > > > > > > > +	__u64 base_va;
> > > > > > > > > +	__u64 flags;
> > > > > > > > > +	__u64 reserved[4];
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm
> > > > > > > > > *kvm, +				int assigned_dev_id, int entry, u32 flag)
> > > > > > > > > +{
> > > > > > > > 
> > > > > > > > Need a better name for 'flag' (and make it a bool).
> > > > > > > > 
> > > > > > > > > +	int r = -EFAULT;
> > > > > > > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > > > > > > +	int i;
> > > > > > > > > +
> > > > > > > > > +	if (!irqchip_in_kernel(kvm))
> > > > > > > > > +		return r;
> > > > > > > > > +
> > > > > > > > > +	mutex_lock(&kvm->lock);
> > > > > > > > > +	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > > > > > > +				      assigned_dev_id);
> > > > > > > > > +	if (!adev)
> > > > > > > > > +		goto out;
> > > > > > > > > +
> > > > > > > > > +	for (i = 0; i<  adev->entries_nr; i++)
> > > > > > > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > > > > > > +			if (flag)
> > > > > > > > > +				disable_irq_nosync(
> > > > > > > > > +					adev->host_msix_entries[i].vector);
> > > > > > > > > +			else
> > > > > > > > > +				enable_irq(adev->host_msix_entries[i].vector);
> > > > > > > > > +			r = 0;
> > > > > > > > > +			break;
> > > > > > > > > +		}
> > > > > > > > > +out:
> > > > > > > > > +	mutex_unlock(&kvm->lock);
> > > > > > > > > +	return r;
> > > > > > > > > +}
> > > > > > > > > 
> > > > > > > > > @@ -1988,6 +2008,12 @@ static int
> > > > > > > > > kvm_dev_ioctl_create_vm(void)
> > > > > > > > > 
> > > > > > > > >   		return r;
> > > > > > > > >   	
> > > > > > > > >   	}
> > > > > > > > >   
> > > > > > > > >   #endif
> > > > > > > > > 
> > > > > > > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > > > > > > +	if (r<  0) {
> > > > > > > > > +		kvm_put_kvm(kvm);
> > > > > > > > > +		return r;
> > > > > > > > > +	}
> > > > > > > > 
> > > > > > > > Shouldn't this be part of individual KVM_REGISTER_MSIX_MMIO
> > > > > > > > calls?
> > > > > > > 
> > > > > > > In fact this MMIO device is more like global one for the VM, not
> > > > > > > for every devices. It should handle all MMIO from all MSI-X
> > > > > > > enabled devices, so I put it in the VM init/destroy process.
> > > > > > > 
> > > > > > > > > +static int msix_table_mmio_read(struct kvm_io_device *this,
> > > > > > > > > gpa_t addr, int len, +				void *val)
> > > > > > > > > +{
> > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > > > > > +	struct kvm_msix_mmio *mmio;
> > > > > > > > > +	int idx, ret = 0, entry, offset, r;
> > > > > > > > > +
> > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > > > +	if (idx<  0) {
> > > > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > > > +		goto out;
> > > > > > > > > +	}
> > > > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > > > +		goto out;
> > > > > > > > 
> > > > > > > > What about (addr & 4) && (len == 8)? Is it supported? It may
> > > > > > > > cross entry boundaries.
> > > > > > > 
> > > > > > > Should not supported. But I haven't found words on the PCI spec
> > > > > > > for it. So I didn't add this check.
> > > > > > 
> > > > > > IMPLEMENTATION NOTE
> > > > > > MSI-X Memory Space Structures in Read/Write Memory
> > > > > > 
> > > > > > ....
> > > > > > 
> > > > > > For all accesses to MSI-X Table and MSI-X PBA fields, software must
> > > > > > use aligned full
> > > > > > DWORD or aligned full QWORD transactions; otherwise, the result is
> > > > > > undefined.
> > > > > 
> > > > > Yes, this one is enough, I would add the checking.
> > > > > 
> > > > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > > > +
> > > > > > > > > +	entry = (addr - mmio->table_base_addr) /
> > > > > > > > > PCI_MSIX_ENTRY_SIZE; +	offset = addr&  0xf;
> > > > > > > > > +	r = copy_from_user(val, (void *)(mmio->table_base_va +
> > > > > > > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > > > > > > 
> > > > > > > > > 
> > > > > > > > > +	if (r)
> > > > > > > > > +		goto out;
> > > > > > > > > +out:
> > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > +	return ret;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +static int msix_table_mmio_write(struct kvm_io_device *this,
> > > > > > > > > gpa_t addr, +				int len, const void *val)
> > > > > > > > > +{
> > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > > > +		container_of(this, struct kvm_msix_mmio_dev, table_dev);
> > > > > > > > > +	struct kvm_msix_mmio *mmio;
> > > > > > > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > > > > > > +	gpa_t entry_base;
> > > > > > > > > +	u32 old_ctrl, new_ctrl;
> > > > > > > > > +
> > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > > > +	if (idx<  0) {
> > > > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > > > +		goto out;
> > > > > > > > > +	}
> > > > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > > > +		goto out;
> > > > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > > > +	entry = (addr - mmio->table_base_addr) /
> > > > > > > > > PCI_MSIX_ENTRY_SIZE; +	entry_base = mmio->table_base_va +
> > > > > > > > > entry * PCI_MSIX_ENTRY_SIZE; +	offset = addr&  0xF;
> > > > > > > > > +
> > > > > > > > > +	if (copy_from_user(&old_ctrl,
> > > > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > > > +			sizeof old_ctrl))
> > > > > > > > > +		goto out;
> > > > > > > > 
> > > > > > > > get_user() is easier.
> > > > > > > > 
> > > > > > > > > +
> > > > > > > > > +	/* No allow writing to other fields when entry is unmasked
> > > > > > > > > */ +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > > > +	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > > > > > +		goto out;
> > > > > > > > > +
> > > > > > > > > +	if (copy_to_user(entry_base + offset, val, len))
> > > > > > > > > +		goto out;
> > > > > > > > > 
> > > > > > > > > +
> > > > > > > > > +	if (copy_from_user(&new_ctrl,
> > > > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > > > +			sizeof new_ctrl))
> > > > > > > > > +		goto out;
> > > > > > > > 
> > > > > > > > put_user()
> > > > > > > > 
> > > > > > > > > +
> > > > > > > > > +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4) ||
> > > > > > > > > +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> > > > > > > > > +		ret = -ENOTSYNC;
> > > > > > > > > +	if (old_ctrl == new_ctrl)
> > > > > > > > > +		goto out;
> > > > > > > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > > > +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1);
> > > > > > > > > +	else if ((old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > > > +			!(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0);
> > > > > > > > > +	if (r || ret)
> > > > > > > > > +		ret = -ENOTSYNC;
> > > > > > > > > +out:
> > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > +	return ret;
> > > > > > > > > +}
> > > > > > > > 
> > > > > > > > blank line...
> > > > > > > > 
> > > > > > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops =
> > > > > > > > > { +	.read     = msix_table_mmio_read,
> > > > > > > > > +	.write    = msix_table_mmio_write,
> > > > > > > > > +};
> > > > > > > > > +
> > > > > > > > > ++
> > > > > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > > > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > > > > > > +{
> > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =&kvm->msix_mmio_dev;
> > > > > > > > > +	struct kvm_msix_mmio *mmio = NULL;
> > > > > > > > > +	int r = 0, i;
> > > > > > > > > +
> > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> > > > > > > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id&&
> > > > > > > > > +		    (mmio_dev->mmio[i].type&  
> KVM_MSIX_MMIO_TYPE_DEV_MASK)
> > > 
> > > ==
> > > 
> > > > > > > > > +		    (mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > > > > > > +			mmio =&mmio_dev->mmio[i];
> > > > > > > > > +			if (mmio->max_entries_nr != mmio_user-
> >max_entries_nr)
> > > 
> > > {
> > > 
> > > > > > > > > +				r = -EINVAL;
> > > > > > > > > +				goto out;
> > > > > > > > > +			}
> > > > > > > > > +			break;
> > > > > > > > > +		}
> > > > > > > > > +	}
> > > > > > > > > +	if (!mmio) {
> > > > > > > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > > > > > > +			r = -ENOSPC;
> > > > > > > > > +			goto out;
> > > > > > > > > +		}
> > > > > > > > > +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > > > > > > +		mmio_dev->mmio_nr++;
> > > > > > > > > +	}
> > > > > > > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > > > > > 
> > > > > > > > Sanity check to avoid overflow.
> > > > > > > > 
> > > > > > > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > > > > > > +	mmio->flags = mmio_user->flags;
> > > > > > > > 
> > > > > > > > Check for unsupported bits (all of them at present?)
> > > > > > > > 
> > > > > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > > > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > > > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > > > > > > +
> > > > > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > > > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > > > > > > +		mmio->table_base_va = mmio_user->base_va;
> > > > > > > > 
> > > > > > > > Check for va in kernel space.
> > > > > > > > 
> > > > > > > > > +	} else if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK)
> > > > > > > > > == +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> > > > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> > > > > > > > > +		mmio->pba_base_addr = mmio_user->base_addr;
> > > > > > > > > +		mmio->pba_base_va = mmio_user->base_va;
> > > > > > > > > +	}
> > > > > > > > > +out:
> > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > +	return r;
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > > +
> > > > > > > > 
> > > > > > > > In all, looks reasonable.  I'd like to see documentation for
> > > > > > > > it, and review from the pci people.  Alex, mst?
> > > > > > 
> > > > > > Some general comments:
> > > > > > PBA isn't supported in this version, which is OK, but let's not add
> > > > > > a capability until it is, and let's not try to guess what
> > > > > > the interface will look like. I think keeping PBA in userspace will
> > > > > > be hard because it needs to be modified from interrupt context.
> > > > > > Removing the PBA stub will make the interface simpler.
> > > > > 
> > > > > The API only get the PBA address now which should be fine. And we
> > > > > still have threaded irq and tasklet for accessing the userspace for
> > > > > interrupt handler...
> > > > 
> > > > I don't think it's going to work: we are not
> > > > in the context of the right process. Further
> > > > I think we should keep the option of
> > > > reading the PBA status from the device or host kernel open.
> > > > And generally having an interface
> > > > for functionality we don't implement is not a good idea:
> > > > you don't know whether you really can support the interface you
> > > > promised.
> > > 
> > > Well, I don't know if we want to read PBA from device directly. To me
> > > it's not a good idea because the real device has nothing to do with the
> > > one we show to the guest. At least direct accessing the mask bits of
> > > real device would be very dangerous. Avi?
> > > 
> > > --
> > > regards
> > > Yang, Sheng
> > 
> > I am not really suggesting this. What I say is PBA is unimplemented
> > let us not commit to an interface yet.
> > 
> > > > > --
> > > > > regards
> > > > > Yang, Sheng
> > > > > 
> > > > > > > Would add the API document soon.
> > > > > > > 
> > > > > > > --
> > > > > > > regards
> > > > > > > Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30  8:15                   ` Michael S. Tsirkin
@ 2010-12-30  8:24                     ` Sheng Yang
  2010-12-30  8:52                       ` Michael S. Tsirkin
  0 siblings, 1 reply; 32+ messages in thread
From: Sheng Yang @ 2010-12-30  8:24 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Thursday 30 December 2010 16:15:32 Michael S. Tsirkin wrote:
> On Thu, Dec 30, 2010 at 03:55:10PM +0800, Sheng Yang wrote:
> > On Thursday 30 December 2010 15:47:48 Michael S. Tsirkin wrote:
> > > On Thu, Dec 30, 2010 at 03:32:42PM +0800, Sheng Yang wrote:
> > > > On Wednesday 29 December 2010 17:28:24 Michael S. Tsirkin wrote:
> > > > > On Wed, Dec 29, 2010 at 04:55:19PM +0800, Sheng Yang wrote:
> > > > > > On Wednesday 29 December 2010 16:31:35 Michael S. Tsirkin wrote:
> > > > > > > On Wed, Dec 29, 2010 at 03:18:13PM +0800, Sheng Yang wrote:
> > > > > > > > On Tuesday 28 December 2010 20:26:13 Avi Kivity wrote:
> > > > > > > > > On 12/22/2010 10:44 AM, Sheng Yang wrote:
> > > > > > > > > > Then we can support mask bit operation of assigned
> > > > > > > > > > devices now.
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > @@ -3817,14 +3819,16 @@ static int
> > > > > > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > > > > > 
> > > > > > > > > >   mmio:
> > > > > > > > > >   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64
> > > > > > > > > >   	*)val);
> > > > > > > > > > 
> > > > > > > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > > > > > > 
> > > > > > > > > >   	/*
> > > > > > > > > >   	
> > > > > > > > > >   	 * Is this MMIO handled locally?
> > > > > > > > > >   	 */
> > > > > > > > > > 
> > > > > > > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > > > > > > +	if (!r)
> > > > > > > > > > 
> > > > > > > > > >   		return X86EMUL_CONTINUE;
> > > > > > > > > >   	
> > > > > > > > > >   	vcpu->mmio_needed = 1;
> > > > > > > > > > 
> > > > > > > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > > > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > > > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > > > > > > > 
> > > > > > > > > This isn't very pretty, exit_reason should be written in
> > > > > > > > > vcpu_mmio_write().  I guess we can refactor it later.
> > > > > > > > 
> > > > > > > > Sure.
> > > > > > > > 
> > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > > > > > > > > > +
> > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> > > > > > > > > > +
> > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > > > > > > 
> > > > > > > > > Any explanation of these?
> > > > > > > > 
> > > > > > > > I chose to use assigned device id instead of one specific
> > > > > > > > table id, because every device should got at most one MSI
> > > > > > > > MMIO(the same should applied to vfio device as well), and if
> > > > > > > > we use specific table ID, we need way to associate with the
> > > > > > > > device anyway, to perform mask/unmask or other operation. So
> > > > > > > > I think it's better to use device ID here directly.
> > > > > > > 
> > > > > > > Table id will be needed to make things work for emulated
> > > > > > > devices.
> > > > > > 
> > > > > > I suppose even emulated device should got some kind of id(BDF)?
> > > > > 
> > > > > Not that I know. Look at how irqfd is defined for example,
> > > > > or how interrupts are sent through a gsi.
> > > > > I would like to make the interface be able to support that.
> > > > > 
> > > > > > I think that is
> > > > > > enough for identification, which is already there, so we don't
> > > > > > need to allocate another ID for the device - because one device
> > > > > > would got at most one MSI-X MMIO, then use BDF or other device
> > > > > > specific ID should be quite straightforward.
> > > > > 
> > > > > So you propose allocating ids for emulated devices?
> > > > > OK. How will we map e.g. irqfds to these?
> > > > 
> > > > I don't understand. I've checked virtio-pci.c which is using irqfd,
> > > > and it's still a PCI device, and still have BDF, right?
> > > > 
> > > > Also, what we want is a way to identify the MSI-X MMIO. For assigned
> > > > device, we use BDF, then we can easily identify the MMIO as well as
> > > > the device. For others, even they don't have BDF(I don't think so,
> > > > because MSI-X is a part of PCI, and every PCI device has BDF), what
> > > > you need is an ID, no matter what it is and how it defined. QEmu can
> > > > get the allocation done, and the type field in this API can still
> > > > tell which kind of ID/devices they are, then determine how to deal
> > > > with them.
> > > 
> > > Yes, the PCI device can be identified with e.g. BFD
> > > (won't work for multi-domain but then we can write an allocator maybe).
> > > But how will we inject these interrupts?
> > > We can do this now with GSI ioctl or map GSI to irqfd
> > > and inject with irqfd write.
> > 
> > I suppose it's not in the scope of this patch...
> 
> This is why I suggested mapping GSI to msix.
> 
> > But I think you can still do
> > this, everything is the same as before. QEmu can read from table to get
> > data/address pair, then program the routing table, etc.
> 
> Yes, fine, but mask is the problem :)
> When qemu/irqfd injects an interrupt and it's masked,
> guest should not be interrupted.

I think this should be done by other APIs (to map a GSI to an MSI-X entry). And of 
course you can introduce one GSI type which requires one ID and one entry number to 
inject later, but it's not in the scope of this patch. 

--
regards
Yang, Sheng

> 
> > --
> > regards
> > Yang, Sheng
> > 
> > > > > > > My idea was this: we have the device id in
> > > > > > > kvm_assigned_msix_entry already. Just put table id and entry
> > > > > > > number in
> > > > > > > kvm_irq_routing_entry (create a new gsi type for this).
> > > > > > > The result will also work for irqfd because these are mapped to
> > > > > > > gsi.
> > > > > > > 
> > > > > > > > And for the table and pba address, it's due to the mapping in
> > > > > > > > userspace may know the guest MSI-X table address and PBA
> > > > > > > > address at different time(due to different BAR, refer to the
> > > > > > > > code in assigned_dev_iomem_map() of qemu). So I purposed
> > > > > > > > this API to allow each of them can be passed to kernel space
> > > > > > > > individually.
> > > > > > > > 
> > > > > > > > > > +struct kvm_msix_mmio_user {
> > > > > > > > > > +	__u32 dev_id;
> > > > > > > > > > +	__u16 type;
> > > > > > > > > > +	__u16 max_entries_nr;
> > > > > > > > > > +	__u64 base_addr;
> > > > > > > > > > +	__u64 base_va;
> > > > > > > > > > +	__u64 flags;
> > > > > > > > > > +	__u64 reserved[4];
> > > > > > > > > > +};
> > > > > > > > > > +
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm
> > > > > > > > > > *kvm, +				int assigned_dev_id, int entry, u32 
flag)
> > > > > > > > > > +{
> > > > > > > > > 
> > > > > > > > > Need a better name for 'flag' (and make it a bool).
> > > > > > > > > 
> > > > > > > > > > +	int r = -EFAULT;
> > > > > > > > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > > > > > > > +	int i;
> > > > > > > > > > +
> > > > > > > > > > +	if (!irqchip_in_kernel(kvm))
> > > > > > > > > > +		return r;
> > > > > > > > > > +
> > > > > > > > > > +	mutex_lock(&kvm->lock);
> > > > > > > > > > +	adev =
> > > > > > > > > > kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > > > > > > > +				      assigned_dev_id);
> > > > > > > > > > +	if (!adev)
> > > > > > > > > > +		goto out;
> > > > > > > > > > +
> > > > > > > > > > +	for (i = 0; i<  adev->entries_nr; i++)
> > > > > > > > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > > > > > > > +			if (flag)
> > > > > > > > > > +				disable_irq_nosync(
> > > > > > > > > > +					adev->host_msix_entries[i].vector);
> > > > > > > > > > +			else
> > > > > > > > > > +				enable_irq(adev-
>host_msix_entries[i].vector);
> > > > > > > > > > +			r = 0;
> > > > > > > > > > +			break;
> > > > > > > > > > +		}
> > > > > > > > > > +out:
> > > > > > > > > > +	mutex_unlock(&kvm->lock);
> > > > > > > > > > +	return r;
> > > > > > > > > > +}
> > > > > > > > > > 
> > > > > > > > > > @@ -1988,6 +2008,12 @@ static int
> > > > > > > > > > kvm_dev_ioctl_create_vm(void)
> > > > > > > > > > 
> > > > > > > > > >   		return r;
> > > > > > > > > >   	
> > > > > > > > > >   	}
> > > > > > > > > >   
> > > > > > > > > >   #endif
> > > > > > > > > > 
> > > > > > > > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > > > > > > > +	if (r<  0) {
> > > > > > > > > > +		kvm_put_kvm(kvm);
> > > > > > > > > > +		return r;
> > > > > > > > > > +	}
> > > > > > > > > 
> > > > > > > > > Shouldn't this be part of individual KVM_REGISTER_MSIX_MMIO
> > > > > > > > > calls?
> > > > > > > > 
> > > > > > > > In fact this MMIO device is more like global one for the VM,
> > > > > > > > not for every devices. It should handle all MMIO from all
> > > > > > > > MSI-X enabled devices, so I put it in the VM init/destroy
> > > > > > > > process.
> > > > > > > > 
> > > > > > > > > > +static int msix_table_mmio_read(struct kvm_io_device
> > > > > > > > > > *this, gpa_t addr, int len, +				void *val)
> > > > > > > > > > +{
> > > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > > > > +		container_of(this, struct kvm_msix_mmio_dev,
> > > > > > > > > > table_dev); +	struct kvm_msix_mmio *mmio;
> > > > > > > > > > +	int idx, ret = 0, entry, offset, r;
> > > > > > > > > > +
> > > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > > > > +	if (idx<  0) {
> > > > > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > > > > +		goto out;
> > > > > > > > > > +	}
> > > > > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > > > > +		goto out;
> > > > > > > > > 
> > > > > > > > > What about (addr & 4) && (len == 8)? Is it supported? It
> > > > > > > > > may cross entry boundaries.
> > > > > > > > 
> > > > > > > > Should not supported. But I haven't found words on the PCI
> > > > > > > > spec for it. So I didn't add this check.
> > > > > > > 
> > > > > > > IMPLEMENTATION NOTE
> > > > > > > MSI-X Memory Space Structures in Read/Write Memory
> > > > > > > 
> > > > > > > ....
> > > > > > > 
> > > > > > > For all accesses to MSI-X Table and MSI-X PBA fields, software
> > > > > > > must use aligned full
> > > > > > > DWORD or aligned full QWORD transactions; otherwise, the result
> > > > > > > is undefined.
> > > > > > 
> > > > > > Yes, this one is enough, I would add the checking.
> > > > > > 
> > > > > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > > > > +
> > > > > > > > > > +	entry = (addr - mmio->table_base_addr) /
> > > > > > > > > > PCI_MSIX_ENTRY_SIZE; +	offset = addr&  0xf;
> > > > > > > > > > +	r = copy_from_user(val, (void *)(mmio->table_base_va +
> > > > > > > > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > > > > > > > 
> > > > > > > > > > 
> > > > > > > > > > +	if (r)
> > > > > > > > > > +		goto out;
> > > > > > > > > > +out:
> > > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > > +	return ret;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +static int msix_table_mmio_write(struct kvm_io_device
> > > > > > > > > > *this, gpa_t addr, +				int len, const void *val)
> > > > > > > > > > +{
> > > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > > > > +		container_of(this, struct kvm_msix_mmio_dev,
> > > > > > > > > > table_dev); +	struct kvm_msix_mmio *mmio;
> > > > > > > > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > > > > > > > +	gpa_t entry_base;
> > > > > > > > > > +	u32 old_ctrl, new_ctrl;
> > > > > > > > > > +
> > > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > > > > +	if (idx<  0) {
> > > > > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > > > > +		goto out;
> > > > > > > > > > +	}
> > > > > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > > > > +		goto out;
> > > > > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > > > > +	entry = (addr - mmio->table_base_addr) /
> > > > > > > > > > PCI_MSIX_ENTRY_SIZE; +	entry_base = mmio->table_base_va +
> > > > > > > > > > entry * PCI_MSIX_ENTRY_SIZE; +	offset = addr&  0xF;
> > > > > > > > > > +
> > > > > > > > > > +	if (copy_from_user(&old_ctrl,
> > > > > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > > > > +			sizeof old_ctrl))
> > > > > > > > > > +		goto out;
> > > > > > > > > 
> > > > > > > > > get_user() is easier.
> > > > > > > > > 
> > > > > > > > > > +
> > > > > > > > > > +	/* No allow writing to other fields when entry is
> > > > > > > > > > unmasked */ +	if (!(old_ctrl& 
> > > > > > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)&& +	    offset !=
> > > > > > > > > > PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > > > > > > +		goto out;
> > > > > > > > > > +
> > > > > > > > > > +	if (copy_to_user(entry_base + offset, val, len))
> > > > > > > > > > +		goto out;
> > > > > > > > > > 
> > > > > > > > > > +
> > > > > > > > > > +	if (copy_from_user(&new_ctrl,
> > > > > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > > > > +			sizeof new_ctrl))
> > > > > > > > > > +		goto out;
> > > > > > > > > 
> > > > > > > > > put_user()
> > > > > > > > > 
> > > > > > > > > > +
> > > > > > > > > > +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4)
> > > > > > > > > > || +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> > > > > > > > > > +		ret = -ENOTSYNC;
> > > > > > > > > > +	if (old_ctrl == new_ctrl)
> > > > > > > > > > +		goto out;
> > > > > > > > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > > > > +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, 
entry,
> > > > > > > > > > 1); +	else if ((old_ctrl& 
> > > > > > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)&& +			!(new_ctrl& 
> > > > > > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, 
entry,
> > > > > > > > > > 0); +	if (r || ret)
> > > > > > > > > > +		ret = -ENOTSYNC;
> > > > > > > > > > +out:
> > > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > > +	return ret;
> > > > > > > > > > +}
> > > > > > > > > 
> > > > > > > > > blank line...
> > > > > > > > > 
> > > > > > > > > > +static const struct kvm_io_device_ops
> > > > > > > > > > msix_mmio_table_ops = { +	.read     =
> > > > > > > > > > msix_table_mmio_read,
> > > > > > > > > > +	.write    = msix_table_mmio_write,
> > > > > > > > > > +};
> > > > > > > > > > +
> > > > > > > > > > ++
> > > > > > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > > > > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > > > > > > > +{
> > > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev
> > > > > > > > > > =&kvm->msix_mmio_dev; +	struct kvm_msix_mmio *mmio =
> > > > > > > > > > NULL;
> > > > > > > > > > +	int r = 0, i;
> > > > > > > > > > +
> > > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > > +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> > > > > > > > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id&&
> > > > > > > > > > +		    (mmio_dev->mmio[i].type&
> > 
> > KVM_MSIX_MMIO_TYPE_DEV_MASK)
> > 
> > > > ==
> > > > 
> > > > > > > > > > +		    (mmio_user->type&  
KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > > > > > > > +			mmio =&mmio_dev->mmio[i];
> > > > > > > > > > +			if (mmio->max_entries_nr != mmio_user-
> > >
> > >max_entries_nr)
> > >
> > > > {
> > > > 
> > > > > > > > > > +				r = -EINVAL;
> > > > > > > > > > +				goto out;
> > > > > > > > > > +			}
> > > > > > > > > > +			break;
> > > > > > > > > > +		}
> > > > > > > > > > +	}
> > > > > > > > > > +	if (!mmio) {
> > > > > > > > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > > > > > > > +			r = -ENOSPC;
> > > > > > > > > > +			goto out;
> > > > > > > > > > +		}
> > > > > > > > > > +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > > > > > > > +		mmio_dev->mmio_nr++;
> > > > > > > > > > +	}
> > > > > > > > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > > > > > > 
> > > > > > > > > Sanity check to avoid overflow.
> > > > > > > > > 
> > > > > > > > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > > > > > > > +	mmio->flags = mmio_user->flags;
> > > > > > > > > 
> > > > > > > > > Check for unsupported bits (all of them at present?)
> > > > > > > > > 
> > > > > > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > > > > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > > > > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > > > > > > > +
> > > > > > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > > > > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > > > > > > > +		mmio->table_base_va = mmio_user->base_va;
> > > > > > > > > 
> > > > > > > > > Check for va in kernel space.
> > > > > > > > > 
> > > > > > > > > > +	} else if ((mmio_user->type& 
> > > > > > > > > > KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> > > > > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> > > > > > > > > > +		mmio->pba_base_addr = mmio_user->base_addr;
> > > > > > > > > > +		mmio->pba_base_va = mmio_user->base_va;
> > > > > > > > > > +	}
> > > > > > > > > > +out:
> > > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > > +	return r;
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > > +
> > > > > > > > > 
> > > > > > > > > In all, looks reasonable.  I'd like to see documentation
> > > > > > > > > for it, and review from the pci people.  Alex, mst?
> > > > > > > 
> > > > > > > Some general comments:
> > > > > > > PBA isn't supported in this version, which is OK, but let's not
> > > > > > > add a capability until it is, and let's not try to guess what
> > > > > > > the interface will look like. I think keeping PBA in userspace
> > > > > > > will be hard because it needs to be modified from interrupt
> > > > > > > context. Removing the PBA stub will make the interface
> > > > > > > simpler.
> > > > > > 
> > > > > > The API only gets the PBA address now, which should be fine. And we
> > > > > > still have threaded irq and tasklet for accessing the userspace
> > > > > > for interrupt handler...
> > > > > 
> > > > > I don't think it's going to work: we are not
> > > > > in the context of the right process. Further
> > > > > I think we should keep the option of
> > > > > reading the PBA status from the device or host kernel open.
> > > > > And generally having an interface
> > > > > for functionality we don't implement is not a good idea:
> > > > > you don't know whether you really can support the interface you
> > > > > promised.
> > > > 
> > > > Well, I don't know if we want to read PBA from device directly. To me
> > > > it's not a good idea because the real device has nothing to do with
> > > > the one we show to the guest. At least directly accessing the mask
> > > > bits of the real device would be very dangerous. Avi?
> > > > 
> > > > --
> > > > regards
> > > > Yang, Sheng
> > > 
> > > I am not really suggesting this. What I say is PBA is unimplemented
> > > let us not commit to an interface yet.
> > > 
> > > > > > --
> > > > > > regards
> > > > > > Yang, Sheng
> > > > > > 
> > > > > > > > Would add the API document soon.
> > > > > > > > 
> > > > > > > > --
> > > > > > > > regards
> > > > > > > > Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30  8:24                     ` Sheng Yang
@ 2010-12-30  8:52                       ` Michael S. Tsirkin
  2010-12-30  9:13                         ` Sheng Yang
  0 siblings, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2010-12-30  8:52 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Thu, Dec 30, 2010 at 04:24:10PM +0800, Sheng Yang wrote:
> On Thursday 30 December 2010 16:15:32 Michael S. Tsirkin wrote:
> > On Thu, Dec 30, 2010 at 03:55:10PM +0800, Sheng Yang wrote:
> > > On Thursday 30 December 2010 15:47:48 Michael S. Tsirkin wrote:
> > > > On Thu, Dec 30, 2010 at 03:32:42PM +0800, Sheng Yang wrote:
> > > > > On Wednesday 29 December 2010 17:28:24 Michael S. Tsirkin wrote:
> > > > > > On Wed, Dec 29, 2010 at 04:55:19PM +0800, Sheng Yang wrote:
> > > > > > > On Wednesday 29 December 2010 16:31:35 Michael S. Tsirkin wrote:
> > > > > > > > On Wed, Dec 29, 2010 at 03:18:13PM +0800, Sheng Yang wrote:
> > > > > > > > > On Tuesday 28 December 2010 20:26:13 Avi Kivity wrote:
> > > > > > > > > > On 12/22/2010 10:44 AM, Sheng Yang wrote:
> > > > > > > > > > > Then we can support mask bit operation of assigned
> > > > > > > > > > > devices now.
> > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > @@ -3817,14 +3819,16 @@ static int
> > > > > > > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > > > > > > 
> > > > > > > > > > >   mmio:
> > > > > > > > > > >   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64
> > > > > > > > > > >   	*)val);
> > > > > > > > > > > 
> > > > > > > > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > > > > > > > 
> > > > > > > > > > >   	/*
> > > > > > > > > > >   	
> > > > > > > > > > >   	 * Is this MMIO handled locally?
> > > > > > > > > > >   	 */
> > > > > > > > > > > 
> > > > > > > > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > > > > > > > +	if (!r)
> > > > > > > > > > > 
> > > > > > > > > > >   		return X86EMUL_CONTINUE;
> > > > > > > > > > >   	
> > > > > > > > > > >   	vcpu->mmio_needed = 1;
> > > > > > > > > > > 
> > > > > > > > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > > > > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > > > > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > > > > > > > > 
> > > > > > > > > > This isn't very pretty, exit_reason should be written in
> > > > > > > > > > vcpu_mmio_write().  I guess we can refactor it later.
> > > > > > > > > 
> > > > > > > > > Sure.
> > > > > > > > > 
> > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > > > > > > > > > > +
> > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> > > > > > > > > > > +
> > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > > > > > > > 
> > > > > > > > > > Any explanation of these?
> > > > > > > > > 
> > > > > > > > > I chose to use assigned device id instead of one specific
> > > > > > > > > table id, because every device should get at most one MSI
> > > > > > > > > MMIO (the same should be applied to vfio devices as well), and if
> > > > > > > > > we use a specific table ID, we need a way to associate with the
> > > > > > > > > device anyway, to perform mask/unmask or other operation. So
> > > > > > > > > I think it's better to use device ID here directly.
> > > > > > > > 
> > > > > > > > Table id will be needed to make things work for emulated
> > > > > > > > devices.
> > > > > > > 
> > > > > > > I suppose even emulated device should got some kind of id(BDF)?
> > > > > > 
> > > > > > Not that I know. Look at how irqfd is defined for example,
> > > > > > or how interrupts are sent through a gsi.
> > > > > > I would like to make the interface be able to support that.
> > > > > > 
> > > > > > > I think that is
> > > > > > > enough for identification, which is already there, so we don't
> > > > > > > need to allocate another ID for the device - because one device
> > > > > > > would get at most one MSI-X MMIO, so using the BDF or another
> > > > > > > device-specific ID should be quite straightforward.
> > > > > > 
> > > > > > So you propose allocating ids for emulated devices?
> > > > > > OK. How will we map e.g. irqfds to these?
> > > > > 
> > > > > I don't understand. I've checked virtio-pci.c which is using irqfd,
> > > > > and it's still a PCI device, and still has a BDF, right?
> > > > > 
> > > > > Also, what we want is a way to identify the MSI-X MMIO. For assigned
> > > > > device, we use BDF, then we can easily identify the MMIO as well as
> > > > > the device. For others, even they don't have BDF(I don't think so,
> > > > > because MSI-X is a part of PCI, and every PCI device has BDF), what
> > > > > you need is an ID, no matter what it is and how it defined. QEmu can
> > > > > get the allocation done, and the type field in this API can still
> > > > > tell which kind of ID/devices they are, then determine how to deal
> > > > > with them.
> > > > 
> > > > Yes, the PCI device can be identified with e.g. BFD
> > > > (won't work for multi-domain but then we can write an allocator maybe).
> > > > But how will we inject these interrupts?
> > > > We can do this now with GSI ioctl or map GSI to irqfd
> > > > and inject with irqfd write.
> > > 
> > > I suppose it's not in the scope of this patch...
> > 
> > This is why I suggested mapping GSI to msix.
> > 
> > > But I think you can still do
> > > this, everything is the same as before. QEmu can read from table to get
> > > data/address pair, then program the routing table, etc.
> > 
> > Yes, fine, but mask is the problem :)
> > When qemu/irqfd injects an interrupt and it's masked,
> > guest should not be interrupted.
> 
> I think this should be done by other APIs (to map a GSI to an MSI-X entry). And of 
> course you can introduce one GSI type which requires one ID and one entry number to 
> inject later, but it's not in the scope of this patch. 

What you propose is adding an API to map GSI to a table ID + entry
number?  Like this?

1. map table ID to address+length
2. map GSI to table ID + entry #

But if we have such an API we don't need anything else
as there's already an API to map assigned device interrupts
to GSIs?

> --
> regards
> Yang, Sheng
> 
> > 
> > > --
> > > regards
> > > Yang, Sheng
> > > 
> > > > > > > > My idea was this: we have the device id in
> > > > > > > > kvm_assigned_msix_entry already. Just put table id and entry
> > > > > > > > number in
> > > > > > > > kvm_irq_routing_entry (create a new gsi type for this).
> > > > > > > > The result will also work for irqfd because these are mapped to
> > > > > > > > gsi.
> > > > > > > > 
> > > > > > > > > And for the table and pba address, it's due to the mapping in
> > > > > > > > > userspace may know the guest MSI-X table address and PBA
> > > > > > > > > address at different time(due to different BAR, refer to the
> > > > > > > > > code in assigned_dev_iomem_map() of qemu). So I purposed
> > > > > > > > > this API to allow each of them can be passed to kernel space
> > > > > > > > > individually.
> > > > > > > > > 
> > > > > > > > > > > +struct kvm_msix_mmio_user {
> > > > > > > > > > > +	__u32 dev_id;
> > > > > > > > > > > +	__u16 type;
> > > > > > > > > > > +	__u16 max_entries_nr;
> > > > > > > > > > > +	__u64 base_addr;
> > > > > > > > > > > +	__u64 base_va;
> > > > > > > > > > > +	__u64 flags;
> > > > > > > > > > > +	__u64 reserved[4];
> > > > > > > > > > > +};
> > > > > > > > > > > +
> > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm
> > > > > > > > > > > *kvm, +				int assigned_dev_id, int entry, u32 
> flag)
> > > > > > > > > > > +{
> > > > > > > > > > 
> > > > > > > > > > Need a better name for 'flag' (and make it a bool).
> > > > > > > > > > 
> > > > > > > > > > > +	int r = -EFAULT;
> > > > > > > > > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > > > > > > > > +	int i;
> > > > > > > > > > > +
> > > > > > > > > > > +	if (!irqchip_in_kernel(kvm))
> > > > > > > > > > > +		return r;
> > > > > > > > > > > +
> > > > > > > > > > > +	mutex_lock(&kvm->lock);
> > > > > > > > > > > +	adev =
> > > > > > > > > > > kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > > > > > > > > +				      assigned_dev_id);
> > > > > > > > > > > +	if (!adev)
> > > > > > > > > > > +		goto out;
> > > > > > > > > > > +
> > > > > > > > > > > +	for (i = 0; i<  adev->entries_nr; i++)
> > > > > > > > > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > > > > > > > > +			if (flag)
> > > > > > > > > > > +				disable_irq_nosync(
> > > > > > > > > > > +					adev->host_msix_entries[i].vector);
> > > > > > > > > > > +			else
> > > > > > > > > > > +				enable_irq(adev-
> >host_msix_entries[i].vector);
> > > > > > > > > > > +			r = 0;
> > > > > > > > > > > +			break;
> > > > > > > > > > > +		}
> > > > > > > > > > > +out:
> > > > > > > > > > > +	mutex_unlock(&kvm->lock);
> > > > > > > > > > > +	return r;
> > > > > > > > > > > +}
> > > > > > > > > > > 
> > > > > > > > > > > @@ -1988,6 +2008,12 @@ static int
> > > > > > > > > > > kvm_dev_ioctl_create_vm(void)
> > > > > > > > > > > 
> > > > > > > > > > >   		return r;
> > > > > > > > > > >   	
> > > > > > > > > > >   	}
> > > > > > > > > > >   
> > > > > > > > > > >   #endif
> > > > > > > > > > > 
> > > > > > > > > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > > > > > > > > +	if (r<  0) {
> > > > > > > > > > > +		kvm_put_kvm(kvm);
> > > > > > > > > > > +		return r;
> > > > > > > > > > > +	}
> > > > > > > > > > 
> > > > > > > > > > Shouldn't this be part of individual KVM_REGISTER_MSIX_MMIO
> > > > > > > > > > calls?
> > > > > > > > > 
> > > > > > > > > In fact this MMIO device is more like global one for the VM,
> > > > > > > > > not for every devices. It should handle all MMIO from all
> > > > > > > > > MSI-X enabled devices, so I put it in the VM init/destroy
> > > > > > > > > process.
> > > > > > > > > 
> > > > > > > > > > > +static int msix_table_mmio_read(struct kvm_io_device
> > > > > > > > > > > *this, gpa_t addr, int len, +				void *val)
> > > > > > > > > > > +{
> > > > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > > > > > +		container_of(this, struct kvm_msix_mmio_dev,
> > > > > > > > > > > table_dev); +	struct kvm_msix_mmio *mmio;
> > > > > > > > > > > +	int idx, ret = 0, entry, offset, r;
> > > > > > > > > > > +
> > > > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > > > > > +	if (idx<  0) {
> > > > > > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > > > > > +		goto out;
> > > > > > > > > > > +	}
> > > > > > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > > > > > +		goto out;
> > > > > > > > > > 
> > > > > > > > > > What about (addr & 4) && (len == 8)? Is it supported? It
> > > > > > > > > > may cross entry boundaries.
> > > > > > > > > 
> > > > > > > > > Should not supported. But I haven't found words on the PCI
> > > > > > > > > spec for it. So I didn't add this check.
> > > > > > > > 
> > > > > > > > IMPLEMENTATION NOTE
> > > > > > > > MSI-X Memory Space Structures in Read/Write Memory
> > > > > > > > 
> > > > > > > > ....
> > > > > > > > 
> > > > > > > > For all accesses to MSI-X Table and MSI-X PBA fields, software
> > > > > > > > must use aligned full
> > > > > > > > DWORD or aligned full QWORD transactions; otherwise, the result
> > > > > > > > is undefined.
> > > > > > > 
> > > > > > > Yes, this one is enough, I would add the checking.
> > > > > > > 
> > > > > > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > > > > > +
> > > > > > > > > > > +	entry = (addr - mmio->table_base_addr) /
> > > > > > > > > > > PCI_MSIX_ENTRY_SIZE; +	offset = addr&  0xf;
> > > > > > > > > > > +	r = copy_from_user(val, (void *)(mmio->table_base_va +
> > > > > > > > > > > +			entry * PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > +	if (r)
> > > > > > > > > > > +		goto out;
> > > > > > > > > > > +out:
> > > > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > > > +	return ret;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +static int msix_table_mmio_write(struct kvm_io_device
> > > > > > > > > > > *this, gpa_t addr, +				int len, const void *val)
> > > > > > > > > > > +{
> > > > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > > > > > +		container_of(this, struct kvm_msix_mmio_dev,
> > > > > > > > > > > table_dev); +	struct kvm_msix_mmio *mmio;
> > > > > > > > > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > > > > > > > > +	gpa_t entry_base;
> > > > > > > > > > > +	u32 old_ctrl, new_ctrl;
> > > > > > > > > > > +
> > > > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > > > > > +	if (idx<  0) {
> > > > > > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > > > > > +		goto out;
> > > > > > > > > > > +	}
> > > > > > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > > > > > +		goto out;
> > > > > > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > > > > > +	entry = (addr - mmio->table_base_addr) /
> > > > > > > > > > > PCI_MSIX_ENTRY_SIZE; +	entry_base = mmio->table_base_va +
> > > > > > > > > > > entry * PCI_MSIX_ENTRY_SIZE; +	offset = addr&  0xF;
> > > > > > > > > > > +
> > > > > > > > > > > +	if (copy_from_user(&old_ctrl,
> > > > > > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > > > > > +			sizeof old_ctrl))
> > > > > > > > > > > +		goto out;
> > > > > > > > > > 
> > > > > > > > > > get_user() is easier.
> > > > > > > > > > 
> > > > > > > > > > > +
> > > > > > > > > > > +	/* No allow writing to other fields when entry is
> > > > > > > > > > > unmasked */ +	if (!(old_ctrl& 
> > > > > > > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)&& +	    offset !=
> > > > > > > > > > > PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > > > > > > > +		goto out;
> > > > > > > > > > > +
> > > > > > > > > > > +	if (copy_to_user(entry_base + offset, val, len))
> > > > > > > > > > > +		goto out;
> > > > > > > > > > > 
> > > > > > > > > > > +
> > > > > > > > > > > +	if (copy_from_user(&new_ctrl,
> > > > > > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > > > > > +			sizeof new_ctrl))
> > > > > > > > > > > +		goto out;
> > > > > > > > > > 
> > > > > > > > > > put_user()
> > > > > > > > > > 
> > > > > > > > > > > +
> > > > > > > > > > > +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len == 4)
> > > > > > > > > > > || +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> > > > > > > > > > > +		ret = -ENOTSYNC;
> > > > > > > > > > > +	if (old_ctrl == new_ctrl)
> > > > > > > > > > > +		goto out;
> > > > > > > > > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > > > > > +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, 
> entry,
> > > > > > > > > > > 1); +	else if ((old_ctrl& 
> > > > > > > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)&& +			!(new_ctrl& 
> > > > > > > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio, 
> entry,
> > > > > > > > > > > 0); +	if (r || ret)
> > > > > > > > > > > +		ret = -ENOTSYNC;
> > > > > > > > > > > +out:
> > > > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > > > +	return ret;
> > > > > > > > > > > +}
> > > > > > > > > > 
> > > > > > > > > > blank line...
> > > > > > > > > > 
> > > > > > > > > > > +static const struct kvm_io_device_ops
> > > > > > > > > > > msix_mmio_table_ops = { +	.read     =
> > > > > > > > > > > msix_table_mmio_read,
> > > > > > > > > > > +	.write    = msix_table_mmio_write,
> > > > > > > > > > > +};
> > > > > > > > > > > +
> > > > > > > > > > > ++
> > > > > > > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > > > > > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > > > > > > > > +{
> > > > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev
> > > > > > > > > > > =&kvm->msix_mmio_dev; +	struct kvm_msix_mmio *mmio =
> > > > > > > > > > > NULL;
> > > > > > > > > > > +	int r = 0, i;
> > > > > > > > > > > +
> > > > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > > > +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> > > > > > > > > > > +		if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id&&
> > > > > > > > > > > +		    (mmio_dev->mmio[i].type&
> > > 
> > > KVM_MSIX_MMIO_TYPE_DEV_MASK)
> > > 
> > > > > ==
> > > > > 
> > > > > > > > > > > +		    (mmio_user->type&  
> KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > > > > > > > > > > +			mmio =&mmio_dev->mmio[i];
> > > > > > > > > > > +			if (mmio->max_entries_nr != mmio_user-
> > > >
> > > >max_entries_nr)
> > > >
> > > > > {
> > > > > 
> > > > > > > > > > > +				r = -EINVAL;
> > > > > > > > > > > +				goto out;
> > > > > > > > > > > +			}
> > > > > > > > > > > +			break;
> > > > > > > > > > > +		}
> > > > > > > > > > > +	}
> > > > > > > > > > > +	if (!mmio) {
> > > > > > > > > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > > > > > > > > +			r = -ENOSPC;
> > > > > > > > > > > +			goto out;
> > > > > > > > > > > +		}
> > > > > > > > > > > +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > > > > > > > > +		mmio_dev->mmio_nr++;
> > > > > > > > > > > +	}
> > > > > > > > > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > > > > > > > 
> > > > > > > > > > Sanity check to avoid overflow.
> > > > > > > > > > 
> > > > > > > > > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > > > > > > > > +	mmio->flags = mmio_user->flags;
> > > > > > > > > > 
> > > > > > > > > > Check for unsupported bits (all of them at present?)
> > > > > > > > > > 
> > > > > > > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK) ==
> > > > > > > > > > > +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > > > > > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > > > > > > > > +
> > > > > > > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > > > > > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > > > > > > > > +		mmio->table_base_va = mmio_user->base_va;
> > > > > > > > > > 
> > > > > > > > > > Check for va in kernel space.
> > > > > > > > > > 
> > > > > > > > > > > +	} else if ((mmio_user->type& 
> > > > > > > > > > > KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> > > > > > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> > > > > > > > > > > +		mmio->pba_base_addr = mmio_user->base_addr;
> > > > > > > > > > > +		mmio->pba_base_va = mmio_user->base_va;
> > > > > > > > > > > +	}
> > > > > > > > > > > +out:
> > > > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > > > +	return r;
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > > +
> > > > > > > > > > 
> > > > > > > > > > In all, looks reasonable.  I'd like to see documentation
> > > > > > > > > > for it, and review from the pci people.  Alex, mst?
> > > > > > > > 
> > > > > > > > Some general comments:
> > > > > > > > PBA isn't supported in this version, which is OK, but let's not
> > > > > > > > add a capability until it is, and let's not try to guess what
> > > > > > > > the interface will look like. I think keeping PBA in userspace
> > > > > > > > will be hard because it needs to be modified from interrupt
> > > > > > > > context. Removing the PBA stub will make the interface
> > > > > > > > simpler.
> > > > > > > 
> > > > > > > The API only gets the PBA address now, which should be fine. And we
> > > > > > > still have threaded irq and tasklet for accessing the userspace
> > > > > > > for interrupt handler...
> > > > > > 
> > > > > > I don't think it's going to work: we are not
> > > > > > in the context of the right process. Further
> > > > > > I think we should keep the option of
> > > > > > reading the PBA status from the device or host kernel open.
> > > > > > And generally having an interface
> > > > > > for functionality we don't implement is not a good idea:
> > > > > > you don't know whether you really can support the interface you
> > > > > > promised.
> > > > > 
> > > > > Well, I don't know if we want to read PBA from device directly. To me
> > > > > it's not a good idea because the real device has nothing to do with
> > > > > the one we show to the guest. At least directly accessing the mask
> > > > > bits of the real device would be very dangerous. Avi?
> > > > > 
> > > > > --
> > > > > regards
> > > > > Yang, Sheng
> > > > 
> > > > I am not really suggesting this. What I say is PBA is unimplemented
> > > > let us not commit to an interface yet.
> > > > 
> > > > > > > --
> > > > > > > regards
> > > > > > > Yang, Sheng
> > > > > > > 
> > > > > > > > > Would add the API document soon.
> > > > > > > > > 
> > > > > > > > > --
> > > > > > > > > regards
> > > > > > > > > Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30  8:52                       ` Michael S. Tsirkin
@ 2010-12-30  9:13                         ` Sheng Yang
  0 siblings, 0 replies; 32+ messages in thread
From: Sheng Yang @ 2010-12-30  9:13 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Thursday 30 December 2010 16:52:58 Michael S. Tsirkin wrote:
> On Thu, Dec 30, 2010 at 04:24:10PM +0800, Sheng Yang wrote:
> > On Thursday 30 December 2010 16:15:32 Michael S. Tsirkin wrote:
> > > On Thu, Dec 30, 2010 at 03:55:10PM +0800, Sheng Yang wrote:
> > > > On Thursday 30 December 2010 15:47:48 Michael S. Tsirkin wrote:
> > > > > On Thu, Dec 30, 2010 at 03:32:42PM +0800, Sheng Yang wrote:
> > > > > > On Wednesday 29 December 2010 17:28:24 Michael S. Tsirkin wrote:
> > > > > > > On Wed, Dec 29, 2010 at 04:55:19PM +0800, Sheng Yang wrote:
> > > > > > > > On Wednesday 29 December 2010 16:31:35 Michael S. Tsirkin wrote:
> > > > > > > > > On Wed, Dec 29, 2010 at 03:18:13PM +0800, Sheng Yang wrote:
> > > > > > > > > > On Tuesday 28 December 2010 20:26:13 Avi Kivity wrote:
> > > > > > > > > > > On 12/22/2010 10:44 AM, Sheng Yang wrote:
> > > > > > > > > > > > Then we can support mask bit operation of assigned
> > > > > > > > > > > > devices now.
> > > > > > > > > > > > 
> > > > > > > > > > > > 
> > > > > > > > > > > > @@ -3817,14 +3819,16 @@ static int
> > > > > > > > > > > > emulator_write_emulated_onepage(unsigned long addr,
> > > > > > > > > > > > 
> > > > > > > > > > > >   mmio:
> > > > > > > > > > > >   	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa,
> > > > > > > > > > > >   	*(u64 *)val);
> > > > > > > > > > > > 
> > > > > > > > > > > > +	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
> > > > > > > > > > > > 
> > > > > > > > > > > >   	/*
> > > > > > > > > > > >   	
> > > > > > > > > > > >   	 * Is this MMIO handled locally?
> > > > > > > > > > > >   	 */
> > > > > > > > > > > > 
> > > > > > > > > > > > -	if (!vcpu_mmio_write(vcpu, gpa, bytes, val))
> > > > > > > > > > > > +	if (!r)
> > > > > > > > > > > > 
> > > > > > > > > > > >   		return X86EMUL_CONTINUE;
> > > > > > > > > > > >   	
> > > > > > > > > > > >   	vcpu->mmio_needed = 1;
> > > > > > > > > > > > 
> > > > > > > > > > > > -	vcpu->run->exit_reason = KVM_EXIT_MMIO;
> > > > > > > > > > > > +	vcpu->run->exit_reason = (r == -ENOTSYNC) ?
> > > > > > > > > > > > +		KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;
> > > > > > > > > > > 
> > > > > > > > > > > This isn't very pretty, exit_reason should be written
> > > > > > > > > > > in vcpu_mmio_write().  I guess we can refactor it
> > > > > > > > > > > later.
> > > > > > > > > > 
> > > > > > > > > > Sure.
> > > > > > > > > > 
> > > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV	    (1<<  0)
> > > > > > > > > > > > +
> > > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE	    (1<<  8)
> > > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_PBA	    (1<<  9)
> > > > > > > > > > > > +
> > > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK	    0x00ff
> > > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK	    0xff00
> > > > > > > > > > > 
> > > > > > > > > > > Any explanation of these?
> > > > > > > > > > 
> > > > > > > > > > I chose to use assigned device id instead of one specific
> > > > > > > > > > table id, because every device should got at most one MSI
> > > > > > > > > > MMIO(the same should applied to vfio device as well), and
> > > > > > > > > > if we use specific table ID, we need way to associate
> > > > > > > > > > with the device anyway, to perform mask/unmask or other
> > > > > > > > > > operation. So I think it's better to use device ID here
> > > > > > > > > > directly.
> > > > > > > > > 
> > > > > > > > > Table id will be needed to make things work for emulated
> > > > > > > > > devices.
> > > > > > > > 
> > > > > > > > I suppose even emulated device should got some kind of
> > > > > > > > id(BDF)?
> > > > > > > 
> > > > > > > Not that I know. Look at how irqfd is defined for example,
> > > > > > > or how interrupts are sent through a gsi.
> > > > > > > I would like to make the interface be able to support that.
> > > > > > > 
> > > > > > > > I think that is
> > > > > > > > enough for identification, which is already there, so we
> > > > > > > > don't need to allocate another ID for the device - because
> > > > > > > > one device would got at most one MSI-X MMIO, then use BDF or
> > > > > > > > other device specific ID should be quite straightforward.
> > > > > > > 
> > > > > > > So you propose allocating ids for emulated devices?
> > > > > > > OK. How will we map e.g. irqfds to these?
> > > > > > 
> > > > > > I don't understand. I've checked virtio-pci.c which is using
> > > > > > irqfd, and it's still a PCI device, and still have BDF, right?
> > > > > > 
> > > > > > Also, what we want is a way to identify the MSI-X MMIO. For
> > > > > > assigned device, we use BDF, then we can easily identify the
> > > > > > MMIO as well as the device. For others, even they don't have
> > > > > > BDF(I don't think so, because MSI-X is a part of PCI, and every
> > > > > > PCI device has BDF), what you need is an ID, no matter what it
> > > > > > is and how it defined. QEmu can get the allocation done, and the
> > > > > > type field in this API can still tell which kind of ID/devices
> > > > > > they are, then determine how to deal with them.
> > > > > 
> > > > > Yes, the PCI device can be identified with e.g. BFD
> > > > > (won't work for multi-domain but then we can write an allocator
> > > > > maybe). But how will we inject these interrupts?
> > > > > We can do this now with GSI ioctl or map GSI to irqfd
> > > > > and inject with irqfd write.
> > > > 
> > > > I suppose it's not in the scope of this patch...
> > > 
> > > This is why I suggested mapping GSI to msix.
> > > 
> > > > But I think you can still do
> > > > this, everything is the same as before. QEmu can read from table to
> > > > get data/address pair, then program the routing table, etc.
> > > 
> > > Yes, fine, but mask is the problem :)
> > > When qemu/irqfd injects an interrupt and it's masked,
> > > guest should not be interrupted.
> > 
> > I think this should be done by other APIs(to map GSI with MSI-X entry).
> > And of course you can introduce one GSI type which require one ID and
> > one entry number to eject later, but it's not in the scope of this
> > patch.
> 
> What you propose is adding an API to map GSI to a table ID + entry
> number?  Like this?
> 
> 1. map table ID to address+length
> 2. map GSI to table ID + entry #
> 
> But if we have such an API we don't need anything else
> as there's already an API to map assigned device interrupts
> to GSIs?

The assigned device's API is still there, and it is not touched by this patch. You 
can of course modify it for other devices.

For assigned devices, this API is enough. It can also support other devices by 
emulating their MSI-X MMIO. I think that's enough for this patch.

--
regards
Yang, Sheng

> 
> > --
> > regards
> > Yang, Sheng
> > 
> > > > --
> > > > regards
> > > > Yang, Sheng
> > > > 
> > > > > > > > > My idea was this: we have the device id in
> > > > > > > > > kvm_assigned_msix_entry already. Just put table id and
> > > > > > > > > entry number in
> > > > > > > > > kvm_irq_routing_entry (create a new gsi type for this).
> > > > > > > > > The result will also work for irqfd because these are
> > > > > > > > > mapped to gsi.
> > > > > > > > > 
> > > > > > > > > > And for the table and pba address, it's due to the
> > > > > > > > > > mapping in userspace may know the guest MSI-X table
> > > > > > > > > > address and PBA address at different time(due to
> > > > > > > > > > different BAR, refer to the code in
> > > > > > > > > > assigned_dev_iomem_map() of qemu). So I purposed this
> > > > > > > > > > API to allow each of them can be passed to kernel space
> > > > > > > > > > individually.
> > > > > > > > > > 
> > > > > > > > > > > > +struct kvm_msix_mmio_user {
> > > > > > > > > > > > +	__u32 dev_id;
> > > > > > > > > > > > +	__u16 type;
> > > > > > > > > > > > +	__u16 max_entries_nr;
> > > > > > > > > > > > +	__u64 base_addr;
> > > > > > > > > > > > +	__u64 base_va;
> > > > > > > > > > > > +	__u64 flags;
> > > > > > > > > > > > +	__u64 reserved[4];
> > > > > > > > > > > > +};
> > > > > > > > > > > > +
> > > > > > > > > > > > 
> > > > > > > > > > > > 
> > > > > > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct
> > > > > > > > > > > > kvm *kvm, +				int assigned_dev_id, int 
entry, u32
> > 
> > flag)
> > 
> > > > > > > > > > > > +{
> > > > > > > > > > > 
> > > > > > > > > > > Need a better name for 'flag' (and make it a bool).
> > > > > > > > > > > 
> > > > > > > > > > > > +	int r = -EFAULT;
> > > > > > > > > > > > +	struct kvm_assigned_dev_kernel *adev;
> > > > > > > > > > > > +	int i;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	if (!irqchip_in_kernel(kvm))
> > > > > > > > > > > > +		return r;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	mutex_lock(&kvm->lock);
> > > > > > > > > > > > +	adev =
> > > > > > > > > > > > kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
> > > > > > > > > > > > +				      assigned_dev_id);
> > > > > > > > > > > > +	if (!adev)
> > > > > > > > > > > > +		goto out;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	for (i = 0; i<  adev->entries_nr; i++)
> > > > > > > > > > > > +		if (adev->host_msix_entries[i].entry == entry) {
> > > > > > > > > > > > +			if (flag)
> > > > > > > > > > > > +				disable_irq_nosync(
> > > > > > > > > > > > +					adev->host_msix_entries[i].vector);
> > > > > > > > > > > > +			else
> > > > > > > > > > > > +				enable_irq(adev-
> > >
> > >host_msix_entries[i].vector);
> > >
> > > > > > > > > > > > +			r = 0;
> > > > > > > > > > > > +			break;
> > > > > > > > > > > > +		}
> > > > > > > > > > > > +out:
> > > > > > > > > > > > +	mutex_unlock(&kvm->lock);
> > > > > > > > > > > > +	return r;
> > > > > > > > > > > > +}
> > > > > > > > > > > > 
> > > > > > > > > > > > @@ -1988,6 +2008,12 @@ static int
> > > > > > > > > > > > kvm_dev_ioctl_create_vm(void)
> > > > > > > > > > > > 
> > > > > > > > > > > >   		return r;
> > > > > > > > > > > >   	
> > > > > > > > > > > >   	}
> > > > > > > > > > > >   
> > > > > > > > > > > >   #endif
> > > > > > > > > > > > 
> > > > > > > > > > > > +	r = kvm_register_msix_mmio_dev(kvm);
> > > > > > > > > > > > +	if (r<  0) {
> > > > > > > > > > > > +		kvm_put_kvm(kvm);
> > > > > > > > > > > > +		return r;
> > > > > > > > > > > > +	}
> > > > > > > > > > > 
> > > > > > > > > > > Shouldn't this be part of individual
> > > > > > > > > > > KVM_REGISTER_MSIX_MMIO calls?
> > > > > > > > > > 
> > > > > > > > > > In fact this MMIO device is more like global one for the
> > > > > > > > > > VM, not for every devices. It should handle all MMIO
> > > > > > > > > > from all MSI-X enabled devices, so I put it in the VM
> > > > > > > > > > init/destroy process.
> > > > > > > > > > 
> > > > > > > > > > > > +static int msix_table_mmio_read(struct kvm_io_device
> > > > > > > > > > > > *this, gpa_t addr, int len, +				void 
*val)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > > > > > > +		container_of(this, struct kvm_msix_mmio_dev,
> > > > > > > > > > > > table_dev); +	struct kvm_msix_mmio *mmio;
> > > > > > > > > > > > +	int idx, ret = 0, entry, offset, r;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > > > > > > +	if (idx<  0) {
> > > > > > > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > > > > > > +		goto out;
> > > > > > > > > > > > +	}
> > > > > > > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > > > > > > +		goto out;
> > > > > > > > > > > 
> > > > > > > > > > > What about (addr & 4) && (len == 8)? Is it supported?
> > > > > > > > > > > It may cross entry boundaries.
> > > > > > > > > > 
> > > > > > > > > > Should not supported. But I haven't found words on the
> > > > > > > > > > PCI spec for it. So I didn't add this check.
> > > > > > > > > 
> > > > > > > > > IMPLEMENTATION NOTE
> > > > > > > > > MSI-X Memory Space Structures in Read/Write Memory
> > > > > > > > > 
> > > > > > > > > ....
> > > > > > > > > 
> > > > > > > > > For all accesses to MSI-X Table and MSI-X PBA fields,
> > > > > > > > > software must use aligned full
> > > > > > > > > DWORD or aligned full QWORD transactions; otherwise, the
> > > > > > > > > result is undefined.
> > > > > > > > 
> > > > > > > > Yes, this one is enough, I would add the checking.
> > > > > > > > 
> > > > > > > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > > > > > > +
> > > > > > > > > > > > +	entry = (addr - mmio->table_base_addr) /
> > > > > > > > > > > > PCI_MSIX_ENTRY_SIZE; +	offset = addr&  0xf;
> > > > > > > > > > > > +	r = copy_from_user(val, (void
> > > > > > > > > > > > *)(mmio->table_base_va + +			entry *
> > > > > > > > > > > > PCI_MSIX_ENTRY_SIZE + offset), len);
> > > > > > > > > > > > 
> > > > > > > > > > > > 
> > > > > > > > > > > > +	if (r)
> > > > > > > > > > > > +		goto out;
> > > > > > > > > > > > +out:
> > > > > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > > > > +	return ret;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +static int msix_table_mmio_write(struct
> > > > > > > > > > > > kvm_io_device *this, gpa_t addr, +				int 
len, const
> > > > > > > > > > > > void *val) +{
> > > > > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev =
> > > > > > > > > > > > +		container_of(this, struct kvm_msix_mmio_dev,
> > > > > > > > > > > > table_dev); +	struct kvm_msix_mmio *mmio;
> > > > > > > > > > > > +	int idx, entry, offset, ret = 0, r = 0;
> > > > > > > > > > > > +	gpa_t entry_base;
> > > > > > > > > > > > +	u32 old_ctrl, new_ctrl;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > > > > +	idx = get_mmio_table_index(mmio_dev, addr, len);
> > > > > > > > > > > > +	if (idx<  0) {
> > > > > > > > > > > > +		ret = -EOPNOTSUPP;
> > > > > > > > > > > > +		goto out;
> > > > > > > > > > > > +	}
> > > > > > > > > > > > +	if ((addr&  0x3) || (len != 4&&  len != 8))
> > > > > > > > > > > > +		goto out;
> > > > > > > > > > > > +	mmio =&mmio_dev->mmio[idx];
> > > > > > > > > > > > +	entry = (addr - mmio->table_base_addr) /
> > > > > > > > > > > > PCI_MSIX_ENTRY_SIZE; +	entry_base =
> > > > > > > > > > > > mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE;
> > > > > > > > > > > > +	offset = addr&  0xF; +
> > > > > > > > > > > > +	if (copy_from_user(&old_ctrl,
> > > > > > > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > > > > > > +			sizeof old_ctrl))
> > > > > > > > > > > > +		goto out;
> > > > > > > > > > > 
> > > > > > > > > > > get_user() is easier.
> > > > > > > > > > > 
> > > > > > > > > > > > +
> > > > > > > > > > > > +	/* No allow writing to other fields when entry is
> > > > > > > > > > > > unmasked */ +	if (!(old_ctrl&
> > > > > > > > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)&& +	    offset !=
> > > > > > > > > > > > PCI_MSIX_ENTRY_VECTOR_CTRL)
> > > > > > > > > > > > +		goto out;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	if (copy_to_user(entry_base + offset, val, len))
> > > > > > > > > > > > +		goto out;
> > > > > > > > > > > > 
> > > > > > > > > > > > +
> > > > > > > > > > > > +	if (copy_from_user(&new_ctrl,
> > > > > > > > > > > > +			entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL,
> > > > > > > > > > > > +			sizeof new_ctrl))
> > > > > > > > > > > > +		goto out;
> > > > > > > > > > > 
> > > > > > > > > > > put_user()
> > > > > > > > > > > 
> > > > > > > > > > > > +
> > > > > > > > > > > > +	if ((offset<  PCI_MSIX_ENTRY_VECTOR_CTRL&&  len ==
> > > > > > > > > > > > 4)
> > > > > > > > > > > > 
> > > > > > > > > > > > || +	    (offset<  PCI_MSIX_ENTRY_DATA&&  len == 8))
> > > > > > > > > > > > 
> > > > > > > > > > > > +		ret = -ENOTSYNC;
> > > > > > > > > > > > +	if (old_ctrl == new_ctrl)
> > > > > > > > > > > > +		goto out;
> > > > > > > > > > > > +	if (!(old_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT)&&
> > > > > > > > > > > > +			(new_ctrl&  PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio,
> > 
> > entry,
> > 
> > > > > > > > > > > > 1); +	else if ((old_ctrl&
> > > > > > > > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)&& +			!(new_ctrl&
> > > > > > > > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT))
> > > > > > > > > > > > +		r = update_msix_mask_bit(mmio_dev->kvm, mmio,
> > 
> > entry,
> > 
> > > > > > > > > > > > 0); +	if (r || ret)
> > > > > > > > > > > > +		ret = -ENOTSYNC;
> > > > > > > > > > > > +out:
> > > > > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > > > > +	return ret;
> > > > > > > > > > > > +}
> > > > > > > > > > > 
> > > > > > > > > > > blank line...
> > > > > > > > > > > 
> > > > > > > > > > > > +static const struct kvm_io_device_ops
> > > > > > > > > > > > msix_mmio_table_ops = { +	.read     =
> > > > > > > > > > > > msix_table_mmio_read,
> > > > > > > > > > > > +	.write    = msix_table_mmio_write,
> > > > > > > > > > > > +};
> > > > > > > > > > > > +
> > > > > > > > > > > > ++
> > > > > > > > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm,
> > > > > > > > > > > > +				    struct kvm_msix_mmio_user *mmio_user)
> > > > > > > > > > > > +{
> > > > > > > > > > > > +	struct kvm_msix_mmio_dev *mmio_dev
> > > > > > > > > > > > =&kvm->msix_mmio_dev; +	struct kvm_msix_mmio *mmio =
> > > > > > > > > > > > NULL;
> > > > > > > > > > > > +	int r = 0, i;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	mutex_lock(&mmio_dev->lock);
> > > > > > > > > > > > +	for (i = 0; i<  mmio_dev->mmio_nr; i++) {
> > > > > > > > > > > > +		if (mmio_dev->mmio[i].dev_id ==
> > > > > > > > > > > > mmio_user->dev_id&& +		    (mmio_dev->mmio[i].type&
> > > > 
> > > > KVM_MSIX_MMIO_TYPE_DEV_MASK)
> > > > 
> > > > > > ==
> > > > > > 
> > > > > > > > > > > > +		    (mmio_user->type&
> > 
> > KVM_MSIX_MMIO_TYPE_DEV_MASK)) {
> > 
> > > > > > > > > > > > +			mmio =&mmio_dev->mmio[i];
> > > > > > > > > > > > +			if (mmio->max_entries_nr != mmio_user-
> > > > >
> > > > >max_entries_nr)
> > > > >
> > > > > > {
> > > > > > 
> > > > > > > > > > > > +				r = -EINVAL;
> > > > > > > > > > > > +				goto out;
> > > > > > > > > > > > +			}
> > > > > > > > > > > > +			break;
> > > > > > > > > > > > +		}
> > > > > > > > > > > > +	}
> > > > > > > > > > > > +	if (!mmio) {
> > > > > > > > > > > > +		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
> > > > > > > > > > > > +			r = -ENOSPC;
> > > > > > > > > > > > +			goto out;
> > > > > > > > > > > > +		}
> > > > > > > > > > > > +		mmio =&mmio_dev->mmio[mmio_dev->mmio_nr];
> > > > > > > > > > > > +		mmio_dev->mmio_nr++;
> > > > > > > > > > > > +	}
> > > > > > > > > > > > +	mmio->max_entries_nr = mmio_user->max_entries_nr;
> > > > > > > > > > > 
> > > > > > > > > > > Sanity check to avoid overflow.
> > > > > > > > > > > 
> > > > > > > > > > > > +	mmio->dev_id = mmio_user->dev_id;
> > > > > > > > > > > > +	mmio->flags = mmio_user->flags;
> > > > > > > > > > > 
> > > > > > > > > > > Check for unsupported bits (all of them at present?)
> > > > > > > > > > > 
> > > > > > > > > > > > +	if ((mmio_user->type&  KVM_MSIX_MMIO_TYPE_DEV_MASK)
> > > > > > > > > > > > == +			KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV)
> > > > > > > > > > > > +		mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV;
> > > > > > > > > > > > +
> > > > > > > > > > > > +	if ((mmio_user->type& 
> > > > > > > > > > > > KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > > > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_TABLE) {
> > > > > > > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE;
> > > > > > > > > > > > +		mmio->table_base_addr = mmio_user->base_addr;
> > > > > > > > > > > > +		mmio->table_base_va = mmio_user->base_va;
> > > > > > > > > > > 
> > > > > > > > > > > Check for va in kernel space.
> > > > > > > > > > > 
> > > > > > > > > > > > +	} else if ((mmio_user->type&
> > > > > > > > > > > > KVM_MSIX_MMIO_TYPE_BASE_MASK) ==
> > > > > > > > > > > > +			KVM_MSIX_MMIO_TYPE_BASE_PBA) {
> > > > > > > > > > > > +		mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_PBA;
> > > > > > > > > > > > +		mmio->pba_base_addr = mmio_user->base_addr;
> > > > > > > > > > > > +		mmio->pba_base_va = mmio_user->base_va;
> > > > > > > > > > > > +	}
> > > > > > > > > > > > +out:
> > > > > > > > > > > > +	mutex_unlock(&mmio_dev->lock);
> > > > > > > > > > > > +	return r;
> > > > > > > > > > > > +}
> > > > > > > > > > > > +
> > > > > > > > > > > > +
> > > > > > > > > > > 
> > > > > > > > > > > In all, looks reasonable.  I'd like to see
> > > > > > > > > > > documentation for it, and review from the pci people. 
> > > > > > > > > > > Alex, mst?
> > > > > > > > > 
> > > > > > > > > Some general comments:
> > > > > > > > > PBA isn't supported in this version, which is OK, but let's
> > > > > > > > > not add a capability until it is, and let's not try to
> > > > > > > > > guess what the interface will look like. I think keeping
> > > > > > > > > PBA in userspace will be hard because it needs to be
> > > > > > > > > modified from interrupt context. Removing the PBA stub
> > > > > > > > > will make the interface simpler.
> > > > > > > > 
> > > > > > > > The API only get the PBA address now which should be fine.
> > > > > > > > And we still have threaded irq and tasklet for accessing the
> > > > > > > > userspace for interrupt handler...
> > > > > > > 
> > > > > > > I don't think it's going to work: we are not
> > > > > > > in the context of the right process. Further
> > > > > > > I think we should keep the option of
> > > > > > > reading the PBA status from the device or host kernel open.
> > > > > > > And generally having an interface
> > > > > > > for functionality we don't implement is not a good idea:
> > > > > > > you don't know whether you really can support the interface you
> > > > > > > promised.
> > > > > > 
> > > > > > Well, I don't know if we want to read PBA from device directly.
> > > > > > To me it's not a good idea because the real device has nothing
> > > > > > to do with the one we show to the guest. At least direct
> > > > > > accessing the mask bits of real device would be very dangerous.
> > > > > > Avi?
> > > > > > 
> > > > > > --
> > > > > > regards
> > > > > > Yang, Sheng
> > > > > 
> > > > > I am not really suggesting this. What I say is PBA is unimplemented
> > > > > let us not commit to an interface yet.
> > > > > 
> > > > > > > > --
> > > > > > > > regards
> > > > > > > > Yang, Sheng
> > > > > > > > 
> > > > > > > > > > Would add the API document soon.
> > > > > > > > > > 
> > > > > > > > > > --
> > > > > > > > > > regards
> > > > > > > > > > Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30  7:32             ` Sheng Yang
  2010-12-30  7:47               ` Michael S. Tsirkin
@ 2010-12-30  9:28               ` Avi Kivity
  2010-12-30 10:03                 ` Michael S. Tsirkin
  1 sibling, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-12-30  9:28 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Michael S. Tsirkin, Marcelo Tosatti, kvm, Alex Williamson

On 12/30/2010 09:32 AM, Sheng Yang wrote:
> >
> >  I don't think it's going to work: we are not
> >  in the context of the right process. Further
> >  I think we should keep the option of
> >  reading the PBA status from the device or host kernel open.
> >  And generally having an interface
> >  for functionality we don't implement is not a good idea:
> >  you don't know whether you really can support the interface you promised.
>
> Well, I don't know if we want to read PBA from device directly. To me it's not a
> good idea because the real device has nothing to do with the one we show to the
> guest.

Right.  vPBA.bit = pPBA.bit | 
recorded_pending_bit_from_lazy_masked_interrupt.

>   At least direct accessing the mask bits of real device would be very
> dangerous. Avi?

Agree.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30  7:47               ` Michael S. Tsirkin
  2010-12-30  7:55                 ` Sheng Yang
@ 2010-12-30  9:30                 ` Avi Kivity
  2010-12-30 10:32                   ` Michael S. Tsirkin
  1 sibling, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-12-30  9:30 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On 12/30/2010 09:47 AM, Michael S. Tsirkin wrote:
> I am not really suggesting this. What I say is PBA is unimplemented
> let us not commit to an interface yet.

What happens to a guest that tries to use PBA?  It's a mandatory part of 
MSI-X, no?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30  9:28               ` Avi Kivity
@ 2010-12-30 10:03                 ` Michael S. Tsirkin
  0 siblings, 0 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2010-12-30 10:03 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On Thu, Dec 30, 2010 at 11:28:20AM +0200, Avi Kivity wrote:
> On 12/30/2010 09:32 AM, Sheng Yang wrote:
> >>
> >>  I don't think it's going to work: we are not
> >>  in the context of the right process. Further
> >>  I think we should keep the option of
> >>  reading the PBA status from the device or host kernel open.
> >>  And generally having an interface
> >>  for functionality we don't implement is not a good idea:
> >>  you don't know whether you really can support the interface you promised.
> >
> >Well, I don't know if we want to read PBA from device directly. To me it's not a
> >good idea because the real device has nothing to do with the one we show to the
> >guest.
> 
> Right.  vPBA.bit = pPBA.bit |
> recorded_pending_bit_from_lazy_masked_interrupt.

At some level, this is correct. However both are *not* in userspace
memory so an interface that assumes that pending bits in userspace are
correct at all times is not going to work.

There is also the spec requirement that the device
clears the pending bit if the interrupt was not served
but the condition that caused the event no longer applies,
so that a driver can work just by polling pending bits.
Personally I don't think this requirement works in real life -
it seems more like a vague idea on the part of the spec authors -
so hopefully guests do not use this.

> >  At least direct accessing the mask bits of real device would be very
> >dangerous. Avi?
> 
> Agree.
> 
> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30  9:30                 ` Avi Kivity
@ 2010-12-30 10:32                   ` Michael S. Tsirkin
  2010-12-30 10:37                     ` Avi Kivity
  2010-12-31  3:05                     ` Sheng Yang
  0 siblings, 2 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2010-12-30 10:32 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On Thu, Dec 30, 2010 at 11:30:12AM +0200, Avi Kivity wrote:
> On 12/30/2010 09:47 AM, Michael S. Tsirkin wrote:
> >I am not really suggesting this. What I say is PBA is unimplemented
> >let us not commit to an interface yet.
> 
> What happens to a guest that tries to use PBA?
> It's a mandatory part of MSI-X, no?

Yes. Unfortunately the pending bit is in fact a communication channel
used for function specific purposes when mask bit is set,
and 0 when unset. The spec even seems to *require* this use:

I refer to this:

	For MSI and MSI-X, while a vector is masked, the function is prohibited
	from sending the associated message, and the function must set the
	associated Pending bit whenever the function would otherwise send the
	message. When software unmasks a vector whose associated Pending bit is
	set, the function must schedule sending the associated message, and
	clear the Pending bit as soon as the message has been sent. Note that
	clearing the MSI-X Function Mask bit may result in many messages needing
	to be sent.


	If a masked vector has its Pending bit set, and the associated
	underlying interrupt events are somehow satisfied (usually by software
	though the exact manner is function-specific), the function must clear
	the Pending bit, to avoid sending a spurious interrupt message later
	when software unmasks the vector. However, if a subsequent interrupt
	event occurs while the vector is still masked, the function must again
	set the Pending bit.


	Software is permitted to mask one or more vectors indefinitely, and
	service their associated interrupt events strictly based on polling
	their Pending bits. A function must set and clear its Pending bits as
	necessary to support this “pure polling” mode of operation.

For assigned devices, supporting this would require
that the mask bits on the device are set if the mask bit in the
guest is set (otherwise the pending bits are disabled).

Existing code does not support PBA in assigned devices, so at least it's
not a regression there, and the virtio spec says nothing about this so
we should be fine.

> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30 10:32                   ` Michael S. Tsirkin
@ 2010-12-30 10:37                     ` Avi Kivity
  2010-12-30 11:07                       ` Michael S. Tsirkin
  2010-12-31  3:05                     ` Sheng Yang
  1 sibling, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-12-30 10:37 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On 12/30/2010 12:32 PM, Michael S. Tsirkin wrote:
> On Thu, Dec 30, 2010 at 11:30:12AM +0200, Avi Kivity wrote:
> >  On 12/30/2010 09:47 AM, Michael S. Tsirkin wrote:
> >  >I am not really suggesting this. What I say is PBA is unimplemented
> >  >let us not commit to an interface yet.
> >
> >  What happens to a guest that tries to use PBA?
> >  It's a mandatory part of MSI-X, no?
>
> Yes. Unfortunately the pending bit is in fact a communication channel
> used for function specific purposes when mask bit is set,
> and 0 when unset. The spec even seems to *require* this use:
>
> I refer to this:
>
> 	For MSI and MSI-X, while a vector is masked, the function is prohibited
> 	from sending the associated message, and the function must set the
> 	associated Pending bit whenever the function would otherwise send the
> 	message. When software unmasks a vector whose associated Pending bit is
> 	set, the function must schedule sending the associated message, and
> 	clear the Pending bit as soon as the message has been sent. Note that
> 	clearing the MSI-X Function Mask bit may result in many messages needing
> 	to be sent.
>
>
> 	If a masked vector has its Pending bit set, and the associated
> 	underlying interrupt events are somehow satisfied (usually by software
> 	though the exact manner is function-specific), the function must clear
> 	the Pending bit, to avoid sending a spurious interrupt message later
> 	when software unmasks the vector. However, if a subsequent interrupt
> 	event occurs while the vector is still masked, the function must again
> 	set the Pending bit.
>
>
> 	Software is permitted to mask one or more vectors indefinitely, and
> 	service their associated interrupt events strictly based on polling
> 	their Pending bits. A function must set and clear its Pending bits as
> 	necessary to support this “pure polling” mode of operation.
>
> For assigned devices, supporting this would require
> that the mask bits on the device are set if the mask bit in
> guest is set (otherwise pending bits are disabled).

Can't this be done by setting the real mask bit when the guest reads the 
virtual pending bit, then reading the real pending bit?

> Existing code does not support PBA in assigned devices, so at least it's
> not a regression there, and the virtio spec says nothing about this so
> we should be fine.

Why isn't it subject to the pci spec?

If an interrupt condition exists, the bit should be set.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30 10:37                     ` Avi Kivity
@ 2010-12-30 11:07                       ` Michael S. Tsirkin
  2010-12-30 11:27                         ` Avi Kivity
  0 siblings, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2010-12-30 11:07 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On Thu, Dec 30, 2010 at 12:37:18PM +0200, Avi Kivity wrote:
> On 12/30/2010 12:32 PM, Michael S. Tsirkin wrote:
> >On Thu, Dec 30, 2010 at 11:30:12AM +0200, Avi Kivity wrote:
> >>  On 12/30/2010 09:47 AM, Michael S. Tsirkin wrote:
> >>  >I am not really suggesting this. What I say is PBA is unimplemented
> >>  >let us not commit to an interface yet.
> >>
> >>  What happens to a guest that tries to use PBA?
> >>  It's a mandatory part of MSI-X, no?
> >
> >Yes. Unfortunately the pending bit is in fact a communication channel
> >used for function specific purposes when mask bit is set,
> >and 0 when unset. The spec even seems to *require* this use:
> >
> >I refer to this:
> >
> >	For MSI and MSI-X, while a vector is masked, the function is prohibited
> >	from sending the associated message, and the function must set the
> >	associated Pending bit whenever the function would otherwise send the
> >	message. When software unmasks a vector whose associated Pending bit is
> >	set, the function must schedule sending the associated message, and
> >	clear the Pending bit as soon as the message has been sent. Note that
> >	clearing the MSI-X Function Mask bit may result in many messages needing
> >	to be sent.
> >
> >
> >	If a masked vector has its Pending bit set, and the associated
> >	underlying interrupt events are somehow satisfied (usually by software
> >	though the exact manner is function-specific), the function must clear
> >	the Pending bit, to avoid sending a spurious interrupt message later
> >	when software unmasks the vector. However, if a subsequent interrupt
> >	event occurs while the vector is still masked, the function must again
> >	set the Pending bit.
> >
> >
> >	Software is permitted to mask one or more vectors indefinitely, and
> >	service their associated interrupt events strictly based on polling
> >	their Pending bits. A function must set and clear its Pending bits as
> >	necessary to support this “pure polling” mode of operation.
> >
> >For assigned devices, supporting this would require
> >that the mask bits on the device are set if the mask bit in
> >guest is set (otherwise pending bits are disabled).
> 
> Can't this be done by setting the real mask bit when the guest reads
> the virtual pending bit, then reading the real pending bit?

Function specific is function-specific, but most likely not,
by that time the pending bit in the device might be clear:
'clear the Pending bit as soon as the message has been sent'

> >Existing code does not support PBA in assigned devices, so at least it's
> >not a regression there, and the virtio spec says nothing about this so
> >we should be fine.
> 
> Why isn't it subject to the pci spec?
> 
> If an interrupt condition exits, the bit should be set.

I wish. But this is not what the spec says above. It says if vector is
unmasked, bit must be cleared.

> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30 11:07                       ` Michael S. Tsirkin
@ 2010-12-30 11:27                         ` Avi Kivity
  2010-12-30 12:17                           ` Michael S. Tsirkin
  0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2010-12-30 11:27 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On 12/30/2010 01:07 PM, Michael S. Tsirkin wrote:
> On Thu, Dec 30, 2010 at 12:37:18PM +0200, Avi Kivity wrote:
> >  On 12/30/2010 12:32 PM, Michael S. Tsirkin wrote:
> >  >On Thu, Dec 30, 2010 at 11:30:12AM +0200, Avi Kivity wrote:
> >  >>   On 12/30/2010 09:47 AM, Michael S. Tsirkin wrote:
> >  >>   >I am not really suggesting this. What I say is PBA is unimplemented
> >  >>   >let us not commit to an interface yet.
> >  >>
> >  >>   What happens to a guest that tries to use PBA?
> >  >>   It's a mandatory part of MSI-X, no?
> >  >
> >  >Yes. Unfortunately the pending bit is in fact a communication channel
> >  >used for function specific purposes when mask bit is set,
> >  >and 0 when unset. The spec even seems to *require* this use:
> >  >
> >  >I refer to this:
> >  >
> >  >	For MSI and MSI-X, while a vector is masked, the function is prohibited
> >  >	from sending the associated message, and the function must set the
> >  >	associated Pending bit whenever the function would otherwise send the
> >  >	message. When software unmasks a vector whose associated Pending bit is
> >  >	set, the function must schedule sending the associated message, and
> >  >	clear the Pending bit as soon as the message has been sent. Note that
> >  >	clearing the MSI-X Function Mask bit may result in many messages needing
> >  >	to be sent.
> >  >
> >  >
> >  >	If a masked vector has its Pending bit set, and the associated
> >  >	underlying interrupt events are somehow satisfied (usually by software
> >  >	though the exact manner is function-specific), the function must clear
> >  >	the Pending bit, to avoid sending a spurious interrupt message later
> >  >	when software unmasks the vector. However, if a subsequent interrupt
> >  >	event occurs while the vector is still masked, the function must again
> >  >	set the Pending bit.
> >  >
> >  >
> >  >	Software is permitted to mask one or more vectors indefinitely, and
> >  >	service their associated interrupt events strictly based on polling
> >  >	their Pending bits. A function must set and clear its Pending bits as
> >  >	necessary to support this “pure polling” mode of operation.
> >  >
> >  >For assigned devices, supporting this would require
> >  >that the mask bits on the device are set if the mask bit in
> >  >guest is set (otherwise pending bits are disabled).
> >
> >  Can't this be done by setting the real mask bit when the guest reads
> >  the virtual pending bit, then reading the real pending bit?
>
> Function specific is function-specific, but most likely not,
> by that time the pending bit in the device might be clear:
> 'clear the Pending bit as soon as the message has been sent'

But when we set the mask bit, it must change the pending bit back to the 
function-specific condition?


> >  >Existing code does not support PBA in assigned devices, so at least it's
> >  >not a regression there, and the virtio spec says nothing about this so
> >  >we should be fine.
> >
> >  Why isn't it subject to the pci spec?
> >
> >  If an interrupt condition exits, the bit should be set.
>
> I wish. But this is not what the spec says above. It says if vector is
> unmasked, bit must be cleared.

If an interrupt condition exists, and the vector is masked, the pending bit 
is set.  Otherwise the pending bit is clear.  Better?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30 11:27                         ` Avi Kivity
@ 2010-12-30 12:17                           ` Michael S. Tsirkin
  0 siblings, 0 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2010-12-30 12:17 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On Thu, Dec 30, 2010 at 01:27:15PM +0200, Avi Kivity wrote:
> On 12/30/2010 01:07 PM, Michael S. Tsirkin wrote:
> >On Thu, Dec 30, 2010 at 12:37:18PM +0200, Avi Kivity wrote:
> >>  On 12/30/2010 12:32 PM, Michael S. Tsirkin wrote:
> >>  >On Thu, Dec 30, 2010 at 11:30:12AM +0200, Avi Kivity wrote:
> >>  >>   On 12/30/2010 09:47 AM, Michael S. Tsirkin wrote:
> >>  >>   >I am not really suggesting this. What I say is PBA is unimplemented
> >>  >>   >let us not commit to an interface yet.
> >>  >>
> >>  >>   What happens to a guest that tries to use PBA?
> >>  >>   It's a mandatory part of MSI-X, no?
> >>  >
> >>  >Yes. Unfortunately the pending bit is in fact a communication channel
> >>  >used for function specific purposes when mask bit is set,
> >>  >and 0 when unset. The spec even seems to *require* this use:
> >>  >
> >>  >I refer to this:
> >>  >
> >>  >	For MSI and MSI-X, while a vector is masked, the function is prohibited
> >>  >	from sending the associated message, and the function must set the
> >>  >	associated Pending bit whenever the function would otherwise send the
> >>  >	message. When software unmasks a vector whose associated Pending bit is
> >>  >	set, the function must schedule sending the associated message, and
> >>  >	clear the Pending bit as soon as the message has been sent. Note that
> >>  >	clearing the MSI-X Function Mask bit may result in many messages needing
> >>  >	to be sent.
> >>  >
> >>  >
> >>  >	If a masked vector has its Pending bit set, and the associated
> >>  >	underlying interrupt events are somehow satisfied (usually by software
> >>  >	though the exact manner is function-specific), the function must clear
> >>  >	the Pending bit, to avoid sending a spurious interrupt message later
> >>  >	when software unmasks the vector. However, if a subsequent interrupt
> >>  >	event occurs while the vector is still masked, the function must again
> >>  >	set the Pending bit.
> >>  >
> >>  >
> >>  >	Software is permitted to mask one or more vectors indefinitely, and
> >>  >	service their associated interrupt events strictly based on polling
> >>  >	their Pending bits. A function must set and clear its Pending bits as
> >>  >	necessary to support this “pure polling” mode of operation.
> >>  >
> >>  >For assigned devices, supporting this would require
> >>  >that the mask bits on the device are set if the mask bit in
> >>  >guest is set (otherwise pending bits are disabled).
> >>
> >>  Can't this be done by setting the real mask bit when the guest reads
> >>  the virtual pending bit, then reading the real pending bit?
> >
> >Function specific is function-specific, but most likely not,
> >by that time the pending bit in the device might be clear:
> >'clear the Pending bit as soon as the message has been sent'
> 
> But when we set the mask bit, it must change the pending bit back to
> the function-specific condition?

All it says is 'whenever the function would otherwise send a message'.
So this is function-specific, generally functions only send a message
once per event, they don't resend it assuming that it was queued
and will eventually be handled.


> >>  >Existing code does not support PBA in assigned devices, so at least it's
> >>  >not a regression there, and the virtio spec says nothing about this so
> >>  >we should be fine.
> >>
> >>  Why isn't it subject to the pci spec?
> >>
> >>  If an interrupt condition exits, the bit should be set.
> >
> >I wish. But this is not what the spec says above. It says if vector is
> >unmasked, bit must be cleared.
> 
> If interrupt condition exists, and the vector is masked, the pending
> bit is set.  Otherwise the pending bit is clear.  Better?

'whenever the function would otherwise send a message'
does not seem to match this description:
functions do not generally keep sending messages as long as
condition is satisfied (if you think about this, the optimization
of not masking immediately in hardware relies on this).

> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-30 10:32                   ` Michael S. Tsirkin
  2010-12-30 10:37                     ` Avi Kivity
@ 2010-12-31  3:05                     ` Sheng Yang
  2011-01-02  9:26                       ` Michael S. Tsirkin
  2011-01-02 10:26                       ` Avi Kivity
  1 sibling, 2 replies; 32+ messages in thread
From: Sheng Yang @ 2010-12-31  3:05 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Michael S. Tsirkin, Marcelo Tosatti, kvm, Alex Williamson

On Thursday 30 December 2010 18:32:56 Michael S. Tsirkin wrote:
> On Thu, Dec 30, 2010 at 11:30:12AM +0200, Avi Kivity wrote:
> > On 12/30/2010 09:47 AM, Michael S. Tsirkin wrote:
> > >I am not really suggesting this. What I say is PBA is unimplemented
> > >let us not commit to an interface yet.
> > 
> > What happens to a guest that tries to use PBA?
> > It's a mandatory part of MSI-X, no?
> 
> Yes. Unfortunately the pending bit is in fact a communication channel
> used for function specific purposes when mask bit is set,
> and 0 when unset. The spec even seems to *require* this use:
> 
> I refer to this:
> 
> 	For MSI and MSI-X, while a vector is masked, the function is prohibited
> 	from sending the associated message, and the function must set the
> 	associated Pending bit whenever the function would otherwise send the
> 	message. When software unmasks a vector whose associated Pending bit is
> 	set, the function must schedule sending the associated message, and
> 	clear the Pending bit as soon as the message has been sent. Note that
> 	clearing the MSI-X Function Mask bit may result in many messages needing
> 	to be sent.
> 
> 
> 	If a masked vector has its Pending bit set, and the associated
> 	underlying interrupt events are somehow satisfied (usually by software
> 	though the exact manner is function-specific), the function must clear
> 	the Pending bit, to avoid sending a spurious interrupt message later
> 	when software unmasks the vector. However, if a subsequent interrupt
> 	event occurs while the vector is still masked, the function must again
> 	set the Pending bit.
> 
> 
> 	Software is permitted to mask one or more vectors indefinitely, and
> 	service their associated interrupt events strictly based on polling
> 	their Pending bits. A function must set and clear its Pending bits as
> 	necessary to support this “pure polling” mode of operation.
> 
> For assigned devices, supporting this would require
> that the mask bits on the device are set if the mask bit in
> guest is set (otherwise pending bits are disabled).

For an assigned device, I think the result we should return is the IRQ_PENDING bit of 
the related IRQ. It seems to fit the meaning of the pending bit definition here perfectly - 
set when masked, and if we didn't clear it, one interrupt would be retriggered 
after unmask. But it's an internal flag, and using it would lead to some core 
changes (more needs to be considered if we want to operate on the flag bit outside 
the core kernel part). 
> 
> Existing code does not support PBA in assigned devices, so at least it's
> not a regression there, and the virtio spec says nothing about this so
> we should be fine.

I agree. At least it's not a regression. And in fact we haven't seen any device 
driver use this. I've checked Linux kernel code, found no one used PCI_MSIX_PBA or 
msix_pba_offset_reg().

I guess it's fine to get MSI-X mask part in first, then deal with PBA part if 
necessary - though we haven't seen any driver use it so far. It won't be worse 
with this patch anyway...

--
regards
Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-31  3:05                     ` Sheng Yang
@ 2011-01-02  9:26                       ` Michael S. Tsirkin
  2011-01-02 10:26                       ` Avi Kivity
  1 sibling, 0 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2011-01-02  9:26 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, kvm, Alex Williamson

On Fri, Dec 31, 2010 at 11:05:28AM +0800, Sheng Yang wrote:
> On Thursday 30 December 2010 18:32:56 Michael S. Tsirkin wrote:
> > On Thu, Dec 30, 2010 at 11:30:12AM +0200, Avi Kivity wrote:
> > > On 12/30/2010 09:47 AM, Michael S. Tsirkin wrote:
> > > >I am not really suggesting this. What I say is PBA is unimplemented
> > > >let us not commit to an interface yet.
> > > 
> > > What happens to a guest that tries to use PBA?
> > > It's a mandatory part of MSI-X, no?
> > 
> > Yes. Unfortunately the pending bit is in fact a communication channel
> > used for function specific purposes when mask bit is set,
> > and 0 when unset. The spec even seems to *require* this use:
> > 
> > I refer to this:
> > 
> > 	For MSI and MSI-X, while a vector is masked, the function is prohibited
> > 	from sending the associated message, and the function must set the
> > 	associated Pending bit whenever the function would otherwise send the
> > 	message. When software unmasks a vector whose associated Pending bit is
> > 	set, the function must schedule sending the associated message, and
> > 	clear the Pending bit as soon as the message has been sent. Note that
> > 	clearing the MSI-X Function Mask bit may result in many messages needing
> > 	to be sent.
> > 
> > 
> > 	If a masked vector has its Pending bit set, and the associated
> > 	underlying interrupt events are somehow satisfied (usually by software
> > 	though the exact manner is function-specific), the function must clear
> > 	the Pending bit, to avoid sending a spurious interrupt message later
> > 	when software unmasks the vector. However, if a subsequent interrupt
> > 	event occurs while the vector is still masked, the function must again
> > 	set the Pending bit.
> > 
> > 
> > 	Software is permitted to mask one or more vectors indefinitely, and
> > 	service their associated interrupt events strictly based on polling
> > 	their Pending bits. A function must set and clear its Pending bits as
> > 	necessary to support this “pure polling” mode of operation.
> > 
> > For assigned devices, supporting this would require
> > that the mask bits on the device are set if the mask bit in
> > guest is set (otherwise pending bits are disabled).
> 
> For assigned device, I think the result we should return is IRQ_PENDING bit of 
> related IRQ. Seems it perfectly fits the meaning of pending bit definition here - 
> set when masked, and if we didn't clean it, one interrupt would be retriggered 
> after unmask.

Well, it doesn't seem to fit this part of the definition 
> > 	If a masked vector has its Pending bit set, and the associated
> > 	underlying interrupt events are somehow satisfied (usually by software
> > 	though the exact manner is function-specific), the function must clear
> > 	the Pending bit, to avoid sending a spurious interrupt message later
> > 	when software unmasks the vector. However, if a subsequent interrupt
> > 	event occurs while the vector is still masked, the function must again
> > 	set the Pending bit.
> > 
> > 	Software is permitted to mask one or more vectors indefinitely, and
> > 	service their associated interrupt events strictly based on polling
> > 	their Pending bits. A function must set and clear its Pending bits as
> > 	necessary to support this “pure polling” mode of operation.
looking at IRQ_PENDING will make the pending bit *never* clear while
the vector is masked.


> But it's a internal flag, and use it would lead to some core 
> change(more need to be considered if we want to operate the flag bit outside core 
> kernel part). 
> > 
> > Existing code does not support PBA in assigned devices, so at least it's
> > not a regression there, and the virtio spec says nothing about this so
> > we should be fine.
> 
> I agree. At least it's not a regression. And in fact we haven't seen any device 
> driver use this. I've checked Linux kernel code, found no one used PCI_MSIX_PBA or 
> msix_pba_offset_reg().
> 
> I guess it's fine to get MSI-X mask part in first, then deal with PBA part if 
> necessary - though we haven't seen any driver use it so far. It won't be worse 
> with this patch anyway...
> 
> --
> regards
> Yang, Sheng

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2010-12-31  3:05                     ` Sheng Yang
  2011-01-02  9:26                       ` Michael S. Tsirkin
@ 2011-01-02 10:26                       ` Avi Kivity
  2011-01-02 10:39                         ` Michael S. Tsirkin
  1 sibling, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2011-01-02 10:26 UTC (permalink / raw)
  To: Sheng Yang; +Cc: Michael S. Tsirkin, Marcelo Tosatti, kvm, Alex Williamson

On 12/31/2010 05:05 AM, Sheng Yang wrote:
> >
> >  Existing code does not support PBA in assigned devices, so at least it's
> >  not a regression there, and the virtio spec says nothing about this so
> >  we should be fine.
>
> I agree. At least it's not a regression. And in fact we haven't seen any device
> driver use this. I've checked Linux kernel code, found no one used PCI_MSIX_PBA or
> msix_pba_offset_reg().
>
> I guess it's fine to get MSI-X mask part in first, then deal with PBA part if
> necessary - though we haven't seen any driver use it so far. It won't be worse
> with this patch anyway...

In a way it is worse because before, the fix would belong in user space, 
which is easier to test and distribute.  Now we have to fix it in the 
kernel.

However I recognize that drivers which rely on the pending bit are 
rare/nonexistent (likely only in preboot environments where interrupts are 
hard), so even if we do code it, it will likely be incorrect (certainly 
without a test).

So I'll accept the patch without PBA.  Michael, what about supporting 
virtio?  Can we base something on this patch?

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2011-01-02 10:26                       ` Avi Kivity
@ 2011-01-02 10:39                         ` Michael S. Tsirkin
  2011-01-02 10:58                           ` Avi Kivity
  0 siblings, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2011-01-02 10:39 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On Sun, Jan 02, 2011 at 12:26:11PM +0200, Avi Kivity wrote:
> On 12/31/2010 05:05 AM, Sheng Yang wrote:
> >>
> >>  Existing code does not support PBA in assigned devices, so at least it's
> >>  not a regression there, and the virtio spec says nothing about this so
> >>  we should be fine.
> >
> >I agree. At least it's not a regression. And in fact we haven't seen any device
> >driver use this. I've checked Linux kernel code, found no one used PCI_MSIX_PBA or
> >msix_pba_offset_reg().
> >
> >I guess it's fine to get MSI-X mask part in first, then deal with PBA part if
> >necessary - though we haven't seen any driver use it so far. It won't be worse
> >with this patch anyway...
> 
> In a way it is worse because before, the fix would belong in user
> space, which is easier to test and distribute.  Now we have to fix
> it in the kernel.
> 
> However I recognize that drivers which rely on the pending bit are
> rare/nonexistent (likely on in preboot environments where interrupts
> are hard), so even if we do code it, it will likely be incorrect
> (certainly without a test).
> 
> So I'll accept the patch without PBA.  Michael, what about
> supporting virtio?  Can we base something on this patch?

I don't see how userspace can send interrupts with this
interface unfortunately. We also need irqfd support ...

> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2011-01-02 10:39                         ` Michael S. Tsirkin
@ 2011-01-02 10:58                           ` Avi Kivity
  2011-01-02 11:51                             ` Michael S. Tsirkin
  0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2011-01-02 10:58 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On 01/02/2011 12:39 PM, Michael S. Tsirkin wrote:
> >  >
> >  >I agree. At least it's not a regression. And in fact we haven't seen any device
> >  >driver use this. I've checked Linux kernel code, found no one used PCI_MSIX_PBA or
> >  >msix_pba_offset_reg().
> >  >
> >  >I guess it's fine to get MSI-X mask part in first, then deal with PBA part if
> >  >necessary - though we haven't seen any driver use it so far. It won't be worse
> >  >with this patch anyway...
> >
> >  In a way it is worse because before, the fix would belong in user
> >  space, which is easier to test and distribute.  Now we have to fix
> >  it in the kernel.
> >
> >  However I recognize that drivers which rely on the pending bit are
> >  rare/nonexistent (likely on in preboot environments where interrupts
> >  are hard), so even if we do code it, it will likely be incorrect
> >  (certainly without a test).
> >
> >  So I'll accept the patch without PBA.  Michael, what about
> >  supporting virtio?  Can we base something on this patch?
>
> I don't see how userspace can send interrupts with this
> interface unfortunately. We also need irqfd support ...

Sure we'll need additions to that interface.

What about vhost-net and vfio?  I thought that they could emulate the 
mask bits:

- KVM_MMIOFD(vmfd, mmio_range, fd1, fd2) associates an mmio range with an fd
- writel(mmio_range) or readl(mmio_range) from the guest causes a 
command to be written to fd1
- for readl(), read from fd2 to see the result (works nicely for "pci 
read flushes posted writes")

this allows interesting stuff to be implemented in separate processes, 
threads, or kernel modules.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2011-01-02 10:58                           ` Avi Kivity
@ 2011-01-02 11:51                             ` Michael S. Tsirkin
  2011-01-02 13:34                               ` Avi Kivity
  0 siblings, 1 reply; 32+ messages in thread
From: Michael S. Tsirkin @ 2011-01-02 11:51 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On Sun, Jan 02, 2011 at 12:58:50PM +0200, Avi Kivity wrote:
> On 01/02/2011 12:39 PM, Michael S. Tsirkin wrote:
> >>  >
> >>  >I agree. At least it's not a regression. And in fact we haven't seen any device
> >>  >driver use this. I've checked Linux kernel code, found no one used PCI_MSIX_PBA or
> >>  >msix_pba_offset_reg().
> >>  >
> >>  >I guess it's fine to get MSI-X mask part in first, then deal with PBA part if
> >>  >necessary - though we haven't seen any driver use it so far. It won't be worse
> >>  >with this patch anyway...
> >>
> >>  In a way it is worse because before, the fix would belong in user
> >>  space, which is easier to test and distribute.  Now we have to fix
> >>  it in the kernel.
> >>
> >>  However I recognize that drivers which rely on the pending bit are
> >>  rare/nonexistent (likely on in preboot environments where interrupts
> >>  are hard), so even if we do code it, it will likely be incorrect
> >>  (certainly without a test).
> >>
> >>  So I'll accept the patch without PBA.  Michael, what about
> >>  supporting virtio?  Can we base something on this patch?
> >
> >I don't see how userspace can send interrupts with this
> >interface unfortunately. We also need irqfd support ...
> 
> Sure we'll need additions to that interface.

What I suggested is 
1. an ioctl to map phy address + size to table id
2. a new gsi type with a table id + entry number.

If we have that, assigned devices, virtio and vhost-net can work
mostly as is, with just the mask bits accelerated.

> What about vhost-net and vfio?  I thought that they could emulate
> the mask bits:
> 
> - KVM_MMIOFD(vmfd, mmio_range, fd1, fd2) associates an mmio range with an fd
> - writel(mmio_range) or readl(mmio_range) from the guest causes a
> command to be written to fd1
> - for readl(), read from fd2 to see the result (works nicely for
> "pci read flushes posted writes")
> 
> this allows interesting stuff to be implemented in separate
> processes, threads, or kernel modules.

This could work. Some thought needs to be given to how we make sure that
an appropriate type of file is passed in. Maybe using a netlink-based 
connector for this is a good idea?

OTOH if we have MSIX mask bit emulation in kvm anyway, using it makes
sense ...

> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2011-01-02 11:51                             ` Michael S. Tsirkin
@ 2011-01-02 13:34                               ` Avi Kivity
  2011-01-02 13:57                                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 32+ messages in thread
From: Avi Kivity @ 2011-01-02 13:34 UTC (permalink / raw)
  To: Michael S. Tsirkin; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On 01/02/2011 01:51 PM, Michael S. Tsirkin wrote:
> >  >
> >  >I don't see how userspace can send interrupts with this
> >  >interface unfortunately. We also need irqfd support ...
> >
> >  Sure we'll need additions to that interface.
>
> What I suggested is
> 1. an ioctl to map phy address + size to table id
> 2. a new gsi type with a table id + entry number.
>
> If we have that, assigned devices, virtio and vhost-net can work
> mostly as is, with just the mask bits accelerated.
>

Ok.  Please adopt this patch and send a series that does this.  I'd like 
to see the whole thing working.

> >  What about vhost-net and vfio?  I thought that they could emulate
> >  the mask bits:
> >
> >  - KVM_MMIOFD(vmfd, mmio_range, fd1, fd2) associates an mmio range with an fd
> >  - writel(mmio_range) or readl(mmio_range) from the guest causes a
> >  command to be written to fd1
> >  - for readl(), read from fd2 to see the result (works nicely for
> >  "pci read flushes posted writes")
> >
> >  this allows interesting stuff to be implemented in separate
> >  processes, threads, or kernel modules.
>
> This could work. Some thought needs to be given to how we make sure that
> an appropriate type of file is passed in. Maybe using a netlink
> based connector for this a good idea?

Why do we care what type of file is passed?  We just write there, and 
whatever is on the other side needs to handle it.

> OTOH if we have MSIX mask bit emulation in kvm anyway, using it makes
> sense ...

Yeah, except I'm not sure how the current proposal can interface with 
vfio.  So we'll have two interfaces, until vfio takes over completely.

-- 
error compiling committee.c: too many arguments to function


^ permalink raw reply	[flat|nested] 32+ messages in thread

* Re: [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel
  2011-01-02 13:34                               ` Avi Kivity
@ 2011-01-02 13:57                                 ` Michael S. Tsirkin
  0 siblings, 0 replies; 32+ messages in thread
From: Michael S. Tsirkin @ 2011-01-02 13:57 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Sheng Yang, Marcelo Tosatti, kvm, Alex Williamson

On Sun, Jan 02, 2011 at 03:34:21PM +0200, Avi Kivity wrote:
> On 01/02/2011 01:51 PM, Michael S. Tsirkin wrote:
> >>  >
> >>  >I don't see how userspace can send interrupts with this
> >>  >interface unfortunately. We also need irqfd support ...
> >>
> >>  Sure we'll need additions to that interface.
> >
> >What I suggested is
> >1. an ioctl to map phy address + size to table id
> >2. a new gsi type with a table id + entry number.
> >
> >If we have that, assigned devices, virtio and vhost-net can work
> >mostly as is, with just the mask bits accelerated.
> >
> 
> Ok.  Please adopt this patch and send a series that does this.  I'd
> like to see the whole thing working.
> 
> >>  What about vhost-net and vfio?  I thought that they could emulate
> >>  the mask bits:
> >>
> >>  - KVM_MMIOFD(vmfd, mmio_range, fd1, fd2) associates an mmio range with an fd
> >>  - writel(mmio_range) or readl(mmio_range) from the guest causes a
> >>  command to be written to fd1
> >>  - for readl(), read from fd2 to see the result (works nicely for
> >>  "pci read flushes posted writes")
> >>
> >>  this allows interesting stuff to be implemented in separate
> >>  processes, threads, or kernel modules.
> >
> >This could work. Some thought needs to be given to how we make sure that
> >an appropriate type of file is passed in. Maybe using a netlink
> >based connector for this is a good idea?
> 
> Why do we care what type of file is passed?  We just write there,
> and whatever is on the other side needs to handle it.
> 
> >OTOH if we have MSIX mask bit emulation in kvm anyway, using it makes
> >sense ...
> 
> Yeah, except I'm not sure how the current proposal can interface
> with vfio.

Me too.

>  So we'll have two interfaces, until vfio takes over
> completely.
> -- 
> error compiling committee.c: too many arguments to function

^ permalink raw reply	[flat|nested] 32+ messages in thread

end of thread, other threads:[~2011-01-02 13:58 UTC | newest]

Thread overview: 32+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-12-22  8:44 [PATCH 0/2 v6] MSI-X mask bit support for KVM Sheng Yang
2010-12-22  8:44 ` [PATCH 1/2] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
2010-12-22  8:44 ` [PATCH 2/2][RFC] KVM: Emulate MSI-X table and PBA in kernel Sheng Yang
2010-12-28 12:26   ` Avi Kivity
2010-12-29  7:18     ` Sheng Yang
2010-12-29  8:31       ` Michael S. Tsirkin
2010-12-29  8:55         ` Sheng Yang
2010-12-29  9:28           ` Michael S. Tsirkin
2010-12-30  7:32             ` Sheng Yang
2010-12-30  7:47               ` Michael S. Tsirkin
2010-12-30  7:55                 ` Sheng Yang
2010-12-30  8:15                   ` Michael S. Tsirkin
2010-12-30  8:24                     ` Sheng Yang
2010-12-30  8:52                       ` Michael S. Tsirkin
2010-12-30  9:13                         ` Sheng Yang
2010-12-30  9:30                 ` Avi Kivity
2010-12-30 10:32                   ` Michael S. Tsirkin
2010-12-30 10:37                     ` Avi Kivity
2010-12-30 11:07                       ` Michael S. Tsirkin
2010-12-30 11:27                         ` Avi Kivity
2010-12-30 12:17                           ` Michael S. Tsirkin
2010-12-31  3:05                     ` Sheng Yang
2011-01-02  9:26                       ` Michael S. Tsirkin
2011-01-02 10:26                       ` Avi Kivity
2011-01-02 10:39                         ` Michael S. Tsirkin
2011-01-02 10:58                           ` Avi Kivity
2011-01-02 11:51                             ` Michael S. Tsirkin
2011-01-02 13:34                               ` Avi Kivity
2011-01-02 13:57                                 ` Michael S. Tsirkin
2010-12-30  9:28               ` Avi Kivity
2010-12-30 10:03                 ` Michael S. Tsirkin
2010-12-28  4:05 ` [PATCH 0/2 v6] MSI-X mask bit support for KVM Sheng Yang

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox