* [PATCH 0/3 v8] MSI-X MMIO support for KVM
@ 2011-01-30 5:11 Sheng Yang
2011-01-30 5:11 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Sheng Yang @ 2011-01-30 5:11 UTC (permalink / raw)
To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang
Changes from v7:
Updated according to Marcelo's and Avi's comments.
BTW: I will be on vacation for Chinese New Year soon, and will be back in mid-February.
Sheng Yang (3):
KVM: Move struct kvm_io_device to kvm_host.h
KVM: Emulate MSI-X table in kernel
KVM: Add documents for MSI-X MMIO API
Documentation/kvm/api.txt | 47 ++++++++
arch/x86/kvm/Makefile | 2 +-
arch/x86/kvm/x86.c | 8 +-
include/linux/kvm.h | 21 ++++
include/linux/kvm_host.h | 48 ++++++++
virt/kvm/assigned-dev.c | 44 +++++++
virt/kvm/iodev.h | 25 +----
virt/kvm/kvm_main.c | 38 ++++++-
virt/kvm/msix_mmio.c | 286 +++++++++++++++++++++++++++++++++++++++++++++
virt/kvm/msix_mmio.h | 25 ++++
10 files changed, 513 insertions(+), 31 deletions(-)
create mode 100644 virt/kvm/msix_mmio.c
create mode 100644 virt/kvm/msix_mmio.h
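For reference, a minimal userspace sketch of how the new registration ioctl from patch 2 could be used for an assigned device's MSI-X table. The struct fields, type bits and ioctl numbers are as defined in this series; the fd handling, function and variable names below are illustrative assumptions, not code from the series:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Sketch: ask KVM to emulate a 16-entry MSI-X table in kernel for
 * assigned device dev_id.  table_gpa is the guest-physical table
 * base, table_hva the userspace mapping backing the table. */
static void register_msix_table(int vm_fd, uint32_t dev_id,
                                uint64_t table_gpa, void *table_hva)
{
	struct kvm_msix_mmio_user mmio = {
		.dev_id = dev_id,
		.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
			KVM_MSIX_MMIO_TYPE_BASE_TABLE,
		.max_entries_nr = 16,
		.base_addr = table_gpa,
		.base_va = (uint64_t)(uintptr_t)table_hva,
		.flags = 0,	/* all flags are reserved for now */
	};

	if (ioctl(vm_fd, KVM_REGISTER_MSIX_MMIO, &mmio) < 0) {
		perror("KVM_REGISTER_MSIX_MMIO");
		exit(1);
	}
}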
^ permalink raw reply [flat|nested] 20+ messages in thread* [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h 2011-01-30 5:11 [PATCH 0/3 v8] MSI-X MMIO support for KVM Sheng Yang @ 2011-01-30 5:11 ` Sheng Yang 2011-01-30 5:11 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang 2011-01-30 5:11 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang 2 siblings, 0 replies; 20+ messages in thread From: Sheng Yang @ 2011-01-30 5:11 UTC (permalink / raw) To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang Then it can be used by other structs in kvm_host.h. Signed-off-by: Sheng Yang <sheng@linux.intel.com> --- include/linux/kvm_host.h | 23 +++++++++++++++++++++++ virt/kvm/iodev.h | 25 +------------------------ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b5021db..7d313e0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -98,6 +98,29 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif +struct kvm_io_device; + +/** + * kvm_io_device_ops are called under kvm slots_lock. + * read and write handlers return 0 if the transaction has been handled, + * or non-zero to have it passed to the next device. + **/ +struct kvm_io_device_ops { + int (*read)(struct kvm_io_device *this, + gpa_t addr, + int len, + void *val); + int (*write)(struct kvm_io_device *this, + gpa_t addr, + int len, + const void *val); + void (*destructor)(struct kvm_io_device *this); +}; + +struct kvm_io_device { + const struct kvm_io_device_ops *ops; +}; + struct kvm_vcpu { struct kvm *kvm; #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h index 12fd3ca..d1f5651 100644 --- a/virt/kvm/iodev.h +++ b/virt/kvm/iodev.h @@ -17,32 +17,9 @@ #define __KVM_IODEV_H__ #include <linux/kvm_types.h> +#include <linux/kvm_host.h> #include <asm/errno.h> -struct kvm_io_device; - -/** - * kvm_io_device_ops are called under kvm slots_lock. - * read and write handlers return 0 if the transaction has been handled, - * or non-zero to have it passed to the next device. - **/ -struct kvm_io_device_ops { - int (*read)(struct kvm_io_device *this, - gpa_t addr, - int len, - void *val); - int (*write)(struct kvm_io_device *this, - gpa_t addr, - int len, - const void *val); - void (*destructor)(struct kvm_io_device *this); -}; - - -struct kvm_io_device { - const struct kvm_io_device_ops *ops; -}; - static inline void kvm_iodevice_init(struct kvm_io_device *dev, const struct kvm_io_device_ops *ops) { -- 1.7.0.1 ^ permalink raw reply related [flat|nested] 20+ messages in thread
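As the moved comment says, handlers return 0 when they claim a transaction and non-zero to have it passed to the next device on the bus. For illustration, a minimal in-kernel device using these definitions might look like the sketch below (MY_DEV_ADDR and all names are hypothetical):

/* Illustrative sketch: claim one 4-byte register, pass on the rest. */
#define MY_DEV_ADDR 0xd0000000ULL

static int my_dev_read(struct kvm_io_device *this, gpa_t addr,
		       int len, void *val)
{
	if (addr != MY_DEV_ADDR || len != 4)
		return -EOPNOTSUPP;	/* not ours, let the next device try */
	*(u32 *)val = 0;		/* register contents */
	return 0;			/* handled */
}

static const struct kvm_io_device_ops my_dev_ops = {
	.read	= my_dev_read,
};

/* set up with kvm_iodevice_init(&dev, &my_dev_ops) and registered
 * via kvm_io_bus_register_dev() under slots_lock */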
* [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-01-30 5:11 [PATCH 0/3 v8] MSI-X MMIO support for KVM Sheng Yang 2011-01-30 5:11 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang @ 2011-01-30 5:11 ` Sheng Yang 2011-02-03 1:05 ` Marcelo Tosatti 2011-02-23 0:19 ` Alex Williamson 2011-01-30 5:11 ` [PATCH 3/3] KVM: Add documents for MSI-X MMIO API Sheng Yang 2 siblings, 2 replies; 20+ messages in thread From: Sheng Yang @ 2011-01-30 5:11 UTC (permalink / raw) To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang Then we can support mask bit operation of assigned devices now. Signed-off-by: Sheng Yang <sheng@linux.intel.com> --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/x86.c | 8 +- include/linux/kvm.h | 21 ++++ include/linux/kvm_host.h | 25 ++++ virt/kvm/assigned-dev.c | 44 +++++++ virt/kvm/kvm_main.c | 38 ++++++- virt/kvm/msix_mmio.c | 286 ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h | 25 ++++ 8 files changed, 442 insertions(+), 7 deletions(-) create mode 100644 virt/kvm/msix_mmio.c create mode 100644 virt/kvm/msix_mmio.h diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f15501f..3a0d851 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ coalesced_mmio.o irq_comm.o eventfd.o \ - assigned-dev.o) + assigned-dev.o msix_mmio.o) kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fa708c9..89bf12c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_MSIX_MMIO: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int r; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, mmio: trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + r = vcpu_mmio_write(vcpu, gpa, bytes, val); /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + if (!r) return X86EMUL_CONTINUE; vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->exit_reason = (r == -ENOTSYNC) ? 
+ KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; vcpu->run->mmio.len = vcpu->mmio_size = bytes; vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index ea2dc1a..ad9df4b 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -161,6 +161,7 @@ struct kvm_pit_config { #define KVM_EXIT_NMI 16 #define KVM_EXIT_INTERNAL_ERROR 17 #define KVM_EXIT_OSI 18 +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 /* For KVM_EXIT_INTERNAL_ERROR */ #define KVM_INTERNAL_ERROR_EMULATION 1 @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_PPC_GET_PVINFO 57 #define KVM_CAP_PPC_IRQ_LEVEL 58 #define KVM_CAP_ASYNC_PF 59 +#define KVM_CAP_MSIX_MMIO 60 #ifdef KVM_CAP_IRQ_ROUTING @@ -672,6 +674,9 @@ struct kvm_clock_data { #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +/* Available with KVM_CAP_MSIX_MMIO */ +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user) /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) + +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) + +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 +struct kvm_msix_mmio_user { + __u32 dev_id; + __u16 type; + __u16 max_entries_nr; + __u64 base_addr; + __u64 base_va; + __u64 flags; + __u64 reserved[4]; +}; + #endif /* __LINUX_KVM_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7d313e0..c10670c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -233,6 +233,27 @@ struct kvm_memslots { KVM_PRIVATE_MEM_SLOTS]; }; +#define KVM_MSIX_MMIO_MAX 32 + +struct kvm_msix_mmio { + u32 dev_id; + u16 type; + u16 max_entries_nr; + u64 flags; + gpa_t table_base_addr; + hva_t table_base_va; + gpa_t pba_base_addr; + hva_t pba_base_va; +}; + +struct kvm_msix_mmio_dev { + struct kvm *kvm; + struct kvm_io_device table_dev; + int mmio_nr; + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; + struct mutex lock; +}; + struct kvm { spinlock_t mmu_lock; raw_spinlock_t requests_lock; @@ -281,6 +302,7 @@ struct kvm { long mmu_notifier_count; #endif long tlbs_dirty; + struct kvm_msix_mmio_dev msix_mmio_dev; }; /* The guest did something we don't support. 
*/ @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, + int assigned_dev_id, int entry, bool mask); + /* For vcpu->arch.iommu_flags */ #define KVM_IOMMU_CACHE_COHERENCY 0x1 diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index ae72ae6..d1598a6 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -18,6 +18,7 @@ #include <linux/interrupt.h> #include <linux/slab.h> #include "irq.h" +#include "msix_mmio.h" static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm, kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); } +static void assigned_device_free_msix_mmio(struct kvm *kvm, + struct kvm_assigned_dev_kernel *adev) +{ + struct kvm_msix_mmio mmio; + + mmio.dev_id = adev->assigned_dev_id; + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | + KVM_MSIX_MMIO_TYPE_BASE_TABLE; + kvm_free_msix_mmio(kvm, &mmio); +} + static void kvm_free_assigned_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *assigned_dev) { kvm_free_assigned_irq(kvm, assigned_dev); + assigned_device_free_msix_mmio(kvm, assigned_dev); + __pci_reset_function(assigned_dev->dev); pci_restore_state(assigned_dev->dev); @@ -785,3 +799,33 @@ out: return r; } +/* The caller should hold kvm->lock */ +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, + int assigned_dev_id, int entry, bool mask) +{ + int r = -EFAULT; + struct kvm_assigned_dev_kernel *adev; + int i; + + if (!irqchip_in_kernel(kvm)) + return r; + + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev_id); + if (!adev) + goto out; + + /* For non-MSIX enabled devices, entries_nr == 0 */ + for (i = 0; i < adev->entries_nr; i++) + if (adev->host_msix_entries[i].entry == entry) { + if (mask) + disable_irq_nosync( + adev->host_msix_entries[i].vector); + else + enable_irq(adev->host_msix_entries[i].vector); + r = 0; + break; + } +out: + return r; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b1b6cbb..b7807c8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -56,6 +56,7 @@ #include "coalesced_mmio.h" #include "async_pf.h" +#include "msix_mmio.h" #define CREATE_TRACE_POINTS #include <trace/events/kvm.h> @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) struct mm_struct *mm = kvm->mm; kvm_arch_sync_events(kvm); + kvm_unregister_msix_mmio_dev(kvm); spin_lock(&kvm_lock); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, mutex_unlock(&kvm->lock); break; #endif + case KVM_REGISTER_MSIX_MMIO: { + struct kvm_msix_mmio_user mmio_user; + + r = -EFAULT; + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) + goto out; + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); + break; + } + case KVM_UNREGISTER_MSIX_MMIO: { + struct kvm_msix_mmio_user mmio_user; + + r = -EFAULT; + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) + goto out; + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); + break; + } default: r = kvm_arch_vm_ioctl(filp, ioctl, arg); if (r == -ENOTTY) @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) return r; } #endif + r = kvm_register_msix_mmio_dev(kvm); + if (r < 0) { + kvm_put_kvm(kvm); + return r; + } + r = anon_inode_getfd("kvm-vm", 
&kvm_vm_fops, kvm, O_RDWR); if (r < 0) kvm_put_kvm(kvm); @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus) int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len, const void *val) { - int i; + int i, r = -EOPNOTSUPP; struct kvm_io_bus *bus; bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); - for (i = 0; i < bus->dev_count; i++) - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) + for (i = 0; i < bus->dev_count; i++) { + r = kvm_iodevice_write(bus->devs[i], addr, len, val); + if (r == -ENOTSYNC) + break; + else if (!r) return 0; - return -EOPNOTSUPP; + } + return r; } /* kvm_io_bus_read - called under kvm->slots_lock */ diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c new file mode 100644 index 0000000..dde9b62 --- /dev/null +++ b/virt/kvm/msix_mmio.c @@ -0,0 +1,286 @@ +/* + * MSI-X MMIO emulation + * + * Copyright (c) 2010 Intel Corporation + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Author: + * Sheng Yang <sheng.yang@intel.com> + */ + +#include <linux/kvm_host.h> +#include <linux/kvm.h> + +#include "msix_mmio.h" +#include "iodev.h" + +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio, + int entry, u32 flag) +{ + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) + return kvm_assigned_device_update_msix_mask_bit(kvm, + mmio->dev_id, entry, flag); + return -EFAULT; +} + +/* Caller must hold dev->lock */ +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, + gpa_t addr, int len) +{ + gpa_t start, end; + int i, r = -EINVAL; + + for (i = 0; i < dev->mmio_nr; i++) { + start = dev->mmio[i].table_base_addr; + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * + dev->mmio[i].max_entries_nr; + if (addr >= start && addr + len <= end) { + r = i; + break; + } + } + + return r; +} + +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, + void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, ret = 0, entry, offset, r; + + mutex_lock(&mmio_dev->lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx < 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr & 0x3) || (len != 4 && len != 8)) + goto out; + + offset = addr & 0xf; + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) + goto out; + + mmio = &mmio_dev->mmio[idx]; + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; + r = copy_from_user(val, (void __user *)(mmio->table_base_va + + entry * PCI_MSIX_ENTRY_SIZE + offset), len); + if (r) + goto out; +out: + mutex_unlock(&mmio_dev->lock); + return ret; +} + +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, + int len, const void *val) +{ + struct kvm_msix_mmio_dev *mmio_dev = + container_of(this, struct kvm_msix_mmio_dev, table_dev); + struct kvm_msix_mmio *mmio; + int idx, entry, offset, ret = 0, r = 0; + gpa_t entry_base; + u32 old_ctrl, new_ctrl; + u32 *ctrl_pos; + + mutex_lock(&mmio_dev->kvm->lock); + mutex_lock(&mmio_dev->lock); + idx = get_mmio_table_index(mmio_dev, addr, len); + if (idx < 0) { + ret = -EOPNOTSUPP; + goto out; + } + if ((addr & 0x3) || (len != 4 && len != 8)) + goto out; + + offset = addr & 0xF; + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) + goto out; + + mmio = &mmio_dev->mmio[idx]; + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; + entry_base = mmio->table_base_va + entry * 
PCI_MSIX_ENTRY_SIZE; + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); + + if (get_user(old_ctrl, ctrl_pos)) + goto out; + + /* No allow writing to other fields when entry is unmasked */ + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) + goto out; + + if (copy_to_user((void __user *)(entry_base + offset), val, len)) + goto out; + + if (get_user(new_ctrl, ctrl_pos)) + goto out; + + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) + ret = -ENOTSYNC; + if (old_ctrl == new_ctrl) + goto out; + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); + if (r || ret) + ret = -ENOTSYNC; +out: + mutex_unlock(&mmio_dev->lock); + mutex_unlock(&mmio_dev->kvm->lock); + return ret; +} + +static const struct kvm_io_device_ops msix_mmio_table_ops = { + .read = msix_table_mmio_read, + .write = msix_table_mmio_write, +}; + +int kvm_register_msix_mmio_dev(struct kvm *kvm) +{ + int ret; + + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops); + mutex_init(&kvm->msix_mmio_dev.lock); + kvm->msix_mmio_dev.kvm = kvm; + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, + &kvm->msix_mmio_dev.table_dev); + mutex_unlock(&kvm->slots_lock); + return ret; +} + +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) +{ + int ret; + + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, + &kvm->msix_mmio_dev.table_dev); + mutex_unlock(&kvm->slots_lock); + return ret; +} + +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user) +{ + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; + struct kvm_msix_mmio *mmio = NULL; + int r = 0, i; + + mutex_lock(&mmio_dev->lock); + for (i = 0; i < mmio_dev->mmio_nr; i++) { + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { + mmio = &mmio_dev->mmio[i]; + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { + r = -EINVAL; + goto out; + } + break; + } + } + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { + r = -EINVAL; + goto out; + } + /* All reserved currently */ + if (mmio_user->flags) { + r = -EINVAL; + goto out; + } + + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { + r = -EINVAL; + goto out; + } + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { + r = -EINVAL; + goto out; + } + + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { + r = -EINVAL; + goto out; + } + if (!mmio) { + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { + r = -ENOSPC; + goto out; + } + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; + } + + mmio_dev->mmio_nr++; + mmio->max_entries_nr = mmio_user->max_entries_nr; + mmio->dev_id = mmio_user->dev_id; + mmio->flags = mmio_user->flags; + + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; + mmio->table_base_addr = 
mmio_user->base_addr; + mmio->table_base_va = mmio_user->base_va; + } +out: + mutex_unlock(&mmio_dev->lock); + return r; +} + +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) +{ + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; + int r = 0, i, j; + bool found = 0; + + if (!mmio) + return 0; + + mutex_lock(&mmio_dev->lock); + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); + for (i = 0; i < mmio_dev->mmio_nr; i++) { + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && + mmio_dev->mmio[i].type == mmio->type) { + found = true; + for (j = i; j < mmio_dev->mmio_nr - 1; j++) + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; + mmio_dev->mmio_nr--; + break; + } + } + if (!found) + r = -EINVAL; + mutex_unlock(&mmio_dev->lock); + return r; +} + +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user) +{ + struct kvm_msix_mmio mmio; + + mmio.dev_id = mmio_user->dev_id; + mmio.type = mmio_user->type; + + return kvm_free_msix_mmio(kvm, &mmio); +} + diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h new file mode 100644 index 0000000..01b6587 --- /dev/null +++ b/virt/kvm/msix_mmio.h @@ -0,0 +1,25 @@ +#ifndef __KVM_MSIX_MMIO_H__ +#define __KVM_MSIX_MMIO_H__ +/* + * MSI-X MMIO emulation + * + * Copyright (c) 2010 Intel Corporation + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Author: + * Sheng Yang <sheng.yang@intel.com> + */ + +#include <linux/pci.h> + +int kvm_register_msix_mmio_dev(struct kvm *kvm); +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user); +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, + struct kvm_msix_mmio_user *mmio_user); +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user); + +#endif -- 1.7.0.1 ^ permalink raw reply related [flat|nested] 20+ messages in thread
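For context on the new exit code: a guest write that changes an entry's address or data fields makes kvm_io_bus_write() return -ENOTSYNC, and the vcpu exits to userspace with KVM_EXIT_MSIX_ROUTING_UPDATE so the interrupt routing can be resynced. A hedged sketch of the matching userspace branch follows; kvm_run is the usual mmap'ed struct kvm_run, and update_msix_routing()/handle_mmio() are hypothetical helpers, not part of this series:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Sketch of a vcpu run loop handling the new exit reason. */
static void run_vcpu(int vcpu_fd, struct kvm_run *kvm_run)
{
	for (;;) {
		if (ioctl(vcpu_fd, KVM_RUN, 0) < 0)
			break;
		switch (kvm_run->exit_reason) {
		case KVM_EXIT_MSIX_ROUTING_UPDATE:
			/* the write already reached the table backing
			 * store in the kernel; only the MSI-X routing
			 * for the touched entry needs a refresh
			 * (hypothetical helper) */
			update_msix_routing(kvm_run->mmio.phys_addr,
					    kvm_run->mmio.len);
			break;
		case KVM_EXIT_MMIO:
			handle_mmio(kvm_run);	/* pre-existing path */
			break;
		default:
			break;
		}
	}
}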
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-01-30 5:11 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang @ 2011-02-03 1:05 ` Marcelo Tosatti 2011-02-18 8:15 ` Sheng Yang 2011-02-23 0:19 ` Alex Williamson 1 sibling, 1 reply; 20+ messages in thread From: Marcelo Tosatti @ 2011-02-03 1:05 UTC (permalink / raw) To: Sheng Yang; +Cc: Avi Kivity, Michael S. Tsirkin, kvm, Alex Williamson On Sun, Jan 30, 2011 at 01:11:15PM +0800, Sheng Yang wrote: > Then we can support mask bit operation of assigned devices now. > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > + struct kvm_msix_mmio_user *mmio_user) > +{ > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > + struct kvm_msix_mmio *mmio = NULL; > + int r = 0, i; > + > + mutex_lock(&mmio_dev->lock); > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > + mmio = &mmio_dev->mmio[i]; > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > + r = -EINVAL; > + goto out; > + } > + break; > + } Why allow this ioctl to succeed if there's an entry already present? This case is broken as mmio_dev->mmio_nr is increased below. PCI bits must be reviewed... ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-03 1:05 ` Marcelo Tosatti @ 2011-02-18 8:15 ` Sheng Yang 2011-02-22 17:58 ` Marcelo Tosatti 0 siblings, 1 reply; 20+ messages in thread From: Sheng Yang @ 2011-02-18 8:15 UTC (permalink / raw) To: Marcelo Tosatti; +Cc: Avi Kivity, Michael S. Tsirkin, kvm, Alex Williamson On Thursday 03 February 2011 09:05:55 Marcelo Tosatti wrote: > On Sun, Jan 30, 2011 at 01:11:15PM +0800, Sheng Yang wrote: > > Then we can support mask bit operation of assigned devices now. > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > + struct kvm_msix_mmio_user *mmio_user) > > +{ > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > + struct kvm_msix_mmio *mmio = NULL; > > + int r = 0, i; > > + > > + mutex_lock(&mmio_dev->lock); > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > + mmio = &mmio_dev->mmio[i]; > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > + r = -EINVAL; > > + goto out; > > + } > > + break; > > + } > > Why allow this ioctl to succeed if there's an entry already present? > This case is broken as mmio_dev->mmio_nr is increased below. Oh, it's a bug to let mmio_nr increase even when the MMIO entry is found. I've fixed it. The reason we allow multiple calls is that userspace can register different types of addresses here (table address and PBA address). > PCI bits must be reviewed... Pardon? PCI-related things are already in 2.6.38-rc. -- regards Yang, Sheng ^ permalink raw reply [flat|nested] 20+ messages in thread
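For clarity, the fix Sheng describes amounts to consuming a slot only when a genuinely new entry is allocated, roughly like this against the v8 code above (a sketch, not the actual v9 diff):

	if (!mmio) {
		if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) {
			r = -ENOSPC;
			goto out;
		}
		/* only a new registration bumps the count;
		 * re-registering the same dev_id with another base
		 * type reuses its existing entry */
		mmio = &mmio_dev->mmio[mmio_dev->mmio_nr++];
	}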
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-18 8:15 ` Sheng Yang @ 2011-02-22 17:58 ` Marcelo Tosatti 0 siblings, 0 replies; 20+ messages in thread From: Marcelo Tosatti @ 2011-02-22 17:58 UTC (permalink / raw) To: Sheng Yang; +Cc: Avi Kivity, Michael S. Tsirkin, kvm, Alex Williamson On Fri, Feb 18, 2011 at 04:15:38PM +0800, Sheng Yang wrote: > > Why allow this ioctl to succeed if there's an entry already present? > > This case is broken as mmio_dev->mmio_nr is increased below. > > Oh, it's a bug to let mmio_nr increase even when the MMIO entry is found. I've fixed it. > > The reason we allow multiple calls is that userspace can register different types of > addresses here (table address and PBA address). Ah OK. > > PCI bits must be reviewed... > > Pardon? PCI-related things are already in 2.6.38-rc. I meant someone more familiar with PCI should review this patch, Michael/Alex, CC'ed. ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-01-30 5:11 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang 2011-02-03 1:05 ` Marcelo Tosatti @ 2011-02-23 0:19 ` Alex Williamson 2011-02-23 6:59 ` Sheng Yang 1 sibling, 1 reply; 20+ messages in thread From: Alex Williamson @ 2011-02-23 0:19 UTC (permalink / raw) To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Michael S. Tsirkin, kvm On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > Then we can support mask bit operation of assigned devices now. Looks pretty good overall. A few comments below. It seems like we should be able to hook this into vfio with a small stub in kvm. We just need to be able to communicate disabling and enabling of individual msix vectors. For brute force, we could do this with a lot of eventfds, triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure if there's something smaller that could do it. Thanks, Alex > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > --- > arch/x86/kvm/Makefile | 2 +- > arch/x86/kvm/x86.c | 8 +- > include/linux/kvm.h | 21 ++++ > include/linux/kvm_host.h | 25 ++++ > virt/kvm/assigned-dev.c | 44 +++++++ > virt/kvm/kvm_main.c | 38 ++++++- > virt/kvm/msix_mmio.c | 286 ++++++++++++++++++++++++++++++++++++++++++++++ > virt/kvm/msix_mmio.h | 25 ++++ > 8 files changed, 442 insertions(+), 7 deletions(-) > create mode 100644 virt/kvm/msix_mmio.c > create mode 100644 virt/kvm/msix_mmio.h > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > index f15501f..3a0d851 100644 > --- a/arch/x86/kvm/Makefile > +++ b/arch/x86/kvm/Makefile > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > coalesced_mmio.o irq_comm.o eventfd.o \ > - assigned-dev.o) > + assigned-dev.o msix_mmio.o) > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o) > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index fa708c9..89bf12c 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > case KVM_CAP_X86_ROBUST_SINGLESTEP: > case KVM_CAP_XSAVE: > case KVM_CAP_ASYNC_PF: > + case KVM_CAP_MSIX_MMIO: > r = 1; > break; > case KVM_CAP_COALESCED_MMIO: > @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, > struct kvm_vcpu *vcpu) > { > gpa_t gpa; > + int r; > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > @@ -3822,14 +3824,16 @@ static int emulator_write_emulated_onepage(unsigned long addr, > > mmio: > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > /* > * Is this MMIO handled locally? > */ > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > + if (!r) > return X86EMUL_CONTINUE; > > vcpu->mmio_needed = 1; > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? 
> + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > index ea2dc1a..ad9df4b 100644 > --- a/include/linux/kvm.h > +++ b/include/linux/kvm.h > @@ -161,6 +161,7 @@ struct kvm_pit_config { > #define KVM_EXIT_NMI 16 > #define KVM_EXIT_INTERNAL_ERROR 17 > #define KVM_EXIT_OSI 18 > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > /* For KVM_EXIT_INTERNAL_ERROR */ > #define KVM_INTERNAL_ERROR_EMULATION 1 > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > #define KVM_CAP_PPC_GET_PVINFO 57 > #define KVM_CAP_PPC_IRQ_LEVEL 58 > #define KVM_CAP_ASYNC_PF 59 > +#define KVM_CAP_MSIX_MMIO 60 > > #ifdef KVM_CAP_IRQ_ROUTING > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) > #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) > #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) > +/* Available with KVM_CAP_MSIX_MMIO */ > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct kvm_msix_mmio_user) > +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user) > /* Available with KVM_CAP_PIT_STATE2 */ > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) > #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > __u16 padding[3]; > }; > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > + > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > + > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > +struct kvm_msix_mmio_user { > + __u32 dev_id; > + __u16 type; > + __u16 max_entries_nr; > + __u64 base_addr; > + __u64 base_va; > + __u64 flags; > + __u64 reserved[4]; > +}; > + > #endif /* __LINUX_KVM_H */ > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > index 7d313e0..c10670c 100644 > --- a/include/linux/kvm_host.h > +++ b/include/linux/kvm_host.h > @@ -233,6 +233,27 @@ struct kvm_memslots { > KVM_PRIVATE_MEM_SLOTS]; > }; > > +#define KVM_MSIX_MMIO_MAX 32 I assume this is based on 32 devices/bus, but we could exceed this is we add more buses or make use of more functions. How hard is it to make this dynamic? > + > +struct kvm_msix_mmio { > + u32 dev_id; > + u16 type; > + u16 max_entries_nr; > + u64 flags; > + gpa_t table_base_addr; > + hva_t table_base_va; > + gpa_t pba_base_addr; > + hva_t pba_base_va; > +}; > + > +struct kvm_msix_mmio_dev { > + struct kvm *kvm; > + struct kvm_io_device table_dev; > + int mmio_nr; > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > + struct mutex lock; > +}; > + > struct kvm { > spinlock_t mmu_lock; > raw_spinlock_t requests_lock; > @@ -281,6 +302,7 @@ struct kvm { > long mmu_notifier_count; > #endif > long tlbs_dirty; > + struct kvm_msix_mmio_dev msix_mmio_dev; > }; > > /* The guest did something we don't support. 
*/ > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, > int kvm_request_irq_source_id(struct kvm *kvm); > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > + int assigned_dev_id, int entry, bool mask); > + > /* For vcpu->arch.iommu_flags */ > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > index ae72ae6..d1598a6 100644 > --- a/virt/kvm/assigned-dev.c > +++ b/virt/kvm/assigned-dev.c > @@ -18,6 +18,7 @@ > #include <linux/interrupt.h> > #include <linux/slab.h> > #include "irq.h" > +#include "msix_mmio.h" > > static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, > int assigned_dev_id) > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm, > kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); > } > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > + struct kvm_assigned_dev_kernel *adev) > +{ > + struct kvm_msix_mmio mmio; > + > + mmio.dev_id = adev->assigned_dev_id; > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > + kvm_free_msix_mmio(kvm, &mmio); > +} > + > static void kvm_free_assigned_device(struct kvm *kvm, > struct kvm_assigned_dev_kernel > *assigned_dev) > { > kvm_free_assigned_irq(kvm, assigned_dev); > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > + > __pci_reset_function(assigned_dev->dev); > pci_restore_state(assigned_dev->dev); > > @@ -785,3 +799,33 @@ out: > return r; > } > > +/* The caller should hold kvm->lock */ > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > + int assigned_dev_id, int entry, bool mask) > +{ > + int r = -EFAULT; > + struct kvm_assigned_dev_kernel *adev; > + int i; > + > + if (!irqchip_in_kernel(kvm)) > + return r; > + > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > + assigned_dev_id); > + if (!adev) > + goto out; > + > + /* For non-MSIX enabled devices, entries_nr == 0 */ > + for (i = 0; i < adev->entries_nr; i++) > + if (adev->host_msix_entries[i].entry == entry) { > + if (mask) > + disable_irq_nosync( > + adev->host_msix_entries[i].vector); > + else > + enable_irq(adev->host_msix_entries[i].vector); > + r = 0; > + break; > + } > +out: > + return r; > +} > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > index b1b6cbb..b7807c8 100644 > --- a/virt/kvm/kvm_main.c > +++ b/virt/kvm/kvm_main.c > @@ -56,6 +56,7 @@ > > #include "coalesced_mmio.h" > #include "async_pf.h" > +#include "msix_mmio.h" > > #define CREATE_TRACE_POINTS > #include <trace/events/kvm.h> > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > struct mm_struct *mm = kvm->mm; > > kvm_arch_sync_events(kvm); > + kvm_unregister_msix_mmio_dev(kvm); > spin_lock(&kvm_lock); > list_del(&kvm->vm_list); > spin_unlock(&kvm_lock); > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, > mutex_unlock(&kvm->lock); > break; > #endif > + case KVM_REGISTER_MSIX_MMIO: { > + struct kvm_msix_mmio_user mmio_user; > + > + r = -EFAULT; > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > + goto out; > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > + break; > + } > + case KVM_UNREGISTER_MSIX_MMIO: { > + struct kvm_msix_mmio_user mmio_user; > + > + r = -EFAULT; > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > + goto out; > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > + break; > + } > default: > r = 
kvm_arch_vm_ioctl(filp, ioctl, arg); > if (r == -ENOTTY) > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > return r; > } > #endif > + r = kvm_register_msix_mmio_dev(kvm); > + if (r < 0) { > + kvm_put_kvm(kvm); > + return r; > + } > + > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > if (r < 0) > kvm_put_kvm(kvm); > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus *bus) > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, > int len, const void *val) > { > - int i; > + int i, r = -EOPNOTSUPP; > struct kvm_io_bus *bus; > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > - for (i = 0; i < bus->dev_count; i++) > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > + for (i = 0; i < bus->dev_count; i++) { > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > + if (r == -ENOTSYNC) > + break; > + else if (!r) > return 0; > - return -EOPNOTSUPP; > + } > + return r; > } > > /* kvm_io_bus_read - called under kvm->slots_lock */ > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > new file mode 100644 > index 0000000..dde9b62 > --- /dev/null > +++ b/virt/kvm/msix_mmio.c > @@ -0,0 +1,286 @@ > +/* > + * MSI-X MMIO emulation > + * > + * Copyright (c) 2010 Intel Corporation > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. > + * > + * Author: > + * Sheng Yang <sheng.yang@intel.com> > + */ > + > +#include <linux/kvm_host.h> > +#include <linux/kvm.h> > + > +#include "msix_mmio.h" > +#include "iodev.h" > + > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio *mmio, > + int entry, u32 flag) > +{ > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > + return kvm_assigned_device_update_msix_mask_bit(kvm, > + mmio->dev_id, entry, flag); > + return -EFAULT; > +} > + > +/* Caller must hold dev->lock */ > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > + gpa_t addr, int len) > +{ > + gpa_t start, end; > + int i, r = -EINVAL; > + > + for (i = 0; i < dev->mmio_nr; i++) { > + start = dev->mmio[i].table_base_addr; > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > + dev->mmio[i].max_entries_nr; > + if (addr >= start && addr + len <= end) { > + r = i; > + break; > + } Yet another potential user of the weight balanced tree ;) > + } > + > + return r; > +} > + > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, int len, > + void *val) > +{ > + struct kvm_msix_mmio_dev *mmio_dev = > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > + struct kvm_msix_mmio *mmio; > + int idx, ret = 0, entry, offset, r; > + > + mutex_lock(&mmio_dev->lock); > + idx = get_mmio_table_index(mmio_dev, addr, len); > + if (idx < 0) { > + ret = -EOPNOTSUPP; > + goto out; > + } > + if ((addr & 0x3) || (len != 4 && len != 8)) > + goto out; > + > + offset = addr & 0xf; > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > + goto out; > + > + mmio = &mmio_dev->mmio[idx]; > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > + if (r) > + goto out; > +out: > + mutex_unlock(&mmio_dev->lock); > + return ret; > +} > + > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > + int len, const void *val) > +{ > + struct kvm_msix_mmio_dev *mmio_dev = > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > + struct kvm_msix_mmio 
*mmio; > + int idx, entry, offset, ret = 0, r = 0; > + gpa_t entry_base; > + u32 old_ctrl, new_ctrl; > + u32 *ctrl_pos; > + > + mutex_lock(&mmio_dev->kvm->lock); > + mutex_lock(&mmio_dev->lock); > + idx = get_mmio_table_index(mmio_dev, addr, len); > + if (idx < 0) { > + ret = -EOPNOTSUPP; > + goto out; > + } > + if ((addr & 0x3) || (len != 4 && len != 8)) > + goto out; > + > + offset = addr & 0xF; nit, this 0xf above. > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > + goto out; > + > + mmio = &mmio_dev->mmio[idx]; > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > + > + if (get_user(old_ctrl, ctrl_pos)) > + goto out; > + > + /* No allow writing to other fields when entry is unmasked */ > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > + goto out; > + > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > + goto out; > + > + if (get_user(new_ctrl, ctrl_pos)) > + goto out; > + > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > + ret = -ENOTSYNC; Couldn't we avoid the above get_user(new_ctrl,...) if we tested this first? I'd also prefer to see a direct goto out here since it's not entirely obvious that this first 'if' always falls into the below 'if'. I'm not a fan of using an error code as a special non-error return either. > + if (old_ctrl == new_ctrl) > + goto out; > + > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); An xor would remove some redundancy here if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)); > + if (r || ret) > + ret = -ENOTSYNC; > +out: > + mutex_unlock(&mmio_dev->lock); > + mutex_unlock(&mmio_dev->kvm->lock); > + return ret; > +} > + > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > + .read = msix_table_mmio_read, > + .write = msix_table_mmio_write, > +}; > + > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > +{ > + int ret; > + > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops); > + mutex_init(&kvm->msix_mmio_dev.lock); > + kvm->msix_mmio_dev.kvm = kvm; > + mutex_lock(&kvm->slots_lock); > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > + &kvm->msix_mmio_dev.table_dev); > + mutex_unlock(&kvm->slots_lock); > + return ret; > +} > + > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > +{ > + int ret; > + > + mutex_lock(&kvm->slots_lock); > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > + &kvm->msix_mmio_dev.table_dev); > + mutex_unlock(&kvm->slots_lock); > + return ret; > +} > + > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > + struct kvm_msix_mmio_user *mmio_user) > +{ > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > + struct kvm_msix_mmio *mmio = NULL; > + int r = 0, i; > + > + mutex_lock(&mmio_dev->lock); > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > + (mmio_user->type & 
KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > + mmio = &mmio_dev->mmio[i]; > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > + r = -EINVAL; > + goto out; > + } > + break; > + } > + } > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > + r = -EINVAL; > + goto out; > + } > + /* All reserved currently */ > + if (mmio_user->flags) { > + r = -EINVAL; > + goto out; > + } > + > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > + r = -EINVAL; > + goto out; > + } > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > + r = -EINVAL; > + goto out; > + } > + > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > + r = -EINVAL; > + goto out; > + } > + if (!mmio) { > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > + r = -ENOSPC; > + goto out; > + } > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > + } > + > + mmio_dev->mmio_nr++; > + mmio->max_entries_nr = mmio_user->max_entries_nr; > + mmio->dev_id = mmio_user->dev_id; > + mmio->flags = mmio_user->flags; > + > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > + mmio->table_base_addr = mmio_user->base_addr; > + mmio->table_base_va = mmio_user->base_va; > + } > +out: > + mutex_unlock(&mmio_dev->lock); > + return r; > +} > + > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) > +{ > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > + int r = 0, i, j; set r = -EINVAL here > + bool found = 0; > + > + if (!mmio) > + return 0; > + > + mutex_lock(&mmio_dev->lock); > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > + mmio_dev->mmio[i].type == mmio->type) { > + found = true; set r = 0 here > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > + mmio_dev->mmio_nr--; > + break; > + } > + } > + if (!found) > + r = -EINVAL; then get rid of found > + mutex_unlock(&mmio_dev->lock); > + return r; > +} > + > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > + struct kvm_msix_mmio_user *mmio_user) > +{ > + struct kvm_msix_mmio mmio; > + > + mmio.dev_id = mmio_user->dev_id; > + mmio.type = mmio_user->type; > + > + return kvm_free_msix_mmio(kvm, &mmio); > +} > + > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > new file mode 100644 > index 0000000..01b6587 > --- /dev/null > +++ b/virt/kvm/msix_mmio.h > @@ -0,0 +1,25 @@ > +#ifndef __KVM_MSIX_MMIO_H__ > +#define __KVM_MSIX_MMIO_H__ > +/* > + * MSI-X MMIO emulation > + * > + * Copyright (c) 2010 Intel Corporation > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. 
> + * > + * Author: > + * Sheng Yang <sheng.yang@intel.com> > + */ > + > +#include <linux/pci.h> > + > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > + struct kvm_msix_mmio_user *mmio_user); > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > + struct kvm_msix_mmio_user *mmio_user); > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio_user); > + > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
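To make the brute-force option Alex mentions concrete: two eventfds per MSI-X vector, one signalled by kvm when the guest masks the entry and one when it unmasks, with vfio polling both and toggling the host interrupt. Everything below is a hypothetical sketch of that idea, not an existing kvm or vfio API:

#include <sys/eventfd.h>
#include <poll.h>
#include <unistd.h>
#include <stdint.h>

struct msix_vector_ctl {
	int mask_fd;	/* kvm signals this when the guest sets the mask bit */
	int unmask_fd;	/* kvm signals this when the guest clears it */
};

/* fds would be created with eventfd(0, 0) and handed to kvm;
 * the vfio side consumes them in a loop like this. */
static void poll_vector(struct msix_vector_ctl *v)
{
	struct pollfd fds[2] = {
		{ .fd = v->mask_fd,   .events = POLLIN },
		{ .fd = v->unmask_fd, .events = POLLIN },
	};
	uint64_t cnt;

	while (poll(fds, 2, -1) > 0) {
		if (fds[0].revents & POLLIN) {
			read(v->mask_fd, &cnt, sizeof(cnt));
			/* vfio side would disable the host vector here */
		}
		if (fds[1].revents & POLLIN) {
			read(v->unmask_fd, &cnt, sizeof(cnt));
			/* vfio side would re-enable the host vector here */
		}
	}
}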
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 0:19 ` Alex Williamson @ 2011-02-23 6:59 ` Sheng Yang 2011-02-23 8:45 ` Michael S. Tsirkin 2011-02-23 16:34 ` Alex Williamson 0 siblings, 2 replies; 20+ messages in thread From: Sheng Yang @ 2011-02-23 6:59 UTC (permalink / raw) To: Alex Williamson; +Cc: Avi Kivity, Marcelo Tosatti, Michael S. Tsirkin, kvm On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > Then we can support mask bit operation of assigned devices now. > > Looks pretty good overall. A few comments below. It seems like we > should be able to hook this into vfio with a small stub in kvm. We just > need to be able to communicate disabling and enabling of individual msix > vectors. For brute force, we could do this with a lot of eventfds, > triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure > if there's something smaller that could do it. Thanks, Alex, thanks for your comments. See my comments below: > > Alex > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > --- > > > > arch/x86/kvm/Makefile | 2 +- > > arch/x86/kvm/x86.c | 8 +- > > include/linux/kvm.h | 21 ++++ > > include/linux/kvm_host.h | 25 ++++ > > virt/kvm/assigned-dev.c | 44 +++++++ > > virt/kvm/kvm_main.c | 38 ++++++- > > virt/kvm/msix_mmio.c | 286 > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > | 25 ++++ > > 8 files changed, 442 insertions(+), 7 deletions(-) > > create mode 100644 virt/kvm/msix_mmio.c > > create mode 100644 virt/kvm/msix_mmio.h > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > index f15501f..3a0d851 100644 > > --- a/arch/x86/kvm/Makefile > > +++ b/arch/x86/kvm/Makefile > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > - assigned-dev.o) > > + assigned-dev.o msix_mmio.o) > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > async_pf.o) > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index fa708c9..89bf12c 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > case KVM_CAP_XSAVE: > > > > case KVM_CAP_ASYNC_PF: > > + case KVM_CAP_MSIX_MMIO: > > r = 1; > > break; > > > > case KVM_CAP_COALESCED_MMIO: > > @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned > > long addr, > > > > struct kvm_vcpu *vcpu) > > > > { > > > > gpa_t gpa; > > > > + int r; > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > @@ -3822,14 +3824,16 @@ static int > > emulator_write_emulated_onepage(unsigned long addr, > > > > mmio: > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > /* > > > > * Is this MMIO handled locally? > > */ > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > + if (!r) > > > > return X86EMUL_CONTINUE; > > > > vcpu->mmio_needed = 1; > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? 
> > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > index ea2dc1a..ad9df4b 100644 > > --- a/include/linux/kvm.h > > +++ b/include/linux/kvm.h > > @@ -161,6 +161,7 @@ struct kvm_pit_config { > > > > #define KVM_EXIT_NMI 16 > > #define KVM_EXIT_INTERNAL_ERROR 17 > > #define KVM_EXIT_OSI 18 > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > #define KVM_INTERNAL_ERROR_EMULATION 1 > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > #define KVM_CAP_ASYNC_PF 59 > > > > +#define KVM_CAP_MSIX_MMIO 60 > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, > > 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > > > +/* Available with KVM_CAP_MSIX_MMIO */ > > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, > > 0x7e, struct kvm_msix_mmio_user) > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, > > struct kvm_pit_state2) > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > > > > __u16 padding[3]; > > > > }; > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > > + > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > > + > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > > +struct kvm_msix_mmio_user { > > + __u32 dev_id; > > + __u16 type; > > + __u16 max_entries_nr; > > + __u64 base_addr; > > + __u64 base_va; > > + __u64 flags; > > + __u64 reserved[4]; > > +}; > > + > > > > #endif /* __LINUX_KVM_H */ > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > index 7d313e0..c10670c 100644 > > --- a/include/linux/kvm_host.h > > +++ b/include/linux/kvm_host.h > > @@ -233,6 +233,27 @@ struct kvm_memslots { > > > > KVM_PRIVATE_MEM_SLOTS]; > > > > }; > > > > +#define KVM_MSIX_MMIO_MAX 32 > > I assume this is based on 32 devices/bus, but we could exceed this is we > add more buses or make use of more functions. How hard is it to make > this dynamic? We can improve this later(maybe with balance tree?). 32 should be more than sufficient for now. And this patchset has been in the mailing list for months, I really want to get it in first... > > + > > +struct kvm_msix_mmio { > > + u32 dev_id; > > + u16 type; > > + u16 max_entries_nr; > > + u64 flags; > > + gpa_t table_base_addr; > > + hva_t table_base_va; > > + gpa_t pba_base_addr; > > + hva_t pba_base_va; > > +}; > > + > > +struct kvm_msix_mmio_dev { > > + struct kvm *kvm; > > + struct kvm_io_device table_dev; > > + int mmio_nr; > > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > > + struct mutex lock; > > +}; > > + > > > > struct kvm { > > > > spinlock_t mmu_lock; > > raw_spinlock_t requests_lock; > > > > @@ -281,6 +302,7 @@ struct kvm { > > > > long mmu_notifier_count; > > > > #endif > > > > long tlbs_dirty; > > > > + struct kvm_msix_mmio_dev msix_mmio_dev; > > > > }; > > > > /* The guest did something we don't support. 
*/ > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, > > > > int kvm_request_irq_source_id(struct kvm *kvm); > > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > + int assigned_dev_id, int entry, bool mask); > > + > > > > /* For vcpu->arch.iommu_flags */ > > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > index ae72ae6..d1598a6 100644 > > --- a/virt/kvm/assigned-dev.c > > +++ b/virt/kvm/assigned-dev.c > > @@ -18,6 +18,7 @@ > > > > #include <linux/interrupt.h> > > #include <linux/slab.h> > > #include "irq.h" > > > > +#include "msix_mmio.h" > > > > static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct > > list_head *head, > > > > int assigned_dev_id) > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm, > > > > kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); > > > > } > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > > + struct kvm_assigned_dev_kernel *adev) > > +{ > > + struct kvm_msix_mmio mmio; > > + > > + mmio.dev_id = adev->assigned_dev_id; > > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > + kvm_free_msix_mmio(kvm, &mmio); > > +} > > + > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > struct kvm_assigned_dev_kernel > > *assigned_dev) > > > > { > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > > + > > > > __pci_reset_function(assigned_dev->dev); > > pci_restore_state(assigned_dev->dev); > > > > @@ -785,3 +799,33 @@ out: > > return r; > > > > } > > > > +/* The caller should hold kvm->lock */ > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > + int assigned_dev_id, int entry, bool mask) > > +{ > > + int r = -EFAULT; > > + struct kvm_assigned_dev_kernel *adev; > > + int i; > > + > > + if (!irqchip_in_kernel(kvm)) > > + return r; > > + > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > + assigned_dev_id); > > + if (!adev) > > + goto out; > > + > > + /* For non-MSIX enabled devices, entries_nr == 0 */ > > + for (i = 0; i < adev->entries_nr; i++) > > + if (adev->host_msix_entries[i].entry == entry) { > > + if (mask) > > + disable_irq_nosync( > > + adev->host_msix_entries[i].vector); > > + else > > + enable_irq(adev->host_msix_entries[i].vector); > > + r = 0; > > + break; > > + } > > +out: > > + return r; > > +} > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > index b1b6cbb..b7807c8 100644 > > --- a/virt/kvm/kvm_main.c > > +++ b/virt/kvm/kvm_main.c > > @@ -56,6 +56,7 @@ > > > > #include "coalesced_mmio.h" > > #include "async_pf.h" > > > > +#include "msix_mmio.h" > > > > #define CREATE_TRACE_POINTS > > #include <trace/events/kvm.h> > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > > > > struct mm_struct *mm = kvm->mm; > > > > kvm_arch_sync_events(kvm); > > > > + kvm_unregister_msix_mmio_dev(kvm); > > > > spin_lock(&kvm_lock); > > list_del(&kvm->vm_list); > > spin_unlock(&kvm_lock); > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, > > > > mutex_unlock(&kvm->lock); > > break; > > > > #endif > > > > + case KVM_REGISTER_MSIX_MMIO: { > > + struct kvm_msix_mmio_user mmio_user; > > + > > + r = -EFAULT; > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > + goto out; > > + r = 
kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > > + break; > > + } > > + case KVM_UNREGISTER_MSIX_MMIO: { > > + struct kvm_msix_mmio_user mmio_user; > > + > > + r = -EFAULT; > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > + goto out; > > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > > + break; > > + } > > > > default: > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > if (r == -ENOTTY) > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > > > > return r; > > > > } > > > > #endif > > > > + r = kvm_register_msix_mmio_dev(kvm); > > + if (r < 0) { > > + kvm_put_kvm(kvm); > > + return r; > > + } > > + > > > > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > > if (r < 0) > > > > kvm_put_kvm(kvm); > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus > > *bus) > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, > > > > int len, const void *val) > > > > { > > > > - int i; > > + int i, r = -EOPNOTSUPP; > > > > struct kvm_io_bus *bus; > > > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > > > > - for (i = 0; i < bus->dev_count; i++) > > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > > + for (i = 0; i < bus->dev_count; i++) { > > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > > + if (r == -ENOTSYNC) > > + break; > > + else if (!r) > > > > return 0; > > > > - return -EOPNOTSUPP; > > + } > > + return r; > > > > } > > > > /* kvm_io_bus_read - called under kvm->slots_lock */ > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > > new file mode 100644 > > index 0000000..dde9b62 > > --- /dev/null > > +++ b/virt/kvm/msix_mmio.c > > @@ -0,0 +1,286 @@ > > +/* > > + * MSI-X MMIO emulation > > + * > > + * Copyright (c) 2010 Intel Corporation > > + * > > + * This work is licensed under the terms of the GNU GPL, version 2. See > > + * the COPYING file in the top-level directory. > > + * > > + * Author: > > + * Sheng Yang <sheng.yang@intel.com> > > + */ > > + > > +#include <linux/kvm_host.h> > > +#include <linux/kvm.h> > > + > > +#include "msix_mmio.h" > > +#include "iodev.h" > > + > > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio > > *mmio, + int entry, u32 flag) > > +{ > > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > + return kvm_assigned_device_update_msix_mask_bit(kvm, > > + mmio->dev_id, entry, flag); > > + return -EFAULT; > > +} > > + > > +/* Caller must hold dev->lock */ > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > > + gpa_t addr, int len) > > +{ > > + gpa_t start, end; > > + int i, r = -EINVAL; > > + > > + for (i = 0; i < dev->mmio_nr; i++) { > > + start = dev->mmio[i].table_base_addr; > > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > > + dev->mmio[i].max_entries_nr; > > + if (addr >= start && addr + len <= end) { > > + r = i; > > + break; > > + } > > Yet another potential user of the weight balanced tree ;) Yeah, we can improve it later. 
> > > + } > > + > > + return r; > > +} > > + > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, > > int len, + void *val) > > +{ > > + struct kvm_msix_mmio_dev *mmio_dev = > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > + struct kvm_msix_mmio *mmio; > > + int idx, ret = 0, entry, offset, r; > > + > > + mutex_lock(&mmio_dev->lock); > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > + if (idx < 0) { > > + ret = -EOPNOTSUPP; > > + goto out; > > + } > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > + goto out; > > + > > + offset = addr & 0xf; > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > + goto out; > > + > > + mmio = &mmio_dev->mmio[idx]; > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > + if (r) > > + goto out; > > +out: > > + mutex_unlock(&mmio_dev->lock); > > + return ret; > > +} > > + > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > > + int len, const void *val) > > +{ > > + struct kvm_msix_mmio_dev *mmio_dev = > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > + struct kvm_msix_mmio *mmio; > > + int idx, entry, offset, ret = 0, r = 0; > > + gpa_t entry_base; > > + u32 old_ctrl, new_ctrl; > > + u32 *ctrl_pos; > > + > > + mutex_lock(&mmio_dev->kvm->lock); > > + mutex_lock(&mmio_dev->lock); > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > + if (idx < 0) { > > + ret = -EOPNOTSUPP; > > + goto out; > > + } > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > + goto out; > > + > > + offset = addr & 0xF; > > nit, this 0xf above. So you like another mask? > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > + goto out; > > + > > + mmio = &mmio_dev->mmio[idx]; > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > + > > + if (get_user(old_ctrl, ctrl_pos)) > > + goto out; > > + > > + /* No allow writing to other fields when entry is unmasked */ > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > + goto out; > > + > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > + goto out; > > + > > + if (get_user(new_ctrl, ctrl_pos)) > > + goto out; > > + > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > + ret = -ENOTSYNC; > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > first? I'd also prefer to see a direct goto out here since it's not > entirely obvious that this first 'if' always falls into the below 'if'. > I'm not a fan of using an error code as a special non-error return > either. In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to see if they indeed modified ctrl bits. I would try to make the logic here more clear. 
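One way the clarified flow might look (sketch only, reusing the patch's variable names; hits_ctrl is a made-up local meaning "this write can reach the vector control dword"):

	bool hits_ctrl = offset + len > PCI_MSIX_ENTRY_VECTOR_CTRL;

	if (get_user(old_ctrl, ctrl_pos))
		goto out;
	/* only the vector control dword may be written while unmasked */
	if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) &&
	    offset != PCI_MSIX_ENTRY_VECTOR_CTRL)
		goto out;
	if (copy_to_user((void __user *)(entry_base + offset), val, len))
		goto out;
	/* re-read ctrl only when this write could actually have hit it:
	 * a dword write at offset 12, or a qword write at offset 8 */
	if (hits_ctrl && get_user(new_ctrl, ctrl_pos))
		goto out;

The mask-bit comparison that follows would then run only under hits_ctrl, avoiding the second get_user() for plain address/data writes.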
> > > + if (old_ctrl == new_ctrl) > > + goto out; > > + > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); > > An xor would remove some redundancy here > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm, > mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)); Oh, yes, sure... > > > + if (r || ret) > > + ret = -ENOTSYNC; > > +out: > > + mutex_unlock(&mmio_dev->lock); > > + mutex_unlock(&mmio_dev->kvm->lock); > > + return ret; > > +} > > + > > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > > + .read = msix_table_mmio_read, > > + .write = msix_table_mmio_write, > > +}; > > + > > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > > +{ > > + int ret; > > + > > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops); > > + mutex_init(&kvm->msix_mmio_dev.lock); > > + kvm->msix_mmio_dev.kvm = kvm; > > + mutex_lock(&kvm->slots_lock); > > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > + &kvm->msix_mmio_dev.table_dev); > > + mutex_unlock(&kvm->slots_lock); > > + return ret; > > +} > > + > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > > +{ > > + int ret; > > + > > + mutex_lock(&kvm->slots_lock); > > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > + &kvm->msix_mmio_dev.table_dev); > > + mutex_unlock(&kvm->slots_lock); > > + return ret; > > +} > > + > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > + struct kvm_msix_mmio_user *mmio_user) > > +{ > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > + struct kvm_msix_mmio *mmio = NULL; > > + int r = 0, i; > > + > > + mutex_lock(&mmio_dev->lock); > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > + mmio = &mmio_dev->mmio[i]; > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > + r = -EINVAL; > > + goto out; > > + } > > + break; > > + } > > + } > > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > + r = -EINVAL; > > + goto out; > > + } > > + /* All reserved currently */ > > + if (mmio_user->flags) { > > + r = -EINVAL; > > + goto out; > > + } > > + > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > > + r = -EINVAL; > > + goto out; > > + } > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > + r = -EINVAL; > > + goto out; > > + } > > + > > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > > + r = -EINVAL; > > + goto out; > > + } > > + if (!mmio) { > > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > > + r = -ENOSPC; > > + goto out; > > + } > > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > > + } > > + > > + mmio_dev->mmio_nr++; > > + mmio->max_entries_nr = mmio_user->max_entries_nr; > > + mmio->dev_id = mmio_user->dev_id; > > + mmio->flags = mmio_user->flags; > > + > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > > + if ((mmio_user->type 
& KVM_MSIX_MMIO_TYPE_BASE_MASK) == > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > + mmio->table_base_addr = mmio_user->base_addr; > > + mmio->table_base_va = mmio_user->base_va; > > + } > > +out: > > + mutex_unlock(&mmio_dev->lock); > > + return r; > > +} > > + > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) > > +{ > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > + int r = 0, i, j; > > set r = -EINVAL here > > > + bool found = 0; > > + > > + if (!mmio) > > + return 0; > > + > > + mutex_lock(&mmio_dev->lock); > > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > > + mmio_dev->mmio[i].type == mmio->type) { > > + found = true; > > set r = 0 here > > > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > > + mmio_dev->mmio_nr--; > > + break; > > + } > > + } > > + if (!found) > > + r = -EINVAL; > > then get rid of found Sure. -- regards Yang, Sheng > > > + mutex_unlock(&mmio_dev->lock); > > + return r; > > +} > > + > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > + struct kvm_msix_mmio_user *mmio_user) > > +{ > > + struct kvm_msix_mmio mmio; > > + > > + mmio.dev_id = mmio_user->dev_id; > > + mmio.type = mmio_user->type; > > + > > + return kvm_free_msix_mmio(kvm, &mmio); > > +} > > + > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > > new file mode 100644 > > index 0000000..01b6587 > > --- /dev/null > > +++ b/virt/kvm/msix_mmio.h > > @@ -0,0 +1,25 @@ > > +#ifndef __KVM_MSIX_MMIO_H__ > > +#define __KVM_MSIX_MMIO_H__ > > +/* > > + * MSI-X MMIO emulation > > + * > > + * Copyright (c) 2010 Intel Corporation > > + * > > + * This work is licensed under the terms of the GNU GPL, version 2. See > > + * the COPYING file in the top-level directory. > > + * > > + * Author: > > + * Sheng Yang <sheng.yang@intel.com> > > + */ > > + > > +#include <linux/pci.h> > > + > > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > + struct kvm_msix_mmio_user *mmio_user); > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > + struct kvm_msix_mmio_user *mmio_user); > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > *mmio_user); + > > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
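For orientation, the userspace side of the ioctls reviewed above would be driven roughly like this (sketch; vm_fd, table_gpa, table_hva and nr_entries are hypothetical VMM-side names, and <linux/kvm.h>, <sys/ioctl.h> and <err.h> are assumed):

	struct kvm_msix_mmio_user mmio = {
		.dev_id         = assigned_dev_id,
		.type           = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
				  KVM_MSIX_MMIO_TYPE_BASE_TABLE,
		.max_entries_nr = nr_entries,
		.base_addr      = table_gpa,	/* guest-physical table base */
		.base_va        = (__u64)(unsigned long)table_hva,
						/* writable backing in the VMM */
		.flags          = 0,		/* reserved, must be zero */
	};

	if (ioctl(vm_fd, KVM_REGISTER_MSIX_MMIO, &mmio) < 0)
		err(1, "KVM_REGISTER_MSIX_MMIO");

	/* On a KVM_EXIT_MSIX_ROUTING_UPDATE exit, the VMM re-reads the
	 * touched entry from table_hva and updates its irq routing
	 * before re-entering the guest. */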
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 6:59 ` Sheng Yang @ 2011-02-23 8:45 ` Michael S. Tsirkin 2011-02-24 8:08 ` Sheng Yang 2011-02-24 9:44 ` Sheng Yang 2011-02-23 16:34 ` Alex Williamson 1 sibling, 2 replies; 20+ messages in thread From: Michael S. Tsirkin @ 2011-02-23 8:45 UTC (permalink / raw) To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > Then we can support mask bit operation of assigned devices now. > > > > Looks pretty good overall. A few comments below. It seems like we > > should be able to hook this into vfio with a small stub in kvm. We just > > need to be able to communicate disabling and enabling of individual msix > > vectors. For brute force, we could do this with a lot of eventfds, > > triggered by kvm and consumed by vfio, two per MSI-X vector. Not sure > > if there's something smaller that could do it. Thanks, > > Alex, thanks for your comments. See my comments below: > > > > > Alex > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > --- > > > > > > arch/x86/kvm/Makefile | 2 +- > > > arch/x86/kvm/x86.c | 8 +- > > > include/linux/kvm.h | 21 ++++ > > > include/linux/kvm_host.h | 25 ++++ > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > virt/kvm/kvm_main.c | 38 ++++++- > > > virt/kvm/msix_mmio.c | 286 > > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > > | 25 ++++ > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > create mode 100644 virt/kvm/msix_mmio.c > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > index f15501f..3a0d851 100644 > > > --- a/arch/x86/kvm/Makefile > > > +++ b/arch/x86/kvm/Makefile > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > - assigned-dev.o) > > > + assigned-dev.o msix_mmio.o) > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > > async_pf.o) > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > index fa708c9..89bf12c 100644 > > > --- a/arch/x86/kvm/x86.c > > > +++ b/arch/x86/kvm/x86.c > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > case KVM_CAP_XSAVE: > > > > > > case KVM_CAP_ASYNC_PF: > > > + case KVM_CAP_MSIX_MMIO: > > > r = 1; > > > break; > > > > > > case KVM_CAP_COALESCED_MMIO: > > > @@ -3807,6 +3808,7 @@ static int emulator_write_emulated_onepage(unsigned > > > long addr, > > > > > > struct kvm_vcpu *vcpu) > > > > > > { > > > > > > gpa_t gpa; > > > > > > + int r; > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > @@ -3822,14 +3824,16 @@ static int > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > mmio: > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > /* > > > > > > * Is this MMIO handled locally? 
> > > */ > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > + if (!r) > > > > > > return X86EMUL_CONTINUE; > > > > > > vcpu->mmio_needed = 1; > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; This use of -ENOTSYNC is IMO confusing. How about we make vcpu_mmio_write return the positive exit reason? Negative value will mean an error. > > > > > > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > > index ea2dc1a..ad9df4b 100644 > > > --- a/include/linux/kvm.h > > > +++ b/include/linux/kvm.h > > > @@ -161,6 +161,7 @@ struct kvm_pit_config { > > > > > > #define KVM_EXIT_NMI 16 > > > #define KVM_EXIT_INTERNAL_ERROR 17 > > > #define KVM_EXIT_OSI 18 > > > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > > #define KVM_INTERNAL_ERROR_EMULATION 1 > > > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > > > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > > #define KVM_CAP_ASYNC_PF 59 > > > > > > +#define KVM_CAP_MSIX_MMIO 60 > > > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > > > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > > kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, > > > 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > > > > > +/* Available with KVM_CAP_MSIX_MMIO */ > > > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct > > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, > > > 0x7e, struct kvm_msix_mmio_user) > > > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, > > > struct kvm_pit_state2) > > > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > > > > > > __u16 padding[3]; > > > > > > }; > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > > > + > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > > > + > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > > > +struct kvm_msix_mmio_user { > > > + __u32 dev_id; > > > + __u16 type; > > > + __u16 max_entries_nr; > > > + __u64 base_addr; > > > + __u64 base_va; > > > + __u64 flags; > > > + __u64 reserved[4]; > > > +}; > > > + > > > > > > #endif /* __LINUX_KVM_H */ > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > > index 7d313e0..c10670c 100644 > > > --- a/include/linux/kvm_host.h > > > +++ b/include/linux/kvm_host.h > > > @@ -233,6 +233,27 @@ struct kvm_memslots { > > > > > > KVM_PRIVATE_MEM_SLOTS]; > > > > > > }; > > > > > > +#define KVM_MSIX_MMIO_MAX 32 > > > > I assume this is based on 32 devices/bus, but we could exceed this is we > > add more buses or make use of more functions. How hard is it to make > > this dynamic? > > We can improve this later(maybe with balance tree?). 32 should be more than > sufficient for now. And this patchset has been in the mailing list for months, I > really want to get it in first... 
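Should the fixed array ever become limiting, growing it on demand is a small change (sketch; assumes mmio becomes a kmalloc'ed pointer and kvm_msix_mmio_dev gains a mmio_cap field):

	if (mmio_dev->mmio_nr == mmio_dev->mmio_cap) {
		int cap = mmio_dev->mmio_cap ? 2 * mmio_dev->mmio_cap : 16;
		struct kvm_msix_mmio *p;

		p = krealloc(mmio_dev->mmio, cap * sizeof(*p), GFP_KERNEL);
		if (!p)
			return -ENOMEM;
		mmio_dev->mmio = p;
		mmio_dev->mmio_cap = cap;
	}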
> > > > + > > > +struct kvm_msix_mmio { > > > + u32 dev_id; > > > + u16 type; > > > + u16 max_entries_nr; > > > + u64 flags; > > > + gpa_t table_base_addr; > > > + hva_t table_base_va; > > > + gpa_t pba_base_addr; > > > + hva_t pba_base_va; > > > +}; > > > + > > > +struct kvm_msix_mmio_dev { > > > + struct kvm *kvm; > > > + struct kvm_io_device table_dev; > > > + int mmio_nr; > > > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > > > + struct mutex lock; > > > +}; > > > + > > > > > > struct kvm { > > > > > > spinlock_t mmu_lock; > > > raw_spinlock_t requests_lock; > > > > > > @@ -281,6 +302,7 @@ struct kvm { > > > > > > long mmu_notifier_count; > > > > > > #endif > > > > > > long tlbs_dirty; > > > > > > + struct kvm_msix_mmio_dev msix_mmio_dev; > > > > > > }; > > > > > > /* The guest did something we don't support. */ > > > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, > > > > > > int kvm_request_irq_source_id(struct kvm *kvm); > > > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > + int assigned_dev_id, int entry, bool mask); > > > + > > > > > > /* For vcpu->arch.iommu_flags */ > > > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > > index ae72ae6..d1598a6 100644 > > > --- a/virt/kvm/assigned-dev.c > > > +++ b/virt/kvm/assigned-dev.c > > > @@ -18,6 +18,7 @@ > > > > > > #include <linux/interrupt.h> > > > #include <linux/slab.h> > > > #include "irq.h" > > > > > > +#include "msix_mmio.h" > > > > > > static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct > > > list_head *head, > > > > > > int assigned_dev_id) > > > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm *kvm, > > > > > > kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); > > > > > > } > > > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > > > + struct kvm_assigned_dev_kernel *adev) > > > +{ > > > + struct kvm_msix_mmio mmio; > > > + > > > + mmio.dev_id = adev->assigned_dev_id; > > > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > + kvm_free_msix_mmio(kvm, &mmio); > > > +} > > > + > > > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > > > struct kvm_assigned_dev_kernel > > > *assigned_dev) > > > > > > { > > > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > > > + > > > > > > __pci_reset_function(assigned_dev->dev); > > > pci_restore_state(assigned_dev->dev); > > > > > > @@ -785,3 +799,33 @@ out: > > > return r; > > > > > > } > > > > > > +/* The caller should hold kvm->lock */ > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > + int assigned_dev_id, int entry, bool mask) > > > +{ > > > + int r = -EFAULT; > > > + struct kvm_assigned_dev_kernel *adev; > > > + int i; > > > + > > > + if (!irqchip_in_kernel(kvm)) > > > + return r; > > > + > > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > > + assigned_dev_id); > > > + if (!adev) > > > + goto out; > > > + > > > + /* For non-MSIX enabled devices, entries_nr == 0 */ > > > + for (i = 0; i < adev->entries_nr; i++) > > > + if (adev->host_msix_entries[i].entry == entry) { > > > + if (mask) > > > + disable_irq_nosync( > > > + adev->host_msix_entries[i].vector); > > > + else > > > + enable_irq(adev->host_msix_entries[i].vector); > > > + r = 0; > > > + 
break; > > > + } > > > +out: > > > + return r; > > > +} > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > index b1b6cbb..b7807c8 100644 > > > --- a/virt/kvm/kvm_main.c > > > +++ b/virt/kvm/kvm_main.c > > > @@ -56,6 +56,7 @@ > > > > > > #include "coalesced_mmio.h" > > > #include "async_pf.h" > > > > > > +#include "msix_mmio.h" > > > > > > #define CREATE_TRACE_POINTS > > > #include <trace/events/kvm.h> > > > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > > > > > > struct mm_struct *mm = kvm->mm; > > > > > > kvm_arch_sync_events(kvm); > > > > > > + kvm_unregister_msix_mmio_dev(kvm); > > > > > > spin_lock(&kvm_lock); > > > list_del(&kvm->vm_list); > > > spin_unlock(&kvm_lock); > > > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, > > > > > > mutex_unlock(&kvm->lock); > > > break; > > > > > > #endif > > > > > > + case KVM_REGISTER_MSIX_MMIO: { > > > + struct kvm_msix_mmio_user mmio_user; > > > + > > > + r = -EFAULT; > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > + goto out; > > > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > > > + break; > > > + } > > > + case KVM_UNREGISTER_MSIX_MMIO: { > > > + struct kvm_msix_mmio_user mmio_user; > > > + > > > + r = -EFAULT; > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > + goto out; > > > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > > > + break; > > > + } > > > > > > default: > > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > > if (r == -ENOTTY) > > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > > > > > > return r; > > > > > > } > > > > > > #endif > > > > > > + r = kvm_register_msix_mmio_dev(kvm); > > > + if (r < 0) { > > > + kvm_put_kvm(kvm); > > > + return r; > > > + } > > > + > > > > > > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > > > if (r < 0) > > > > > > kvm_put_kvm(kvm); > > > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct kvm_io_bus > > > *bus) > > > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, > > > > > > int len, const void *val) > > > > > > { > > > > > > - int i; > > > + int i, r = -EOPNOTSUPP; > > > > > > struct kvm_io_bus *bus; > > > > > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > > > > > > - for (i = 0; i < bus->dev_count; i++) > > > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > > > + for (i = 0; i < bus->dev_count; i++) { > > > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > > > + if (r == -ENOTSYNC) > > > + break; > > > + else if (!r) > > > > > > return 0; So the above will be if (r >= 0) return r; > > > > > > - return -EOPNOTSUPP; > > > + } > > > + return r; > > > > > > } > > > > > > /* kvm_io_bus_read - called under kvm->slots_lock */ > > > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > > > new file mode 100644 > > > index 0000000..dde9b62 > > > --- /dev/null > > > +++ b/virt/kvm/msix_mmio.c > > > @@ -0,0 +1,286 @@ > > > +/* > > > + * MSI-X MMIO emulation > > > + * > > > + * Copyright (c) 2010 Intel Corporation > > > + * > > > + * This work is licensed under the terms of the GNU GPL, version 2. See > > > + * the COPYING file in the top-level directory. 
> > > + * > > > + * Author: > > > + * Sheng Yang <sheng.yang@intel.com> > > > + */ > > > + > > > +#include <linux/kvm_host.h> > > > +#include <linux/kvm.h> > > > + > > > +#include "msix_mmio.h" > > > +#include "iodev.h" > > > + > > > +static int update_msix_mask_bit(struct kvm *kvm, struct kvm_msix_mmio > > > *mmio, + int entry, u32 flag) > > > +{ > > > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > + return kvm_assigned_device_update_msix_mask_bit(kvm, > > > + mmio->dev_id, entry, flag); > > > + return -EFAULT; > > > +} > > > + > > > +/* Caller must hold dev->lock */ > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > > > + gpa_t addr, int len) > > > +{ > > > + gpa_t start, end; > > > + int i, r = -EINVAL; > > > + > > > + for (i = 0; i < dev->mmio_nr; i++) { > > > + start = dev->mmio[i].table_base_addr; > > > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > > > + dev->mmio[i].max_entries_nr; > > > + if (addr >= start && addr + len <= end) { > > > + r = i; > > > + break; > > > + } > > > > Yet another potential user of the weight balanced tree ;) > > Yeah, we can improve it later. > > > > > + } > > > + > > > + return r; > > > +} > > > + > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, > > > int len, + void *val) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > + struct kvm_msix_mmio *mmio; > > > + int idx, ret = 0, entry, offset, r; > > > + > > > + mutex_lock(&mmio_dev->lock); > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > + if (idx < 0) { > > > + ret = -EOPNOTSUPP; > > > + goto out; > > > + } > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > + goto out; > > > + > > > + offset = addr & 0xf; > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > + goto out; This is not as strict as it could be. Also see below (in write) for an idea how to make the code clearer. > > > + > > > + mmio = &mmio_dev->mmio[idx]; > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > + if (r) > > > + goto out; Want to give the caller some indication of an error? If not add a comment why. > > > +out: > > > + mutex_unlock(&mmio_dev->lock); > > > + return ret; > > > +} > > > + > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > > > + int len, const void *val) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > + struct kvm_msix_mmio *mmio; > > > + int idx, entry, offset, ret = 0, r = 0; > > > + gpa_t entry_base; > > > + u32 old_ctrl, new_ctrl; These are really __le32. Pls make them so and then fix up the sparse errors. > > > + u32 *ctrl_pos; get_user should not be done on u32 *, it should be __user. Does not sparse complain about this? > > > + > > > + mutex_lock(&mmio_dev->kvm->lock); > > > + mutex_lock(&mmio_dev->lock); > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > + if (idx < 0) { > > > + ret = -EOPNOTSUPP; > > > + goto out; > > > + } > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > + goto out; > > > + > > > + offset = addr & 0xF; > > > > nit, this 0xf above. > > So you like another mask? 
Yes, this should be offset = addr % PCI_MSIX_ENTRY_SIZE; > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > + goto out; I think it's a good idea to be as strict as possible. Also let's quote the spec. Finally using % will make the logic more transparent. How about the below? /* For all accesses to MSI-X Table and MSI-X PBA fields, software must use aligned full DWORD or aligned full QWORD transactions; otherwise, the result is undefined. */ if (len == 4) { if (addr % 4) goto out; } else if (len == 8) { if (addr % 8) goto out; } else goto out; As read needs same logic it's likely a good idea to add a function for this. > > > + > > > + mmio = &mmio_dev->mmio[idx]; > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > + > > > + if (get_user(old_ctrl, ctrl_pos)) It's probably a good idea to do access_ok check for the whole entry here, and then use __get_user etc below, these are cheaper. > > > + goto out; > > > + > > > + /* No allow writing to other fields when entry is unmasked */ Do not allow. > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > + goto out; > > > + > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) any such cast is suspect. Is entry_base really gpa? If it was you can not cast it to __user. I think it should be void __user *. > > > + goto out; > > > + > > > + if (get_user(new_ctrl, ctrl_pos)) > > > + goto out; get_user assumes aligned address. But there's no guarantee ctrl_pos is aligned because table_base_va might be misaligned. > > > + > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > + ret = -ENOTSYNC; Better to use != and not <, effectively same. Or in the above we can do: bool control; if (len == 4) { if (addr % 4) goto out; control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL; } else if (len == 8) { if (addr % 8) goto out; control = offset == PCI_MSIX_ENTRY_VECTOR_DATA; } else goto out; > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > first? I'd also prefer to see a direct goto out here since it's not > > entirely obvious that this first 'if' always falls into the below 'if'. > > I'm not a fan of using an error code as a special non-error return > > either. > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to > see if they indeed modified ctrl bits. > > I would try to make the logic here more clear. > > > > > + if (old_ctrl == new_ctrl) > > > + goto out; > > > + > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > > > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); > > > > An xor would remove some redundancy here > > > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & > > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm, > > mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)); > > Oh, yes, sure... Also why not set ret to correct value directly? > > > > > + if (r || ret) > > > + ret = -ENOTSYNC; when is ret set? And why not to the correct value directly? 
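Concretely, folding the xor form and the direct assignment together, the tail of the write handler might read (sketch; routing_changed is a stand-in for "the write touched the address/data fields", by whichever test the next spin settles on):

	if ((old_ctrl ^ new_ctrl) & PCI_MSIX_ENTRY_CTRL_MASKBIT)
		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
	/* one assignment, one place: routing-relevant writes and failed
	 * mask flips both need a userspace exit */
	ret = (routing_changed || r) ? -ENOTSYNC : 0;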
> > > +out: > > > + mutex_unlock(&mmio_dev->lock); > > > + mutex_unlock(&mmio_dev->kvm->lock); > > > + return ret; > > > +} > > > + > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > > > + .read = msix_table_mmio_read, > > > + .write = msix_table_mmio_write, > > > +}; > > > + > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > > > +{ > > > + int ret; > > > + > > > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, &msix_mmio_table_ops); > > > + mutex_init(&kvm->msix_mmio_dev.lock); > > > + kvm->msix_mmio_dev.kvm = kvm; > > > + mutex_lock(&kvm->slots_lock); > > > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > > + &kvm->msix_mmio_dev.table_dev); > > > + mutex_unlock(&kvm->slots_lock); > > > + return ret; > > > +} > > > + > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > > > +{ > > > + int ret; > > > + > > > + mutex_lock(&kvm->slots_lock); > > > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > > + &kvm->msix_mmio_dev.table_dev); > > > + mutex_unlock(&kvm->slots_lock); > > > + return ret; > > > +} > > > + > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > + struct kvm_msix_mmio_user *mmio_user) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > + struct kvm_msix_mmio *mmio = NULL; > > > + int r = 0, i; > > > + > > > + mutex_lock(&mmio_dev->lock); > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > > + mmio = &mmio_dev->mmio[i]; > > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + break; > > > + } > > > + } > > > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + /* All reserved currently */ > > > + if (mmio_user->flags) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + > > > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > > > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > > > + r = -EINVAL; > > > + goto out; > > > + } > > > + if (!mmio) { > > > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > > > + r = -ENOSPC; > > > + goto out; > > > + } > > > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > > > + } > > > + > > > + mmio_dev->mmio_nr++; > > > + mmio->max_entries_nr = mmio_user->max_entries_nr; > > > + mmio->dev_id = mmio_user->dev_id; > > > + mmio->flags = mmio_user->flags; > > > + > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > + mmio->table_base_addr = mmio_user->base_addr; > > > + mmio->table_base_va = mmio_user->base_va; Also check alignment for these values. base_va should be at least 4 byte aligned. Maybe more, need to think about it ... Look in spec for table_base_addr alignment. 
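For reference, PCI Local Bus 3.0 keeps the MSI-X table QWORD-aligned (the low three bits of the Table Offset/BIR dword hold the BIR, so the offset itself is 8-byte aligned). A registration-time check matching that could be (sketch):

	/* the physical table base is 8-byte aligned by construction;
	 * demand the same of the userspace backing so the get_user()/
	 * copy_*_user() accesses on entries stay naturally aligned */
	if ((mmio_user->base_addr & 0x7) || (mmio_user->base_va & 0x7)) {
		r = -EINVAL;
		goto out;
	}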
> > > + } > > > +out: > > > + mutex_unlock(&mmio_dev->lock); > > > + return r; > > > +} > > > + > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > + int r = 0, i, j; > > > > set r = -EINVAL here > > > > > + bool found = 0; > > > + > > > + if (!mmio) > > > + return 0; > > > + > > > + mutex_lock(&mmio_dev->lock); > > > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > > > + mmio_dev->mmio[i].type == mmio->type) { > > > + found = true; > > > > set r = 0 here > > > > > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > > > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > > > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > > > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > > > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > > > + mmio_dev->mmio_nr--; > > > + break; > > > + } > > > + } > > > + if (!found) > > > + r = -EINVAL; > > > > then get rid of found > > Sure. > > -- > regards > Yang, Sheng > > > > > > + mutex_unlock(&mmio_dev->lock); > > > + return r; > > > +} > > > + > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > + struct kvm_msix_mmio_user *mmio_user) > > > +{ > > > + struct kvm_msix_mmio mmio; > > > + > > > + mmio.dev_id = mmio_user->dev_id; > > > + mmio.type = mmio_user->type; > > > + > > > + return kvm_free_msix_mmio(kvm, &mmio); > > > +} > > > + > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > > > new file mode 100644 > > > index 0000000..01b6587 > > > --- /dev/null > > > +++ b/virt/kvm/msix_mmio.h > > > @@ -0,0 +1,25 @@ > > > +#ifndef __KVM_MSIX_MMIO_H__ > > > +#define __KVM_MSIX_MMIO_H__ > > > +/* > > > + * MSI-X MMIO emulation > > > + * > > > + * Copyright (c) 2010 Intel Corporation > > > + * > > > + * This work is licensed under the terms of the GNU GPL, version 2. See > > > + * the COPYING file in the top-level directory. > > > + * > > > + * Author: > > > + * Sheng Yang <sheng.yang@intel.com> > > > + */ > > > + > > > +#include <linux/pci.h> > > > + > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > + struct kvm_msix_mmio_user *mmio_user); > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > + struct kvm_msix_mmio_user *mmio_user); > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > > *mmio_user); + > > > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
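For illustration, with the r initialisation moved as Alex suggested, the lookup loop in kvm_free_msix_mmio() reduces to the sketch below. One more thing worth a look in the next spin: the original clears mmio[mmio_nr] before the mmio_nr-- , i.e. one slot past the live entries (out of bounds when mmio_nr == KVM_MSIX_MMIO_MAX); clearing after the decrement hits the intended stale slot.

	int r = -EINVAL, i, j;
	...
	for (i = 0; i < mmio_dev->mmio_nr; i++) {
		if (mmio_dev->mmio[i].dev_id == mmio->dev_id &&
		    mmio_dev->mmio[i].type == mmio->type) {
			r = 0;
			for (j = i; j < mmio_dev->mmio_nr - 1; j++)
				mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
			mmio_dev->mmio_nr--;
			memset(&mmio_dev->mmio[mmio_dev->mmio_nr], 0,
			       sizeof(mmio_dev->mmio[0]));
			break;
		}
	}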
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 8:45 ` Michael S. Tsirkin @ 2011-02-24 8:08 ` Sheng Yang 2011-02-24 10:11 ` Michael S. Tsirkin 2011-02-24 9:44 ` Sheng Yang 1 sibling, 1 reply; 20+ messages in thread From: Sheng Yang @ 2011-02-24 8:08 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > Then we can support mask bit operation of assigned devices now. > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > should be able to hook this into vfio with a small stub in kvm. We > > > just need to be able to communicate disabling and enabling of > > > individual msix vectors. For brute force, we could do this with a lot > > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X > > > vector. Not sure if there's something smaller that could do it. > > > Thanks, > > > > Alex, thanks for your comments. See my comments below: > > > Alex > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > --- > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > arch/x86/kvm/x86.c | 8 +- > > > > include/linux/kvm.h | 21 ++++ > > > > include/linux/kvm_host.h | 25 ++++ > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > virt/kvm/msix_mmio.c | 286 > > > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > > > > > > > | 25 ++++ > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > index f15501f..3a0d851 100644 > > > > --- a/arch/x86/kvm/Makefile > > > > +++ b/arch/x86/kvm/Makefile > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > - assigned-dev.o) > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > > > async_pf.o) > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > index fa708c9..89bf12c 100644 > > > > --- a/arch/x86/kvm/x86.c > > > > +++ b/arch/x86/kvm/x86.c > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > case KVM_CAP_XSAVE: > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > + case KVM_CAP_MSIX_MMIO: > > > > r = 1; > > > > break; > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > @@ -3807,6 +3808,7 @@ static int > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > { > > > > > > > > gpa_t gpa; > > > > > > > > + int r; > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > mmio: > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > /* > > > > > > > > * Is this MMIO handled locally? 
> > > > */ > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > + if (!r) > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > This use of -ENOTSYNC is IMO confusing. > How about we make vcpu_mmio_write return the positive exit reason? > Negative value will mean an error. Make sense. I would update it. > > > > > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > > > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > > > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > > > index ea2dc1a..ad9df4b 100644 > > > > --- a/include/linux/kvm.h > > > > +++ b/include/linux/kvm.h > > > > @@ -161,6 +161,7 @@ struct kvm_pit_config { > > > > > > > > #define KVM_EXIT_NMI 16 > > > > #define KVM_EXIT_INTERNAL_ERROR 17 > > > > #define KVM_EXIT_OSI 18 > > > > > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > > > > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > > > #define KVM_INTERNAL_ERROR_EMULATION 1 > > > > > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > > > > > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > > > #define KVM_CAP_ASYNC_PF 59 > > > > > > > > +#define KVM_CAP_MSIX_MMIO 60 > > > > > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > > > > > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > > > kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, > > > > 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > > > > > > > +/* Available with KVM_CAP_MSIX_MMIO */ > > > > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct > > > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, > > > > 0x7e, struct kvm_msix_mmio_user) > > > > > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, > > > > struct kvm_pit_state2) > > > > > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > > > > > > > > __u16 padding[3]; > > > > > > > > }; > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > > > > + > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > > > > + > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > > > > +struct kvm_msix_mmio_user { > > > > + __u32 dev_id; > > > > + __u16 type; > > > > + __u16 max_entries_nr; > > > > + __u64 base_addr; > > > > + __u64 base_va; > > > > + __u64 flags; > > > > + __u64 reserved[4]; > > > > +}; > > > > + > > > > > > > > #endif /* __LINUX_KVM_H */ > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > > > index 7d313e0..c10670c 100644 > > > > --- a/include/linux/kvm_host.h > > > > +++ b/include/linux/kvm_host.h > > > > @@ -233,6 +233,27 @@ struct kvm_memslots { > > > > > > > > KVM_PRIVATE_MEM_SLOTS]; > > > > > > > > }; > > > > > > > > +#define KVM_MSIX_MMIO_MAX 32 > > > > > > I assume this is based on 32 devices/bus, but we could exceed this is > > > we add more buses or make use of more functions. How hard is it to > > > make this dynamic? > > > > We can improve this later(maybe with balance tree?). 32 should be more > > than sufficient for now. 
And this patchset has been in the mailing list > > for months, I really want to get it in first... > > > > > > + > > > > +struct kvm_msix_mmio { > > > > + u32 dev_id; > > > > + u16 type; > > > > + u16 max_entries_nr; > > > > + u64 flags; > > > > + gpa_t table_base_addr; > > > > + hva_t table_base_va; > > > > + gpa_t pba_base_addr; > > > > + hva_t pba_base_va; > > > > +}; > > > > + > > > > +struct kvm_msix_m4mio_dev { > > > > + struct kvm *kvm; > > > > + struct kvm_io_device table_dev; > > > > + int mmio_nr; > > > > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > > > > + struct mutex lock; > > > > +}; > > > > + > > > > > > > > struct kvm { > > > > > > > > spinlock_t mmu_lock; > > > > raw_spinlock_t requests_lock; > > > > > > > > @@ -281,6 +302,7 @@ struct kvm { > > > > > > > > long mmu_notifier_count; > > > > > > > > #endif > > > > > > > > long tlbs_dirty; > > > > > > > > + struct kvm_msix_mmio_dev msix_mmio_dev; > > > > > > > > }; > > > > > > > > /* The guest did something we don't support. */ > > > > > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm > > > > *kvm, > > > > > > > > int kvm_request_irq_source_id(struct kvm *kvm); > > > > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > + int assigned_dev_id, int entry, bool mask); > > > > + > > > > > > > > /* For vcpu->arch.iommu_flags */ > > > > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > > > > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > > > index ae72ae6..d1598a6 100644 > > > > --- a/virt/kvm/assigned-dev.c > > > > +++ b/virt/kvm/assigned-dev.c > > > > @@ -18,6 +18,7 @@ > > > > > > > > #include <linux/interrupt.h> > > > > #include <linux/slab.h> > > > > #include "irq.h" > > > > > > > > +#include "msix_mmio.h" > > > > > > > > static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct > > > > list_head *head, > > > > > > > > int assigned_dev_id) > > > > > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm > > > > *kvm, > > > > > > > > kvm_deassign_irq(kvm, assigned_dev, > > > > assigned_dev->irq_requested_type); > > > > > > > > } > > > > > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > > > > + struct kvm_assigned_dev_kernel *adev) > > > > +{ > > > > + struct kvm_msix_mmio mmio; > > > > + > > > > + mmio.dev_id = adev->assigned_dev_id; > > > > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > + kvm_free_msix_mmio(kvm, &mmio); > > > > +} > > > > + > > > > > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > > > > > struct kvm_assigned_dev_kernel > > > > *assigned_dev) > > > > > > > > { > > > > > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > > > > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > > > > + > > > > > > > > __pci_reset_function(assigned_dev->dev); > > > > pci_restore_state(assigned_dev->dev); > > > > > > > > @@ -785,3 +799,33 @@ out: > > > > return r; > > > > > > > > } > > > > > > > > +/* The caller should hold kvm->lock */ > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > + int assigned_dev_id, int entry, bool mask) > > > > +{ > > > > + int r = -EFAULT; > > > > + struct kvm_assigned_dev_kernel *adev; > > > > + int i; > > > > + > > > > + if (!irqchip_in_kernel(kvm)) > > > > + return r; > > > > + > > > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > > > + assigned_dev_id); > > > > + if (!adev) > > > > 
+ goto out; > > > > + > > > > + /* For non-MSIX enabled devices, entries_nr == 0 */ > > > > + for (i = 0; i < adev->entries_nr; i++) > > > > + if (adev->host_msix_entries[i].entry == entry) { > > > > + if (mask) > > > > + disable_irq_nosync( > > > > + adev->host_msix_entries[i].vector); > > > > + else > > > > + enable_irq(adev->host_msix_entries[i].vector); > > > > + r = 0; > > > > + break; > > > > + } > > > > +out: > > > > + return r; > > > > +} > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > > index b1b6cbb..b7807c8 100644 > > > > --- a/virt/kvm/kvm_main.c > > > > +++ b/virt/kvm/kvm_main.c > > > > @@ -56,6 +56,7 @@ > > > > > > > > #include "coalesced_mmio.h" > > > > #include "async_pf.h" > > > > > > > > +#include "msix_mmio.h" > > > > > > > > #define CREATE_TRACE_POINTS > > > > #include <trace/events/kvm.h> > > > > > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > > > > > > > > struct mm_struct *mm = kvm->mm; > > > > > > > > kvm_arch_sync_events(kvm); > > > > > > > > + kvm_unregister_msix_mmio_dev(kvm); > > > > > > > > spin_lock(&kvm_lock); > > > > list_del(&kvm->vm_list); > > > > spin_unlock(&kvm_lock); > > > > > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, > > > > > > > > mutex_unlock(&kvm->lock); > > > > break; > > > > > > > > #endif > > > > > > > > + case KVM_REGISTER_MSIX_MMIO: { > > > > + struct kvm_msix_mmio_user mmio_user; > > > > + > > > > + r = -EFAULT; > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > + goto out; > > > > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > > > > + break; > > > > + } > > > > + case KVM_UNREGISTER_MSIX_MMIO: { > > > > + struct kvm_msix_mmio_user mmio_user; > > > > + > > > > + r = -EFAULT; > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > + goto out; > > > > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > > > > + break; > > > > + } > > > > > > > > default: > > > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > > > if (r == -ENOTTY) > > > > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > > > > > > > > return r; > > > > > > > > } > > > > > > > > #endif > > > > > > > > + r = kvm_register_msix_mmio_dev(kvm); > > > > + if (r < 0) { > > > > + kvm_put_kvm(kvm); > > > > + return r; > > > > + } > > > > + > > > > > > > > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > > > > if (r < 0) > > > > > > > > kvm_put_kvm(kvm); > > > > > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct > > > > kvm_io_bus *bus) > > > > > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t > > > > addr, > > > > > > > > int len, const void *val) > > > > > > > > { > > > > > > > > - int i; > > > > + int i, r = -EOPNOTSUPP; > > > > > > > > struct kvm_io_bus *bus; > > > > > > > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > > > > > > > > - for (i = 0; i < bus->dev_count; i++) > > > > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > > > > + for (i = 0; i < bus->dev_count; i++) { > > > > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > > > > + if (r == -ENOTSYNC) > > > > + break; > > > > + else if (!r) > > > > > > > > return 0; > > So the above will be > if (r >= 0) > return r; > > > > > - return -EOPNOTSUPP; > > > > + } > > > > + return r; > > > > > > > > } > > > > > > > > /* kvm_io_bus_read - called under kvm->slots_lock */ > > > > > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > > > > new file mode 100644 > > > > index 
0000000..dde9b62 > > > > --- /dev/null > > > > +++ b/virt/kvm/msix_mmio.c > > > > @@ -0,0 +1,286 @@ > > > > +/* > > > > + * MSI-X MMIO emulation > > > > + * > > > > + * Copyright (c) 2010 Intel Corporation > > > > + * > > > > + * This work is licensed under the terms of the GNU GPL, version 2. > > > > See + * the COPYING file in the top-level directory. > > > > + * > > > > + * Author: > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > + */ > > > > + > > > > +#include <linux/kvm_host.h> > > > > +#include <linux/kvm.h> > > > > + > > > > +#include "msix_mmio.h" > > > > +#include "iodev.h" > > > > + > > > > +static int update_msix_mask_bit(struct kvm *kvm, struct > > > > kvm_msix_mmio *mmio, + int entry, u32 flag) > > > > +{ > > > > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > + return kvm_assigned_device_update_msix_mask_bit(kvm, > > > > + mmio->dev_id, entry, flag); > > > > + return -EFAULT; > > > > +} > > > > + > > > > +/* Caller must hold dev->lock */ > > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > > > > + gpa_t addr, int len) > > > > +{ > > > > + gpa_t start, end; > > > > + int i, r = -EINVAL; > > > > + > > > > + for (i = 0; i < dev->mmio_nr; i++) { > > > > + start = dev->mmio[i].table_base_addr; > > > > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > > > > + dev->mmio[i].max_entries_nr; > > > > + if (addr >= start && addr + len <= end) { > > > > + r = i; > > > > + break; > > > > + } > > > > > > Yet another potential user of the weight balanced tree ;) > > > > Yeah, we can improve it later. > > > > > > + } > > > > + > > > > + return r; > > > > +} > > > > + > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t > > > > addr, int len, + void *val) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > + struct kvm_msix_mmio *mmio; > > > > + int idx, ret = 0, entry, offset, r; > > > > + > > > > + mutex_lock(&mmio_dev->lock); > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > + if (idx < 0) { > > > > + ret = -EOPNOTSUPP; > > > > + goto out; > > > > + } > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > + goto out; > > > > + > > > > + offset = addr & 0xf; > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > + goto out; > > This is not as strict as it could be. Also > see below (in write) for an idea how to make the code clearer. > > > > > + > > > > + mmio = &mmio_dev->mmio[idx]; > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > > + if (r) > > > > + goto out; > > Want to give the caller some indication of an error? > If not add a comment why. I don't think we have a proper way to indicate the error now. Also the table_base_va already passed the checking when MMIO registered. We can add more error info later. > > > > > +out: > > > > + mutex_unlock(&mmio_dev->lock); > > > > + return ret; > > > > +} > > > > + > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t > > > > addr, + int len, const void *val) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > + struct kvm_msix_mmio *mmio; > > > > + int idx, entry, offset, ret = 0, r = 0; > > > > + gpa_t entry_base; > > > > + u32 old_ctrl, new_ctrl; > > These are really __le32. 
Pls make them so and then > fix up the sparse errors. > > > > > + u32 *ctrl_pos; > > get_user should not be done on u32 *, it should be > __user. Does not sparse complain about this? > Haven't use sparse to check the patches. Would do it later. > > > > + > > > > + mutex_lock(&mmio_dev->kvm->lock); > > > > + mutex_lock(&mmio_dev->lock); > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > + if (idx < 0) { > > > > + ret = -EOPNOTSUPP; > > > > + goto out; > > > > + } > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > + goto out; > > > > + > > > > + offset = addr & 0xF; > > > > > > nit, this 0xf above. > > > > So you like another mask? > > Yes, this should be > > offset = addr % PCI_MSIX_ENTRY_SIZE; > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > + goto out; > > I think it's a good idea to be as strict as possible. > Also let's quote the spec. Finally using % will make > the logic more transparent. How about the below? > > /* For all accesses to MSI-X Table and MSI-X PBA fields, software must use > aligned full DWORD or aligned full QWORD transactions; otherwise, the > result is undefined. */ > if (len == 4) { > if (addr % 4) > goto out; > } else if (len == 8) { > if (addr % 8) > goto out; > } else > goto out; > > As read needs same logic it's likely a good idea > to add a function for this. > > > > > + > > > > + mmio = &mmio_dev->mmio[idx]; > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > > + > > > > + if (get_user(old_ctrl, ctrl_pos)) > > It's probably a good idea to do access_ok check for the whole > entry here, and then use __get_user etc below, these are cheaper. table_base_va already passed access_ok checking. I am not sure if we need to do it again. > > > > > + goto out; > > > > + > > > > + /* No allow writing to other fields when entry is unmasked */ > > Do not allow. > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > + goto out; > > > > + > > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > any such cast is suspect. > Is entry_base really gpa? If it was you can not cast it to __user. HVA. > I think it should be void __user *. > > > > > + goto out; > > > > + > > > > + if (get_user(new_ctrl, ctrl_pos)) > > > > + goto out; > > get_user assumes aligned address. But there's no guarantee ctrl_pos > is aligned because table_base_va might be misaligned. table_base_va already passed access_ok() check. > > > > > + > > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > > + ret = -ENOTSYNC; > > Better to use != and not <, effectively same. > Or in the above we can do: > bool control; > if (len == 4) { > if (addr % 4) > goto out; > control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL; > } else if (len == 8) { > if (addr % 8) > goto out; > control = offset == PCI_MSIX_ENTRY_VECTOR_DATA; > } else > goto out; > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > > first? I'd also prefer to see a direct goto out here since it's not > > > entirely obvious that this first 'if' always falls into the below 'if'. > > > I'm not a fan of using an error code as a special non-error return > > > either. 
> > > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check > > new_ctrl to see if they indeed modified ctrl bits. > > > > I would try to make the logic here more clear. > > > > > > + if (old_ctrl == new_ctrl) > > > > + goto out; > > > > + > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > > > > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); > > > > > > An xor would remove some redundancy here > > > > > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm, > > > mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)); > > > > Oh, yes, sure... > > Also why not set ret to correct value directly? > > > > > + if (r || ret) > > > > + ret = -ENOTSYNC; > > when is ret set? And why not to the correct value directly? > > > > > +out: > > > > + mutex_unlock(&mmio_dev->lock); > > > > + mutex_unlock(&mmio_dev->kvm->lock); > > > > + return ret; > > > > +} > > > > + > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > > > > + .read = msix_table_mmio_read, > > > > + .write = msix_table_mmio_write, > > > > +}; > > > > + > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > > > > +{ > > > > + int ret; > > > > + > > > > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, > > > > &msix_mmio_table_ops); + mutex_init(&kvm->msix_mmio_dev.lock); > > > > + kvm->msix_mmio_dev.kvm = kvm; > > > > + mutex_lock(&kvm->slots_lock); > > > > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > > > + &kvm->msix_mmio_dev.table_dev); > > > > + mutex_unlock(&kvm->slots_lock); > > > > + return ret; > > > > +} > > > > + > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > > > > +{ > > > > + int ret; > > > > + > > > > + mutex_lock(&kvm->slots_lock); > > > > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > > > + &kvm->msix_mmio_dev.table_dev); > > > > + mutex_unlock(&kvm->slots_lock); > > > > + return ret; > > > > +} > > > > + > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > + struct kvm_msix_mmio *mmio = NULL; > > > > + int r = 0, i; > > > > + > > > > + mutex_lock(&mmio_dev->lock); > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > > > + mmio = &mmio_dev->mmio[i]; > > > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + break; > > > > + } > > > > + } > > > > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + /* All reserved currently */ > > > > + if (mmio_user->flags) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > + r = -EINVAL; > > > > + goto out; > > > > 
+ } > > > > + > > > > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > > > > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > > > > + r = -EINVAL; > > > > + goto out; > > > > + } > > > > + if (!mmio) { > > > > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > > > > + r = -ENOSPC; > > > > + goto out; > > > > + } > > > > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > > > > + } > > > > + > > > > + mmio_dev->mmio_nr++; > > > > + mmio->max_entries_nr = mmio_user->max_entries_nr; > > > > + mmio->dev_id = mmio_user->dev_id; > > > > + mmio->flags = mmio_user->flags; > > > > + > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > + mmio->table_base_addr = mmio_user->base_addr; > > > > + mmio->table_base_va = mmio_user->base_va; > > Also check alignment for these values. base_va should be > at least 4 byte aligned. Maybe more, need to think about it ... > Look in spec for table_base_addr alignment. OK. -- regards Yang, Sheng > > > > > + } > > > > +out: > > > > + mutex_unlock(&mmio_dev->lock); > > > > + return r; > > > > +} > > > > + > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > + int r = 0, i, j; > > > > > > set r = -EINVAL here > > > > > > > + bool found = 0; > > > > + > > > > + if (!mmio) > > > > + return 0; > > > > + > > > > + mutex_lock(&mmio_dev->lock); > > > > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > > > > + mmio_dev->mmio[i].type == mmio->type) { > > > > + found = true; > > > > > > set r = 0 here > > > > > > > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > > > > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > > > > + mmio_dev->mmio_nr--; > > > > + break; > > > > + } > > > > + } > > > > + if (!found) > > > > + r = -EINVAL; > > > > > > then get rid of found > > > > Sure. > > > > -- > > regards > > Yang, Sheng > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > + return r; > > > > +} > > > > + > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > +{ > > > > + struct kvm_msix_mmio mmio; > > > > + > > > > + mmio.dev_id = mmio_user->dev_id; > > > > + mmio.type = mmio_user->type; > > > > + > > > > + return kvm_free_msix_mmio(kvm, &mmio); > > > > +} > > > > + > > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > > > > new file mode 100644 > > > > index 0000000..01b6587 > > > > --- /dev/null > > > > +++ b/virt/kvm/msix_mmio.h > > > > @@ -0,0 +1,25 @@ > > > > +#ifndef __KVM_MSIX_MMIO_H__ > > > > +#define __KVM_MSIX_MMIO_H__ > > > > +/* > > > > + * MSI-X MMIO emulation > > > > + * > > > > + * Copyright (c) 2010 Intel Corporation > > > > + * > > > > + * This work is licensed under the terms of the GNU GPL, version 2. > > > > See + * the COPYING file in the top-level directory. 
> > > > + * > > > > + * Author: > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > + */ > > > > + > > > > +#include <linux/pci.h> > > > > + > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > > > *mmio_user); + > > > > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
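
An aside on the kvm_free_msix_mmio() exchange above: folding in Alex's three suggestions (start with r = -EINVAL, set r = 0 on a match, drop `found`) gives roughly the sketch below -- an editorial reconstruction, not the posted code. One more detail worth noting in passing: after the shift, the slot left stale is mmio[mmio_nr - 1]; the posted code clears mmio[mmio_nr], which is one slot too far and lands out of bounds once mmio_nr == KVM_MSIX_MMIO_MAX.

int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio)
{
	struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev;
	int r = -EINVAL, i, j;

	if (!mmio)
		return 0;

	mutex_lock(&mmio_dev->lock);
	BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX);
	for (i = 0; i < mmio_dev->mmio_nr; i++) {
		if (mmio_dev->mmio[i].dev_id != mmio->dev_id ||
		    mmio_dev->mmio[i].type != mmio->type)
			continue;
		r = 0;
		for (j = i; j < mmio_dev->mmio_nr - 1; j++)
			mmio_dev->mmio[j] = mmio_dev->mmio[j + 1];
		/* clear the slot the shift left stale; memset of the
		 * whole entry replaces the three per-field resets */
		memset(&mmio_dev->mmio[mmio_dev->mmio_nr - 1], 0,
		       sizeof(mmio_dev->mmio[0]));
		mmio_dev->mmio_nr--;
		break;
	}
	mutex_unlock(&mmio_dev->lock);
	return r;
}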
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-24 8:08 ` Sheng Yang @ 2011-02-24 10:11 ` Michael S. Tsirkin 2011-02-25 5:54 ` Sheng Yang 0 siblings, 1 reply; 20+ messages in thread From: Michael S. Tsirkin @ 2011-02-24 10:11 UTC (permalink / raw) To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Thu, Feb 24, 2011 at 04:08:22PM +0800, Sheng Yang wrote: > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > Then we can support mask bit operation of assigned devices now. > > > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > > should be able to hook this into vfio with a small stub in kvm. We > > > > just need to be able to communicate disabling and enabling of > > > > individual msix vectors. For brute force, we could do this with a lot > > > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X > > > > vector. Not sure if there's something smaller that could do it. > > > > Thanks, > > > > > > Alex, thanks for your comments. See my comments below: > > > > Alex > > > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > > --- > > > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > > arch/x86/kvm/x86.c | 8 +- > > > > > include/linux/kvm.h | 21 ++++ > > > > > include/linux/kvm_host.h | 25 ++++ > > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > > virt/kvm/msix_mmio.c | 286 > > > > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > > > > > > > > > | 25 ++++ > > > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > > index f15501f..3a0d851 100644 > > > > > --- a/arch/x86/kvm/Makefile > > > > > +++ b/arch/x86/kvm/Makefile > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
> > > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o > \ > > > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > > > - assigned-dev.o) > > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > > > > async_pf.o) > > > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > > index fa708c9..89bf12c 100644 > > > > > --- a/arch/x86/kvm/x86.c > > > > > +++ b/arch/x86/kvm/x86.c > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > > case KVM_CAP_XSAVE: > > > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > > + case KVM_CAP_MSIX_MMIO: > > > > > r = 1; > > > > > break; > > > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > > @@ -3807,6 +3808,7 @@ static int > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > > > { > > > > > > > > > > gpa_t gpa; > > > > > > > > > > + int r; > > > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > mmio: > > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > > > /* > > > > > > > > > > * Is this MMIO handled locally? > > > > > */ > > > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > > + if (!r) > > > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > This use of -ENOTSYNC is IMO confusing. > > How about we make vcpu_mmio_write return the positive exit reason? > > Negative value will mean an error. > > Make sense. I would update it. 
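
For concreteness, the convention agreed just above -- vcpu_mmio_write() returning 0 when the access was handled in kernel, a positive KVM_EXIT_* reason when userspace must take over, and a negative errno on failure -- might look like this in emulator_write_emulated_onepage(). A sketch of the direction of the agreement, not the posted code:

mmio:
	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
	r = vcpu_mmio_write(vcpu, gpa, bytes, val);
	if (!r)		/* handled entirely in kernel */
		return X86EMUL_CONTINUE;

	vcpu->mmio_needed = 1;
	/* r > 0 carries the exit reason, e.g. KVM_EXIT_MSIX_ROUTING_UPDATE;
	 * on a negative error fall back to an ordinary MMIO exit */
	vcpu->run->exit_reason = r > 0 ? r : KVM_EXIT_MMIO;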
> > > > > > > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > > > > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > > > > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > > > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > > > > index ea2dc1a..ad9df4b 100644 > > > > > --- a/include/linux/kvm.h > > > > > +++ b/include/linux/kvm.h > > > > > @@ -161,6 +161,7 @@ struct kvm_pit_config { > > > > > > > > > > #define KVM_EXIT_NMI 16 > > > > > #define KVM_EXIT_INTERNAL_ERROR 17 > > > > > #define KVM_EXIT_OSI 18 > > > > > > > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > > > > > > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > > > > #define KVM_INTERNAL_ERROR_EMULATION 1 > > > > > > > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > > > > > > > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > > > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > > > > #define KVM_CAP_ASYNC_PF 59 > > > > > > > > > > +#define KVM_CAP_MSIX_MMIO 60 > > > > > > > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > > > > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > > > > > > > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > > > > kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, > > > > > 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > > > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > > > > > > > > > +/* Available with KVM_CAP_MSIX_MMIO */ > > > > > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct > > > > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO _IOW(KVMIO, > > > > > 0x7e, struct kvm_msix_mmio_user) > > > > > > > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > > > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > > > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, > > > > > struct kvm_pit_state2) > > > > > > > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > > > > > > > > > > __u16 padding[3]; > > > > > > > > > > }; > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > > > > > + > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > > > > > + > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > > > > > +struct kvm_msix_mmio_user { > > > > > + __u32 dev_id; > > > > > + __u16 type; > > > > > + __u16 max_entries_nr; > > > > > + __u64 base_addr; > > > > > + __u64 base_va; > > > > > + __u64 flags; > > > > > + __u64 reserved[4]; > > > > > +}; > > > > > + > > > > > > > > > > #endif /* __LINUX_KVM_H */ > > > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > > > > index 7d313e0..c10670c 100644 > > > > > --- a/include/linux/kvm_host.h > > > > > +++ b/include/linux/kvm_host.h > > > > > @@ -233,6 +233,27 @@ struct kvm_memslots { > > > > > > > > > > KVM_PRIVATE_MEM_SLOTS]; > > > > > > > > > > }; > > > > > > > > > > +#define KVM_MSIX_MMIO_MAX 32 > > > > > > > > I assume this is based on 32 devices/bus, but we could exceed this is > > > > we add more buses or make use of more functions. How hard is it to > > > > make this dynamic? > > > > > > We can improve this later(maybe with balance tree?). 32 should be more > > > than sufficient for now. And this patchset has been in the mailing list > > > for months, I really want to get it in first... 
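
On Alex's question about making the table dynamic: probably not much work, at the cost of an allocation on the registration path. Purely as an illustration (msix_mmio_expand() and the mmio_cap field are invented for this sketch and are not part of the series), a krealloc()-grown array could replace the fixed mmio[KVM_MSIX_MMIO_MAX]:

static int msix_mmio_expand(struct kvm_msix_mmio_dev *dev)
{
	/* mmio_cap tracks the allocated length of dev->mmio, which
	 * becomes a pointer instead of a fixed-size array */
	int cap = dev->mmio_cap ? dev->mmio_cap * 2 : 16;
	struct kvm_msix_mmio *p;

	p = krealloc(dev->mmio, cap * sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	dev->mmio = p;
	dev->mmio_cap = cap;
	return 0;
}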
> > > > > > > > + > > > > > +struct kvm_msix_mmio { > > > > > + u32 dev_id; > > > > > + u16 type; > > > > > + u16 max_entries_nr; > > > > > + u64 flags; > > > > > + gpa_t table_base_addr; > > > > > + hva_t table_base_va; > > > > > + gpa_t pba_base_addr; > > > > > + hva_t pba_base_va; > > > > > +}; > > > > > + > > > > > +struct kvm_msix_m4mio_dev { > > > > > + struct kvm *kvm; > > > > > + struct kvm_io_device table_dev; > > > > > + int mmio_nr; > > > > > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > > > > > + struct mutex lock; > > > > > +}; > > > > > + > > > > > > > > > > struct kvm { > > > > > > > > > > spinlock_t mmu_lock; > > > > > raw_spinlock_t requests_lock; > > > > > > > > > > @@ -281,6 +302,7 @@ struct kvm { > > > > > > > > > > long mmu_notifier_count; > > > > > > > > > > #endif > > > > > > > > > > long tlbs_dirty; > > > > > > > > > > + struct kvm_msix_mmio_dev msix_mmio_dev; > > > > > > > > > > }; > > > > > > > > > > /* The guest did something we don't support. */ > > > > > > > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm > > > > > *kvm, > > > > > > > > > > int kvm_request_irq_source_id(struct kvm *kvm); > > > > > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > > > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > > + int assigned_dev_id, int entry, bool mask); > > > > > + > > > > > > > > > > /* For vcpu->arch.iommu_flags */ > > > > > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > > > > > > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > > > > index ae72ae6..d1598a6 100644 > > > > > --- a/virt/kvm/assigned-dev.c > > > > > +++ b/virt/kvm/assigned-dev.c > > > > > @@ -18,6 +18,7 @@ > > > > > > > > > > #include <linux/interrupt.h> > > > > > #include <linux/slab.h> > > > > > #include "irq.h" > > > > > > > > > > +#include "msix_mmio.h" > > > > > > > > > > static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct > > > > > list_head *head, > > > > > > > > > > int assigned_dev_id) > > > > > > > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct kvm > > > > > *kvm, > > > > > > > > > > kvm_deassign_irq(kvm, assigned_dev, > > > > > assigned_dev->irq_requested_type); > > > > > > > > > > } > > > > > > > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > > > > > + struct kvm_assigned_dev_kernel *adev) > > > > > +{ > > > > > + struct kvm_msix_mmio mmio; > > > > > + > > > > > + mmio.dev_id = adev->assigned_dev_id; > > > > > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > > + kvm_free_msix_mmio(kvm, &mmio); > > > > > +} > > > > > + > > > > > > > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > > > > > > > struct kvm_assigned_dev_kernel > > > > > *assigned_dev) > > > > > > > > > > { > > > > > > > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > > > > > > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > > > > > + > > > > > > > > > > __pci_reset_function(assigned_dev->dev); > > > > > pci_restore_state(assigned_dev->dev); > > > > > > > > > > @@ -785,3 +799,33 @@ out: > > > > > return r; > > > > > > > > > > } > > > > > > > > > > +/* The caller should hold kvm->lock */ > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > > + int assigned_dev_id, int entry, bool mask) > > > > > +{ > > > > > + int r = -EFAULT; > > > > > + struct kvm_assigned_dev_kernel *adev; > > > > > + int i; > > > > > + > > > > > + if (!irqchip_in_kernel(kvm)) > > > > > + 
return r; > > > > > + > > > > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > > > > + assigned_dev_id); > > > > > + if (!adev) > > > > > + goto out; > > > > > + > > > > > + /* For non-MSIX enabled devices, entries_nr == 0 */ > > > > > + for (i = 0; i < adev->entries_nr; i++) > > > > > + if (adev->host_msix_entries[i].entry == entry) { > > > > > + if (mask) > > > > > + disable_irq_nosync( > > > > > + adev->host_msix_entries[i].vector); > > > > > + else > > > > > + enable_irq(adev->host_msix_entries[i].vector); > > > > > + r = 0; > > > > > + break; > > > > > + } > > > > > +out: > > > > > + return r; > > > > > +} > > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > > > index b1b6cbb..b7807c8 100644 > > > > > --- a/virt/kvm/kvm_main.c > > > > > +++ b/virt/kvm/kvm_main.c > > > > > @@ -56,6 +56,7 @@ > > > > > > > > > > #include "coalesced_mmio.h" > > > > > #include "async_pf.h" > > > > > > > > > > +#include "msix_mmio.h" > > > > > > > > > > #define CREATE_TRACE_POINTS > > > > > #include <trace/events/kvm.h> > > > > > > > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > > > > > > > > > > struct mm_struct *mm = kvm->mm; > > > > > > > > > > kvm_arch_sync_events(kvm); > > > > > > > > > > + kvm_unregister_msix_mmio_dev(kvm); > > > > > > > > > > spin_lock(&kvm_lock); > > > > > list_del(&kvm->vm_list); > > > > > spin_unlock(&kvm_lock); > > > > > > > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file *filp, > > > > > > > > > > mutex_unlock(&kvm->lock); > > > > > break; > > > > > > > > > > #endif > > > > > > > > > > + case KVM_REGISTER_MSIX_MMIO: { > > > > > + struct kvm_msix_mmio_user mmio_user; > > > > > + > > > > > + r = -EFAULT; > > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > > + goto out; > > > > > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > > > > > + break; > > > > > + } > > > > > + case KVM_UNREGISTER_MSIX_MMIO: { > > > > > + struct kvm_msix_mmio_user mmio_user; > > > > > + > > > > > + r = -EFAULT; > > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > > + goto out; > > > > > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > > > > > + break; > > > > > + } > > > > > > > > > > default: > > > > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > > > > if (r == -ENOTTY) > > > > > > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > > > > > > > > > > return r; > > > > > > > > > > } > > > > > > > > > > #endif > > > > > > > > > > + r = kvm_register_msix_mmio_dev(kvm); > > > > > + if (r < 0) { > > > > > + kvm_put_kvm(kvm); > > > > > + return r; > > > > > + } > > > > > + > > > > > > > > > > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > > > > > if (r < 0) > > > > > > > > > > kvm_put_kvm(kvm); > > > > > > > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct > > > > > kvm_io_bus *bus) > > > > > > > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t > > > > > addr, > > > > > > > > > > int len, const void *val) > > > > > > > > > > { > > > > > > > > > > - int i; > > > > > + int i, r = -EOPNOTSUPP; > > > > > > > > > > struct kvm_io_bus *bus; > > > > > > > > > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > > > > > > > > > > - for (i = 0; i < bus->dev_count; i++) > > > > > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > > > > > + for (i = 0; i < bus->dev_count; i++) { > > > > > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > > > > > + if (r == -ENOTSYNC) > > > > > + 
break; > > > > > + else if (!r) > > > > > > > > > > return 0; > > > > So the above will be > > if (r >= 0) > > return r; > > > > > > > - return -EOPNOTSUPP; > > > > > + } > > > > > + return r; > > > > > > > > > > } > > > > > > > > > > /* kvm_io_bus_read - called under kvm->slots_lock */ > > > > > > > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > > > > > new file mode 100644 > > > > > index 0000000..dde9b62 > > > > > --- /dev/null > > > > > +++ b/virt/kvm/msix_mmio.c > > > > > @@ -0,0 +1,286 @@ > > > > > +/* > > > > > + * MSI-X MMIO emulation > > > > > + * > > > > > + * Copyright (c) 2010 Intel Corporation > > > > > + * > > > > > + * This work is licensed under the terms of the GNU GPL, version 2. > > > > > See + * the COPYING file in the top-level directory. > > > > > + * > > > > > + * Author: > > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > > + */ > > > > > + > > > > > +#include <linux/kvm_host.h> > > > > > +#include <linux/kvm.h> > > > > > + > > > > > +#include "msix_mmio.h" > > > > > +#include "iodev.h" > > > > > + > > > > > +static int update_msix_mask_bit(struct kvm *kvm, struct > > > > > kvm_msix_mmio *mmio, + int entry, u32 flag) > > > > > +{ > > > > > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > > + return kvm_assigned_device_update_msix_mask_bit(kvm, > > > > > + mmio->dev_id, entry, flag); > > > > > + return -EFAULT; > > > > > +} > > > > > + > > > > > +/* Caller must hold dev->lock */ > > > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > > > > > + gpa_t addr, int len) > > > > > +{ > > > > > + gpa_t start, end; > > > > > + int i, r = -EINVAL; > > > > > + > > > > > + for (i = 0; i < dev->mmio_nr; i++) { > > > > > + start = dev->mmio[i].table_base_addr; > > > > > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > > > > > + dev->mmio[i].max_entries_nr; > > > > > + if (addr >= start && addr + len <= end) { > > > > > + r = i; > > > > > + break; > > > > > + } > > > > > > > > Yet another potential user of the weight balanced tree ;) > > > > > > Yeah, we can improve it later. > > > > > > > > + } > > > > > + > > > > > + return r; > > > > > +} > > > > > + > > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t > > > > > addr, int len, + void *val) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > + struct kvm_msix_mmio *mmio; > > > > > + int idx, ret = 0, entry, offset, r; > > > > > + > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > + if (idx < 0) { > > > > > + ret = -EOPNOTSUPP; > > > > > + goto out; > > > > > + } > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > + goto out; > > > > > + > > > > > + offset = addr & 0xf; > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > + goto out; > > > > This is not as strict as it could be. Also > > see below (in write) for an idea how to make the code clearer. > > > > > > > + > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > > > + if (r) > > > > > + goto out; > > > > Want to give the caller some indication of an error? > > If not add a comment why. > > I don't think we have a proper way to indicate the error now. 
Then let's add a comment, and skip the goto - it's just confusing: /* Ignore return value, we don't have a way to indicate the * error. */ r = copy_from_user(val, (void __user *)(mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > Also the > table_base_va already passed the checking when MMIO registered. We can add more > error info later. > > > > > > > +out: > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > + return ret; > > > > > +} > > > > > + > > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t > > > > > addr, + int len, const void *val) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > + struct kvm_msix_mmio *mmio; > > > > > + int idx, entry, offset, ret = 0, r = 0; > > > > > + gpa_t entry_base; > > > > > + u32 old_ctrl, new_ctrl; > > > > These are really __le32. Pls make them so and then > > fix up the sparse errors. > > > > > > > + u32 *ctrl_pos; > > > > get_user should not be done on u32 *, it should be > > __user. Does not sparse complain about this? > > > > Haven't use sparse to check the patches. Would do it later. Please do. It's a lot of code with tricky pointer math, must make sure we are not accessing userspace pointers directly etc. > > > > > + > > > > > + mutex_lock(&mmio_dev->kvm->lock); > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > + if (idx < 0) { > > > > > + ret = -EOPNOTSUPP; > > > > > + goto out; > > > > > + } > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > + goto out; > > > > > + > > > > > + offset = addr & 0xF; > > > > > > > > nit, this 0xf above. > > > > > > So you like another mask? > > > > Yes, this should be > > > > offset = addr % PCI_MSIX_ENTRY_SIZE; > > > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > + goto out; > > > > I think it's a good idea to be as strict as possible. > > Also let's quote the spec. Finally using % will make > > the logic more transparent. How about the below? > > > > /* For all accesses to MSI-X Table and MSI-X PBA fields, software must use > > aligned full DWORD or aligned full QWORD transactions; otherwise, the > > result is undefined. */ > > if (len == 4) { > > if (addr % 4) > > goto out; > > } else if (len == 8) { > > if (addr % 8) > > goto out; > > } else > > goto out; > > > > As read needs same logic it's likely a good idea > > to add a function for this. > > > > > > > + > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > > > + > > > > > + if (get_user(old_ctrl, ctrl_pos)) > > > > It's probably a good idea to do access_ok check for the whole > > entry here, and then use __get_user etc below, these are cheaper. > > table_base_va already passed access_ok checking. I am not sure if we need to do it > again. Yes but the process invoking the ioctl can be different from the one invoking the mmio. > > > > > > > + goto out; > > > > > + > > > > > + /* No allow writing to other fields when entry is unmasked */ > > > > Do not allow. 
> > > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > > + goto out; > > > > > + > > > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > > > any such cast is suspect. > > Is entry_base really gpa? If it was you can not cast it to __user. > > HVA. Then don't declare it gpa_t > > I think it should be void __user *. > > > > > > > + goto out; > > > > > + > > > > > + if (get_user(new_ctrl, ctrl_pos)) > > > > > + goto out; > > > > get_user assumes aligned address. But there's no guarantee ctrl_pos > > is aligned because table_base_va might be misaligned. > > table_base_va already passed access_ok() check. access_ok done on table_base_va only checks that the address is not a kernel one. > > > > > > > + > > > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > > > + ret = -ENOTSYNC; > > > > Better to use != and not <, effectively same. > > Or in the above we can do: > > bool control; > > if (len == 4) { > > if (addr % 4) > > goto out; > > control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL; > > } else if (len == 8) { > > if (addr % 8) > > goto out; > > control = offset == PCI_MSIX_ENTRY_VECTOR_DATA; > > } else > > goto out; > > > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > > > first? I'd also prefer to see a direct goto out here since it's not > > > > entirely obvious that this first 'if' always falls into the below 'if'. > > > > I'm not a fan of using an error code as a special non-error return > > > > either. > > > > > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check > > > new_ctrl to see if they indeed modified ctrl bits. > > > > > > I would try to make the logic here more clear. > > > > > > > > + if (old_ctrl == new_ctrl) > > > > > + goto out; > > > > > + > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > > > > > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); > > > > > > > > An xor would remove some redundancy here > > > > > > > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = update_msix_mask_bit(mmio_dev->kvm, > > > > mmio, entry, !!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)); > > > > > > Oh, yes, sure... > > > > Also why not set ret to correct value directly? > > > > > > > + if (r || ret) > > > > > + ret = -ENOTSYNC; > > > > when is ret set? And why not to the correct value directly? 
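
Putting Alex's xor together with Michael's point about setting ret directly: since (old_ctrl ^ new_ctrl) & MASK tests the same thing as (old_ctrl & MASK) ^ (new_ctrl & MASK), the tail of the write handler could collapse to the sketch below, with ret assigned its final value in one step (endianness handling per the __le32 comment is left out here; see the consolidated sketch at the end of this thread):

	if ((old_ctrl ^ new_ctrl) & PCI_MSIX_ENTRY_CTRL_MASKBIT) {
		r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry,
				!!(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT));
		if (r)
			ret = -ENOTSYNC;	/* mask update failed, punt to userspace */
	}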
> > > > > > > +out: > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > + mutex_unlock(&mmio_dev->kvm->lock); > > > > > + return ret; > > > > > +} > > > > > + > > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > > > > > + .read = msix_table_mmio_read, > > > > > + .write = msix_table_mmio_write, > > > > > +}; > > > > > + > > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > > > > > +{ > > > > > + int ret; > > > > > + > > > > > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, > > > > > &msix_mmio_table_ops); + mutex_init(&kvm->msix_mmio_dev.lock); > > > > > + kvm->msix_mmio_dev.kvm = kvm; > > > > > + mutex_lock(&kvm->slots_lock); > > > > > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > > > > + &kvm->msix_mmio_dev.table_dev); > > > > > + mutex_unlock(&kvm->slots_lock); > > > > > + return ret; > > > > > +} > > > > > + > > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > > > > > +{ > > > > > + int ret; > > > > > + > > > > > + mutex_lock(&kvm->slots_lock); > > > > > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > > > > + &kvm->msix_mmio_dev.table_dev); > > > > > + mutex_unlock(&kvm->slots_lock); > > > > > + return ret; > > > > > +} > > > > > + > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > > + struct kvm_msix_mmio *mmio = NULL; > > > > > + int r = 0, i; > > > > > + > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > > > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > > > > + mmio = &mmio_dev->mmio[i]; > > > > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + break; > > > > > + } > > > > > + } > > > > > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + /* All reserved currently */ > > > > > + if (mmio_user->flags) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + > > > > > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > > > > > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > > > > > + r = -EINVAL; > > > > > + goto out; > > > > > + } > > > > > + if (!mmio) { > > > > > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > > > > > + r = -ENOSPC; > > > > > + goto out; > > > > > + } > > > > > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > > > > > + } > > > > > + > > > > > + mmio_dev->mmio_nr++; > > > > > + mmio->max_entries_nr = mmio_user->max_entries_nr; > > > > > + mmio->dev_id = mmio_user->dev_id; > > > > > + mmio->flags = mmio_user->flags; > > > > > + > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > > + 
mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > > + mmio->table_base_addr = mmio_user->base_addr; > > > > > + mmio->table_base_va = mmio_user->base_va; > > > > Also check alignment for these values. base_va should be > > at least 4 byte aligned. Maybe more, need to think about it ... > > Look in spec for table_base_addr alignment. > > OK. > > -- > regards > Yang, Sheng > > > > > > > > + } > > > > > +out: > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > + return r; > > > > > +} > > > > > + > > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio *mmio) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > > + int r = 0, i, j; > > > > > > > > set r = -EINVAL here > > > > > > > > > + bool found = 0; > > > > > + > > > > > + if (!mmio) > > > > > + return 0; > > > > > + > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > > > > > + mmio_dev->mmio[i].type == mmio->type) { > > > > > + found = true; > > > > > > > > set r = 0 here > > > > > > > > > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > > > > > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > > > > > + mmio_dev->mmio_nr--; > > > > > + break; > > > > > + } > > > > > + } > > > > > + if (!found) > > > > > + r = -EINVAL; > > > > > > > > then get rid of found > > > > > > Sure. > > > > > > -- > > > regards > > > Yang, Sheng > > > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > + return r; > > > > > +} > > > > > + > > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > > +{ > > > > > + struct kvm_msix_mmio mmio; > > > > > + > > > > > + mmio.dev_id = mmio_user->dev_id; > > > > > + mmio.type = mmio_user->type; > > > > > + > > > > > + return kvm_free_msix_mmio(kvm, &mmio); > > > > > +} > > > > > + > > > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > > > > > new file mode 100644 > > > > > index 0000000..01b6587 > > > > > --- /dev/null > > > > > +++ b/virt/kvm/msix_mmio.h > > > > > @@ -0,0 +1,25 @@ > > > > > +#ifndef __KVM_MSIX_MMIO_H__ > > > > > +#define __KVM_MSIX_MMIO_H__ > > > > > +/* > > > > > + * MSI-X MMIO emulation > > > > > + * > > > > > + * Copyright (c) 2010 Intel Corporation > > > > > + * > > > > > + * This work is licensed under the terms of the GNU GPL, version 2. > > > > > See + * the COPYING file in the top-level directory. > > > > > + * > > > > > + * Author: > > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > > + */ > > > > > + > > > > > +#include <linux/pci.h> > > > > > + > > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > > > > *mmio_user); + > > > > > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
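
On the alignment point in this message: the MSI-X Table Offset register keeps the BIR in its low three bits, so the table's guest-physical base is QWORD-aligned by construction; requiring the same of base_va keeps the get_user()/put_user() accesses on the control word naturally aligned. A sketch of the registration-time check (8-byte alignment for both values is assumed here as the conservative reading of the spec):

	if ((mmio_user->base_addr & 0x7) || (mmio_user->base_va & 0x7)) {
		r = -EINVAL;
		goto out;
	}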
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-24 10:11 ` Michael S. Tsirkin @ 2011-02-25 5:54 ` Sheng Yang 0 siblings, 0 replies; 20+ messages in thread From: Sheng Yang @ 2011-02-25 5:54 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Thursday 24 February 2011 18:11:44 Michael S. Tsirkin wrote: > On Thu, Feb 24, 2011 at 04:08:22PM +0800, Sheng Yang wrote: > > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > > Then we can support mask bit operation of assigned devices now. > > > > > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > > > should be able to hook this into vfio with a small stub in kvm. We > > > > > just need to be able to communicate disabling and enabling of > > > > > individual msix vectors. For brute force, we could do this with a > > > > > lot of eventfds, triggered by kvm and consumed by vfio, two per > > > > > MSI-X vector. Not sure if there's something smaller that could do > > > > > it. Thanks, > > > > > > > > Alex, thanks for your comments. See my comments below: > > > > > Alex > > > > > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > > > --- > > > > > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > > > arch/x86/kvm/x86.c | 8 +- > > > > > > include/linux/kvm.h | 21 ++++ > > > > > > include/linux/kvm_host.h | 25 ++++ > > > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > > > virt/kvm/msix_mmio.c | 286 > > > > > > ++++++++++++++++++++++++++++++++++++++++++++++ > > > > > > virt/kvm/msix_mmio.h > > > > > > > > > > > > | 25 ++++ > > > > > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > > > index f15501f..3a0d851 100644 > > > > > > --- a/arch/x86/kvm/Makefile > > > > > > +++ b/arch/x86/kvm/Makefile > > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
> > > > > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o > > > > \ > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > > > > > - assigned-dev.o) > > > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, > > > > > > iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix > > > > > > ../../../virt/kvm/, async_pf.o) > > > > > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > > > index fa708c9..89bf12c 100644 > > > > > > --- a/arch/x86/kvm/x86.c > > > > > > +++ b/arch/x86/kvm/x86.c > > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > > > case KVM_CAP_XSAVE: > > > > > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > > > + case KVM_CAP_MSIX_MMIO: > > > > > > r = 1; > > > > > > break; > > > > > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > > > @@ -3807,6 +3808,7 @@ static int > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > > > > > { > > > > > > > > > > > > gpa_t gpa; > > > > > > > > > > > > + int r; > > > > > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > mmio: > > > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > > > > > /* > > > > > > > > > > > > * Is this MMIO handled locally? > > > > > > */ > > > > > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > > > + if (!r) > > > > > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > > > This use of -ENOTSYNC is IMO confusing. > > > How about we make vcpu_mmio_write return the positive exit reason? > > > Negative value will mean an error. > > > > Make sense. I would update it. 
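
The positive-exit-reason convention propagates naturally through the bus layer as well; with Michael's "if (r >= 0) return r;" simplification from elsewhere in the thread, kvm_io_bus_write() could become (a sketch):

int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		     int len, const void *val)
{
	int i, r = -EOPNOTSUPP;
	struct kvm_io_bus *bus;

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	for (i = 0; i < bus->dev_count; i++) {
		r = kvm_iodevice_write(bus->devs[i], addr, len, val);
		/* 0: handled in kernel; > 0: exit reason for userspace */
		if (r >= 0)
			return r;
	}
	return r;	/* last error; -EOPNOTSUPP if nobody claimed it */
}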
> > > > > > > > vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; > > > > > > vcpu->run->mmio.len = vcpu->mmio_size = bytes; > > > > > > vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; > > > > > > > > > > > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > > > > > index ea2dc1a..ad9df4b 100644 > > > > > > --- a/include/linux/kvm.h > > > > > > +++ b/include/linux/kvm.h > > > > > > @@ -161,6 +161,7 @@ struct kvm_pit_config { > > > > > > > > > > > > #define KVM_EXIT_NMI 16 > > > > > > #define KVM_EXIT_INTERNAL_ERROR 17 > > > > > > #define KVM_EXIT_OSI 18 > > > > > > > > > > > > +#define KVM_EXIT_MSIX_ROUTING_UPDATE 19 > > > > > > > > > > > > /* For KVM_EXIT_INTERNAL_ERROR */ > > > > > > #define KVM_INTERNAL_ERROR_EMULATION 1 > > > > > > > > > > > > @@ -541,6 +542,7 @@ struct kvm_ppc_pvinfo { > > > > > > > > > > > > #define KVM_CAP_PPC_GET_PVINFO 57 > > > > > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > > > > > > #define KVM_CAP_ASYNC_PF 59 > > > > > > > > > > > > +#define KVM_CAP_MSIX_MMIO 60 > > > > > > > > > > > > #ifdef KVM_CAP_IRQ_ROUTING > > > > > > > > > > > > @@ -672,6 +674,9 @@ struct kvm_clock_data { > > > > > > > > > > > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > > > > > kvm_xen_hvm_config) #define KVM_SET_CLOCK > > > > > > _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK > > > > > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > > > > > > > > > > > > +/* Available with KVM_CAP_MSIX_MMIO */ > > > > > > +#define KVM_REGISTER_MSIX_MMIO _IOW(KVMIO, 0x7d, struct > > > > > > kvm_msix_mmio_user) +#define KVM_UNREGISTER_MSIX_MMIO > > > > > > _IOW(KVMIO, 0x7e, struct kvm_msix_mmio_user) > > > > > > > > > > > > /* Available with KVM_CAP_PIT_STATE2 */ > > > > > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > > > > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, > > > > > > 0xa0, struct kvm_pit_state2) > > > > > > > > > > > > @@ -795,4 +800,20 @@ struct kvm_assigned_msix_entry { > > > > > > > > > > > > __u16 padding[3]; > > > > > > > > > > > > }; > > > > > > > > > > > > +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) > > > > > > + > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) > > > > > > + > > > > > > +#define KVM_MSIX_MMIO_TYPE_DEV_MASK 0x00ff > > > > > > +#define KVM_MSIX_MMIO_TYPE_BASE_MASK 0xff00 > > > > > > +struct kvm_msix_mmio_user { > > > > > > + __u32 dev_id; > > > > > > + __u16 type; > > > > > > + __u16 max_entries_nr; > > > > > > + __u64 base_addr; > > > > > > + __u64 base_va; > > > > > > + __u64 flags; > > > > > > + __u64 reserved[4]; > > > > > > +}; > > > > > > + > > > > > > > > > > > > #endif /* __LINUX_KVM_H */ > > > > > > > > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > > > > > index 7d313e0..c10670c 100644 > > > > > > --- a/include/linux/kvm_host.h > > > > > > +++ b/include/linux/kvm_host.h > > > > > > @@ -233,6 +233,27 @@ struct kvm_memslots { > > > > > > > > > > > > KVM_PRIVATE_MEM_SLOTS]; > > > > > > > > > > > > }; > > > > > > > > > > > > +#define KVM_MSIX_MMIO_MAX 32 > > > > > > > > > > I assume this is based on 32 devices/bus, but we could exceed this > > > > > is we add more buses or make use of more functions. How hard is > > > > > it to make this dynamic? > > > > > > > > We can improve this later(maybe with balance tree?). 32 should be > > > > more than sufficient for now. And this patchset has been in the > > > > mailing list for months, I really want to get it in first... 
> > > > > > > > > > + > > > > > > +struct kvm_msix_mmio { > > > > > > + u32 dev_id; > > > > > > + u16 type; > > > > > > + u16 max_entries_nr; > > > > > > + u64 flags; > > > > > > + gpa_t table_base_addr; > > > > > > + hva_t table_base_va; > > > > > > + gpa_t pba_base_addr; > > > > > > + hva_t pba_base_va; > > > > > > +}; > > > > > > + > > > > > > +struct kvm_msix_m4mio_dev { > > > > > > + struct kvm *kvm; > > > > > > + struct kvm_io_device table_dev; > > > > > > + int mmio_nr; > > > > > > + struct kvm_msix_mmio mmio[KVM_MSIX_MMIO_MAX]; > > > > > > + struct mutex lock; > > > > > > +}; > > > > > > + > > > > > > > > > > > > struct kvm { > > > > > > > > > > > > spinlock_t mmu_lock; > > > > > > raw_spinlock_t requests_lock; > > > > > > > > > > > > @@ -281,6 +302,7 @@ struct kvm { > > > > > > > > > > > > long mmu_notifier_count; > > > > > > > > > > > > #endif > > > > > > > > > > > > long tlbs_dirty; > > > > > > > > > > > > + struct kvm_msix_mmio_dev msix_mmio_dev; > > > > > > > > > > > > }; > > > > > > > > > > > > /* The guest did something we don't support. */ > > > > > > > > > > > > @@ -553,6 +575,9 @@ void kvm_unregister_irq_ack_notifier(struct > > > > > > kvm *kvm, > > > > > > > > > > > > int kvm_request_irq_source_id(struct kvm *kvm); > > > > > > void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); > > > > > > > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > > > + int assigned_dev_id, int entry, bool mask); > > > > > > + > > > > > > > > > > > > /* For vcpu->arch.iommu_flags */ > > > > > > #define KVM_IOMMU_CACHE_COHERENCY 0x1 > > > > > > > > > > > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > > > > > index ae72ae6..d1598a6 100644 > > > > > > --- a/virt/kvm/assigned-dev.c > > > > > > +++ b/virt/kvm/assigned-dev.c > > > > > > @@ -18,6 +18,7 @@ > > > > > > > > > > > > #include <linux/interrupt.h> > > > > > > #include <linux/slab.h> > > > > > > #include "irq.h" > > > > > > > > > > > > +#include "msix_mmio.h" > > > > > > > > > > > > static struct kvm_assigned_dev_kernel > > > > > > *kvm_find_assigned_dev(struct list_head *head, > > > > > > > > > > > > int assigned_dev_id) > > > > > > > > > > > > @@ -191,12 +192,25 @@ static void kvm_free_assigned_irq(struct > > > > > > kvm *kvm, > > > > > > > > > > > > kvm_deassign_irq(kvm, assigned_dev, > > > > > > assigned_dev->irq_requested_type); > > > > > > > > > > > > } > > > > > > > > > > > > +static void assigned_device_free_msix_mmio(struct kvm *kvm, > > > > > > + struct kvm_assigned_dev_kernel *adev) > > > > > > +{ > > > > > > + struct kvm_msix_mmio mmio; > > > > > > + > > > > > > + mmio.dev_id = adev->assigned_dev_id; > > > > > > + mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV | > > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > > > + kvm_free_msix_mmio(kvm, &mmio); > > > > > > +} > > > > > > + > > > > > > > > > > > > static void kvm_free_assigned_device(struct kvm *kvm, > > > > > > > > > > > > struct kvm_assigned_dev_kernel > > > > > > *assigned_dev) > > > > > > > > > > > > { > > > > > > > > > > > > kvm_free_assigned_irq(kvm, assigned_dev); > > > > > > > > > > > > + assigned_device_free_msix_mmio(kvm, assigned_dev); > > > > > > + > > > > > > > > > > > > __pci_reset_function(assigned_dev->dev); > > > > > > pci_restore_state(assigned_dev->dev); > > > > > > > > > > > > @@ -785,3 +799,33 @@ out: > > > > > > return r; > > > > > > > > > > > > } > > > > > > > > > > > > +/* The caller should hold kvm->lock */ > > > > > > +int kvm_assigned_device_update_msix_mask_bit(struct kvm *kvm, > > > > 
> > + int assigned_dev_id, int entry, bool mask) > > > > > > +{ > > > > > > + int r = -EFAULT; > > > > > > + struct kvm_assigned_dev_kernel *adev; > > > > > > + int i; > > > > > > + > > > > > > + if (!irqchip_in_kernel(kvm)) > > > > > > + return r; > > > > > > + > > > > > > + adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > > > > > + assigned_dev_id); > > > > > > + if (!adev) > > > > > > + goto out; > > > > > > + > > > > > > + /* For non-MSIX enabled devices, entries_nr == 0 */ > > > > > > + for (i = 0; i < adev->entries_nr; i++) > > > > > > + if (adev->host_msix_entries[i].entry == entry) { > > > > > > + if (mask) > > > > > > + disable_irq_nosync( > > > > > > + adev->host_msix_entries[i].vector); > > > > > > + else > > > > > > + enable_irq(adev->host_msix_entries[i].vector); > > > > > > + r = 0; > > > > > > + break; > > > > > > + } > > > > > > +out: > > > > > > + return r; > > > > > > +} > > > > > > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c > > > > > > index b1b6cbb..b7807c8 100644 > > > > > > --- a/virt/kvm/kvm_main.c > > > > > > +++ b/virt/kvm/kvm_main.c > > > > > > @@ -56,6 +56,7 @@ > > > > > > > > > > > > #include "coalesced_mmio.h" > > > > > > #include "async_pf.h" > > > > > > > > > > > > +#include "msix_mmio.h" > > > > > > > > > > > > #define CREATE_TRACE_POINTS > > > > > > #include <trace/events/kvm.h> > > > > > > > > > > > > @@ -509,6 +510,7 @@ static void kvm_destroy_vm(struct kvm *kvm) > > > > > > > > > > > > struct mm_struct *mm = kvm->mm; > > > > > > > > > > > > kvm_arch_sync_events(kvm); > > > > > > > > > > > > + kvm_unregister_msix_mmio_dev(kvm); > > > > > > > > > > > > spin_lock(&kvm_lock); > > > > > > list_del(&kvm->vm_list); > > > > > > spin_unlock(&kvm_lock); > > > > > > > > > > > > @@ -1877,6 +1879,24 @@ static long kvm_vm_ioctl(struct file > > > > > > *filp, > > > > > > > > > > > > mutex_unlock(&kvm->lock); > > > > > > break; > > > > > > > > > > > > #endif > > > > > > > > > > > > + case KVM_REGISTER_MSIX_MMIO: { > > > > > > + struct kvm_msix_mmio_user mmio_user; > > > > > > + > > > > > > + r = -EFAULT; > > > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > > > + goto out; > > > > > > + r = kvm_vm_ioctl_register_msix_mmio(kvm, &mmio_user); > > > > > > + break; > > > > > > + } > > > > > > + case KVM_UNREGISTER_MSIX_MMIO: { > > > > > > + struct kvm_msix_mmio_user mmio_user; > > > > > > + > > > > > > + r = -EFAULT; > > > > > > + if (copy_from_user(&mmio_user, argp, sizeof mmio_user)) > > > > > > + goto out; > > > > > > + r = kvm_vm_ioctl_unregister_msix_mmio(kvm, &mmio_user); > > > > > > + break; > > > > > > + } > > > > > > > > > > > > default: > > > > > > r = kvm_arch_vm_ioctl(filp, ioctl, arg); > > > > > > if (r == -ENOTTY) > > > > > > > > > > > > @@ -1988,6 +2008,12 @@ static int kvm_dev_ioctl_create_vm(void) > > > > > > > > > > > > return r; > > > > > > > > > > > > } > > > > > > > > > > > > #endif > > > > > > > > > > > > + r = kvm_register_msix_mmio_dev(kvm); > > > > > > + if (r < 0) { > > > > > > + kvm_put_kvm(kvm); > > > > > > + return r; > > > > > > + } > > > > > > + > > > > > > > > > > > > r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); > > > > > > if (r < 0) > > > > > > > > > > > > kvm_put_kvm(kvm); > > > > > > > > > > > > @@ -2223,14 +2249,18 @@ static void kvm_io_bus_destroy(struct > > > > > > kvm_io_bus *bus) > > > > > > > > > > > > int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, > > > > > > gpa_t addr, > > > > > > > > > > > > int len, const void *val) > > > > > > > > > > > > { > > > > > > 
> > > > > > - int i; > > > > > > + int i, r = -EOPNOTSUPP; > > > > > > > > > > > > struct kvm_io_bus *bus; > > > > > > > > > > > > bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); > > > > > > > > > > > > - for (i = 0; i < bus->dev_count; i++) > > > > > > - if (!kvm_iodevice_write(bus->devs[i], addr, len, val)) > > > > > > + for (i = 0; i < bus->dev_count; i++) { > > > > > > + r = kvm_iodevice_write(bus->devs[i], addr, len, val); > > > > > > + if (r == -ENOTSYNC) > > > > > > + break; > > > > > > + else if (!r) > > > > > > > > > > > > return 0; > > > > > > So the above will be > > > > > > if (r >= 0) > > > > > > return r; > > > > > > > > > - return -EOPNOTSUPP; > > > > > > + } > > > > > > + return r; > > > > > > > > > > > > } > > > > > > > > > > > > /* kvm_io_bus_read - called under kvm->slots_lock */ > > > > > > > > > > > > diff --git a/virt/kvm/msix_mmio.c b/virt/kvm/msix_mmio.c > > > > > > new file mode 100644 > > > > > > index 0000000..dde9b62 > > > > > > --- /dev/null > > > > > > +++ b/virt/kvm/msix_mmio.c > > > > > > @@ -0,0 +1,286 @@ > > > > > > +/* > > > > > > + * MSI-X MMIO emulation > > > > > > + * > > > > > > + * Copyright (c) 2010 Intel Corporation > > > > > > + * > > > > > > + * This work is licensed under the terms of the GNU GPL, version > > > > > > 2. See + * the COPYING file in the top-level directory. > > > > > > + * > > > > > > + * Author: > > > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > > > + */ > > > > > > + > > > > > > +#include <linux/kvm_host.h> > > > > > > +#include <linux/kvm.h> > > > > > > + > > > > > > +#include "msix_mmio.h" > > > > > > +#include "iodev.h" > > > > > > + > > > > > > +static int update_msix_mask_bit(struct kvm *kvm, struct > > > > > > kvm_msix_mmio *mmio, + int entry, u32 flag) > > > > > > +{ > > > > > > + if (mmio->type & KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > > > + return kvm_assigned_device_update_msix_mask_bit(kvm, > > > > > > + mmio->dev_id, entry, flag); > > > > > > + return -EFAULT; > > > > > > +} > > > > > > + > > > > > > +/* Caller must hold dev->lock */ > > > > > > +static int get_mmio_table_index(struct kvm_msix_mmio_dev *dev, > > > > > > + gpa_t addr, int len) > > > > > > +{ > > > > > > + gpa_t start, end; > > > > > > + int i, r = -EINVAL; > > > > > > + > > > > > > + for (i = 0; i < dev->mmio_nr; i++) { > > > > > > + start = dev->mmio[i].table_base_addr; > > > > > > + end = dev->mmio[i].table_base_addr + PCI_MSIX_ENTRY_SIZE * > > > > > > + dev->mmio[i].max_entries_nr; > > > > > > + if (addr >= start && addr + len <= end) { > > > > > > + r = i; > > > > > > + break; > > > > > > + } > > > > > > > > > > Yet another potential user of the weight balanced tree ;) > > > > > > > > Yeah, we can improve it later. 
> > > > > > > > > > + } > > > > > > + > > > > > > + return r; > > > > > > +} > > > > > > + > > > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, > > > > > > gpa_t addr, int len, + void *val) > > > > > > +{ > > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > > + struct kvm_msix_mmio *mmio; > > > > > > + int idx, ret = 0, entry, offset, r; > > > > > > + > > > > > > + mutex_lock(&mmio_dev->lock); > > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > > + if (idx < 0) { > > > > > > + ret = -EOPNOTSUPP; > > > > > > + goto out; > > > > > > + } > > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > > + goto out; > > > > > > + > > > > > > + offset = addr & 0xf; > > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > > + goto out; > > > > > > This is not as strict as it could be. Also > > > see below (in write) for an idea how to make the code clearer. > > > > > > > > > + > > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > > > > + if (r) > > > > > > + goto out; > > > > > > Want to give the caller some indication of an error? > > > If not add a comment why. > > > > I don't think we have a proper way to indicate the error now. > > Then let's add a comment, and skip the goto - it's just confusing: > > /* Ignore return value, we don't have a way to indicate the > * error. */ > r = copy_from_user(val, (void __user *)(mmio->table_base_va + > entry * PCI_MSIX_ENTRY_SIZE + offset), len); Um... I really think it's quite obvious in the code. > > > Also the > > table_base_va already passed the checking when MMIO registered. We can > > add more error info later. > > > > > > > > +out: > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > > + return ret; > > > > > > +} > > > > > > + > > > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, > > > > > > gpa_t addr, + int len, const void *val) > > > > > > +{ > > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > > + struct kvm_msix_mmio *mmio; > > > > > > + int idx, entry, offset, ret = 0, r = 0; > > > > > > + gpa_t entry_base; > > > > > > + u32 old_ctrl, new_ctrl; > > > > > > These are really __le32. Pls make them so and then > > > fix up the sparse errors. > > > > > > > > > + u32 *ctrl_pos; > > > > > > get_user should not be done on u32 *, it should be > > > __user. Does not sparse complain about this? > > > > Haven't use sparse to check the patches. Would do it later. > > Please do. It's a lot of code with tricky pointer math, must make sure > we are not accessing userspace pointers directly etc. All following patches has been checked. > > > > > > > + > > > > > > + mutex_lock(&mmio_dev->kvm->lock); > > > > > > + mutex_lock(&mmio_dev->lock); > > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > > + if (idx < 0) { > > > > > > + ret = -EOPNOTSUPP; > > > > > > + goto out; > > > > > > + } > > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > > + goto out; > > > > > > + > > > > > > + offset = addr & 0xF; > > > > > > > > > > nit, this 0xf above. > > > > > > > > So you like another mask? 
> > > > > > Yes, this should be > > > > > > offset = addr % PCI_MSIX_ENTRY_SIZE; > > > > > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > > + goto out; > > > I think it's a good idea to be as strict as possible. > > > Also let's quote the spec. Finally using % will make > > > the logic more transparent. How about the below? > > > > > > /* For all accesses to MSI-X Table and MSI-X PBA fields, software must > > > use > > > > > > aligned full DWORD or aligned full QWORD transactions; otherwise, > > > the result is undefined. */ > > > > > > if (len == 4) { > > > > > > if (addr % 4) > > > > > > goto out; > > > > > > } else if (len == 8) { > > > > > > if (addr % 8) > > > > > > goto out; > > > > > > } else > > > > > > goto out; > > > > > > As read needs same logic it's likely a good idea > > > to add a function for this. > > > > > > > > > + > > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > > > > + > > > > > > + if (get_user(old_ctrl, ctrl_pos)) > > > It's probably a good idea to do access_ok check for the whole > > > entry here, and then use __get_user etc below, these are cheaper. > > > > table_base_va already passed access_ok checking. I am not sure if we need > > to do it again. > > Yes but the process invoking the ioctl can be different from the one > invoking the mmio. I don't understand why it matters here... Oh, I found I missed your original point. You mean we can check the whole entry and call a cheaper __get_user instead of get_user. Well, we can leave this kind of tweak for later... > > > > > > > + goto out; > > > > > > + > > > > > > + /* No allow writing to other fields when entry is unmasked */ > > > Do not allow. > > > > > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > > > + goto out; > > > > > > + > > > > > > + if (copy_to_user((void __user *)(entry_base + offset), val, > > > > > > len)) > > > any such cast is suspect. > > > Is entry_base really gpa? If it was you can not cast it to __user. > > > > HVA. > > Then don't declare it gpa_t Oh, typo... Would update it later. > > > > I think it should be void __user *. > > > > > > > > > + goto out; > > > > > > + > > > > > > + if (get_user(new_ctrl, ctrl_pos)) > > > > > > + goto out; > > > get_user assumes aligned address. But there's no guarantee ctrl_pos > > > is aligned because table_base_va might be misaligned. > > > > table_base_va already passed access_ok() check. > > access_ok done on table_base_va only checks that the address > is not a kernel one. Now table_base_va's alignment has been checked (see the latest patchset), so I think we're fine here. -- regards Yang, Sheng > > > > > > > + > > > > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > > > > + ret = -ENOTSYNC; > > > > > > Better to use != and not <, effectively same. 
> > > > > > Or in the above we can do: > > > bool control; > > > if (len == 4) { > > > > > > if (addr % 4) > > > > > > goto out; > > > > > > control = offset == PCI_MSIX_ENTRY_VECTOR_CTRL; > > > > > > } else if (len == 8) { > > > > > > if (addr % 8) > > > > > > goto out; > > > > > > control = offset == PCI_MSIX_ENTRY_DATA; > > > > > > } else > > > > > > goto out; > > > > > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested > > > > > this first? I'd also prefer to see a direct goto out here since > > > > > it's not entirely obvious that this first 'if' always falls into > > > > > the below 'if'. I'm not a fan of using an error code as a special > > > > > non-error return either. > > > > > > > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to > > > > check new_ctrl to see if they indeed modified ctrl bits. > > > > > > > > I would try to make the logic here more clear. > > > > > > > > > > + if (old_ctrl == new_ctrl) > > > > > > + goto out; > > > > > > + > > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > > + (new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 1); > > > > > > + else if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > > + !(new_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT)) > > > > > > + r = update_msix_mask_bit(mmio_dev->kvm, mmio, entry, 0); > > > > > > > > > > An xor would remove some redundancy here > > > > > > > > > > if ((old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) ^ (new_ctrl & > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)) r = > > > > > update_msix_mask_bit(mmio_dev->kvm, mmio, entry, !!(new_ctrl & > > > > > PCI_MSIX_ENTRY_CTRL_MASKBIT)); > > > > > > > > Oh, yes, sure... > > > Also why not set ret to correct value directly? > > > > > > > > > + if (r || ret) > > > > > > + ret = -ENOTSYNC; > > > > > > when is ret set? And why not to the correct value directly? 
> > > > > > > > > +out: > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > > + mutex_unlock(&mmio_dev->kvm->lock); > > > > > > + return ret; > > > > > > +} > > > > > > + > > > > > > +static const struct kvm_io_device_ops msix_mmio_table_ops = { > > > > > > + .read = msix_table_mmio_read, > > > > > > + .write = msix_table_mmio_write, > > > > > > +}; > > > > > > + > > > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm) > > > > > > +{ > > > > > > + int ret; > > > > > > + > > > > > > + kvm_iodevice_init(&kvm->msix_mmio_dev.table_dev, > > > > > > &msix_mmio_table_ops); + mutex_init(&kvm->msix_mmio_dev.lock); > > > > > > + kvm->msix_mmio_dev.kvm = kvm; > > > > > > + mutex_lock(&kvm->slots_lock); > > > > > > + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > > > > > + &kvm->msix_mmio_dev.table_dev); > > > > > > + mutex_unlock(&kvm->slots_lock); > > > > > > + return ret; > > > > > > +} > > > > > > + > > > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm) > > > > > > +{ > > > > > > + int ret; > > > > > > + > > > > > > + mutex_lock(&kvm->slots_lock); > > > > > > + ret = kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > > > > > + &kvm->msix_mmio_dev.table_dev); > > > > > > + mutex_unlock(&kvm->slots_lock); > > > > > > + return ret; > > > > > > +} > > > > > > + > > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > > > +{ > > > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > > > + struct kvm_msix_mmio *mmio = NULL; > > > > > > + int r = 0, i; > > > > > > + > > > > > > + mutex_lock(&mmio_dev->lock); > > > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > > > + if (mmio_dev->mmio[i].dev_id == mmio_user->dev_id && > > > > > > + (mmio_dev->mmio[i].type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > > > + (mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK)) { > > > > > > + mmio = &mmio_dev->mmio[i]; > > > > > > + if (mmio->max_entries_nr != mmio_user->max_entries_nr) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + break; > > > > > > + } > > > > > > + } > > > > > > + if (mmio_user->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + /* All reserved currently */ > > > > > > + if (mmio_user->flags) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + > > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) != > > > > > > + KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) != > > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + > > > > > > + if (!access_ok(VERIFY_WRITE, mmio_user->base_va, > > > > > > + mmio_user->max_entries_nr * PCI_MSIX_ENTRY_SIZE)) { > > > > > > + r = -EINVAL; > > > > > > + goto out; > > > > > > + } > > > > > > + if (!mmio) { > > > > > > + if (mmio_dev->mmio_nr == KVM_MSIX_MMIO_MAX) { > > > > > > + r = -ENOSPC; > > > > > > + goto out; > > > > > > + } > > > > > > + mmio = &mmio_dev->mmio[mmio_dev->mmio_nr]; > > > > > > + } > > > > > > + > > > > > > + mmio_dev->mmio_nr++; > > > > > > + mmio->max_entries_nr = mmio_user->max_entries_nr; > > > > > > + mmio->dev_id = mmio_user->dev_id; > > > > > > + mmio->flags = mmio_user->flags; > > > > > > + > > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_DEV_MASK) == > > > > > > + 
KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV) > > > > > > + mmio->type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV; > > > > > > + if ((mmio_user->type & KVM_MSIX_MMIO_TYPE_BASE_MASK) == > > > > > > + KVM_MSIX_MMIO_TYPE_BASE_TABLE) { > > > > > > + mmio->type |= KVM_MSIX_MMIO_TYPE_BASE_TABLE; > > > > > > + mmio->table_base_addr = mmio_user->base_addr; > > > > > > + mmio->table_base_va = mmio_user->base_va; > > > > > > Also check alignment for these values. base_va should be > > > at least 4 byte aligned. Maybe more, need to think about it ... > > > Look in spec for table_base_addr alignment. > > > > OK. > > > > -- > > regards > > Yang, Sheng > > > > > > > > + } > > > > > > +out: > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > > + return r; > > > > > > +} > > > > > > + > > > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > > > > > *mmio) +{ > > > > > > + struct kvm_msix_mmio_dev *mmio_dev = &kvm->msix_mmio_dev; > > > > > > + int r = 0, i, j; > > > > > > > > > > set r = -EINVAL here > > > > > > > > > > > + bool found = 0; > > > > > > + > > > > > > + if (!mmio) > > > > > > + return 0; > > > > > > + > > > > > > + mutex_lock(&mmio_dev->lock); > > > > > > + BUG_ON(mmio_dev->mmio_nr > KVM_MSIX_MMIO_MAX); > > > > > > + for (i = 0; i < mmio_dev->mmio_nr; i++) { > > > > > > + if (mmio_dev->mmio[i].dev_id == mmio->dev_id && > > > > > > + mmio_dev->mmio[i].type == mmio->type) { > > > > > > + found = true; > > > > > > > > > > set r = 0 here > > > > > > > > > > > + for (j = i; j < mmio_dev->mmio_nr - 1; j++) > > > > > > + mmio_dev->mmio[j] = mmio_dev->mmio[j + 1]; > > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].max_entries_nr = 0; > > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].dev_id = 0; > > > > > > + mmio_dev->mmio[mmio_dev->mmio_nr].type = 0; > > > > > > + mmio_dev->mmio_nr--; > > > > > > + break; > > > > > > + } > > > > > > + } > > > > > > + if (!found) > > > > > > + r = -EINVAL; > > > > > > > > > > then get rid of found > > > > > > > > Sure. > > > > > > > > -- > > > > regards > > > > Yang, Sheng > > > > > > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > > + return r; > > > > > > +} > > > > > > + > > > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > > > + struct kvm_msix_mmio_user *mmio_user) > > > > > > +{ > > > > > > + struct kvm_msix_mmio mmio; > > > > > > + > > > > > > + mmio.dev_id = mmio_user->dev_id; > > > > > > + mmio.type = mmio_user->type; > > > > > > + > > > > > > + return kvm_free_msix_mmio(kvm, &mmio); > > > > > > +} > > > > > > + > > > > > > diff --git a/virt/kvm/msix_mmio.h b/virt/kvm/msix_mmio.h > > > > > > new file mode 100644 > > > > > > index 0000000..01b6587 > > > > > > --- /dev/null > > > > > > +++ b/virt/kvm/msix_mmio.h > > > > > > @@ -0,0 +1,25 @@ > > > > > > +#ifndef __KVM_MSIX_MMIO_H__ > > > > > > +#define __KVM_MSIX_MMIO_H__ > > > > > > +/* > > > > > > + * MSI-X MMIO emulation > > > > > > + * > > > > > > + * Copyright (c) 2010 Intel Corporation > > > > > > + * > > > > > > + * This work is licensed under the terms of the GNU GPL, version > > > > > > 2. See + * the COPYING file in the top-level directory. 
> > > > > > + * > > > > > > + * Author: > > > > > > + * Sheng Yang <sheng.yang@intel.com> > > > > > > + */ > > > > > > + > > > > > > +#include <linux/pci.h> > > > > > > + > > > > > > +int kvm_register_msix_mmio_dev(struct kvm *kvm); > > > > > > +int kvm_unregister_msix_mmio_dev(struct kvm *kvm); > > > > > > +int kvm_vm_ioctl_register_msix_mmio(struct kvm *kvm, > > > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > > > +int kvm_vm_ioctl_unregister_msix_mmio(struct kvm *kvm, > > > > > > + struct kvm_msix_mmio_user *mmio_user); > > > > > > +int kvm_free_msix_mmio(struct kvm *kvm, struct kvm_msix_mmio > > > > > > *mmio_user); + > > > > > > +#endif ^ permalink raw reply [flat|nested] 20+ messages in thread
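For illustration, the shared check suggested in the review above could be factored out roughly like this (a sketch of the reviewers' proposal only, not code from the series; it assumes the kernel's 16-byte PCI_MSIX_ENTRY_SIZE and the PCI_MSIX_ENTRY_VECTOR_CTRL offset of 12):

    /*
     * "For all accesses to MSI-X Table and MSI-X PBA fields, software must
     * use aligned full DWORD or aligned full QWORD transactions; otherwise,
     * the result is undefined." -- PCI spec, as quoted in the review.
     *
     * Returns 0 for a naturally aligned 4- or 8-byte access, -EINVAL
     * otherwise; *touches_ctrl is set when the access covers the Vector
     * Control DWORD, which is the last DWORD of the 16-byte entry.
     */
    static int msix_table_access_ok(gpa_t addr, int len, bool *touches_ctrl)
    {
            int offset = addr % PCI_MSIX_ENTRY_SIZE;

            if (len != 4 && len != 8)
                    return -EINVAL;
            if (addr % len)
                    return -EINVAL;
            *touches_ctrl = (offset + len == PCI_MSIX_ENTRY_VECTOR_CTRL + 4);
            return 0;
    }

Both msix_table_mmio_read() and msix_table_mmio_write() could then call this one helper, which is what the "add a function for this" remark is asking for.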
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 8:45 ` Michael S. Tsirkin 2011-02-24 8:08 ` Sheng Yang @ 2011-02-24 9:44 ` Sheng Yang 2011-02-24 10:17 ` Michael S. Tsirkin 1 sibling, 1 reply; 20+ messages in thread From: Sheng Yang @ 2011-02-24 9:44 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > Then we can support mask bit operation of assigned devices now. > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > should be able to hook this into vfio with a small stub in kvm. We > > > just need to be able to communicate disabling and enabling of > > > individual msix vectors. For brute force, we could do this with a lot > > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X > > > vector. Not sure if there's something smaller that could do it. > > > Thanks, > > > > Alex, thanks for your comments. See my comments below: > > > Alex > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > --- > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > arch/x86/kvm/x86.c | 8 +- > > > > include/linux/kvm.h | 21 ++++ > > > > include/linux/kvm_host.h | 25 ++++ > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > virt/kvm/msix_mmio.c | 286 > > > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > > > > > > > | 25 ++++ > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > index f15501f..3a0d851 100644 > > > > --- a/arch/x86/kvm/Makefile > > > > +++ b/arch/x86/kvm/Makefile > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > - assigned-dev.o) > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > > > async_pf.o) > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > index fa708c9..89bf12c 100644 > > > > --- a/arch/x86/kvm/x86.c > > > > +++ b/arch/x86/kvm/x86.c > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > case KVM_CAP_XSAVE: > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > + case KVM_CAP_MSIX_MMIO: > > > > r = 1; > > > > break; > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > @@ -3807,6 +3808,7 @@ static int > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > { > > > > > > > > gpa_t gpa; > > > > > > > > + int r; > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > mmio: > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > /* > > > > > > > > * Is this MMIO handled locally? 
> > > > */ > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > + if (!r) > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > This use of -ENOTSYNC is IMO confusing. > How about we make vcpu_mmio_write return the positive exit reason? > Negative value will mean an error. In fact currently a negative value means something more needs to be done, the same as an MMIO exit. Now I think we can keep it, or update them all later. -- regards Yang, Sheng ^ permalink raw reply [flat|nested] 20+ messages in thread
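To make the convention under discussion concrete, this is roughly how a guest write to the MSI-X table travels through this version of the series (condensed from the hunks quoted above, for illustration only):

    /* msix_table_mmio_write(): the access was consumed in the kernel, but
     * the MSI address/data changed, so userspace must refresh its routing. */
    ret = -ENOTSYNC;

    /* kvm_io_bus_write(): 0 means fully handled, -ENOTSYNC means handled
     * but userspace attention is needed, -EOPNOTSUPP means no device
     * claimed the access. */

    /* emulator_write_emulated_onepage(): map the result onto an exit. */
    r = vcpu_mmio_write(vcpu, gpa, bytes, val);
    if (!r)
            return X86EMUL_CONTINUE;
    vcpu->mmio_needed = 1;
    vcpu->run->exit_reason = (r == -ENOTSYNC) ?
            KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO;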
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-24 9:44 ` Sheng Yang @ 2011-02-24 10:17 ` Michael S. Tsirkin 2011-02-25 6:12 ` Sheng Yang 0 siblings, 1 reply; 20+ messages in thread From: Michael S. Tsirkin @ 2011-02-24 10:17 UTC (permalink / raw) To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote: > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > Then we can support mask bit operation of assigned devices now. > > > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > > should be able to hook this into vfio with a small stub in kvm. We > > > > just need to be able to communicate disabling and enabling of > > > > individual msix vectors. For brute force, we could do this with a lot > > > > of eventfds, triggered by kvm and consumed by vfio, two per MSI-X > > > > vector. Not sure if there's something smaller that could do it. > > > > Thanks, > > > > > > Alex, thanks for your comments. See my comments below: > > > > Alex > > > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > > --- > > > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > > arch/x86/kvm/x86.c | 8 +- > > > > > include/linux/kvm.h | 21 ++++ > > > > > include/linux/kvm_host.h | 25 ++++ > > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > > virt/kvm/msix_mmio.c | 286 > > > > > ++++++++++++++++++++++++++++++++++++++++++++++ virt/kvm/msix_mmio.h > > > > > > > > > > | 25 ++++ > > > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > > index f15501f..3a0d851 100644 > > > > > --- a/arch/x86/kvm/Makefile > > > > > +++ b/arch/x86/kvm/Makefile > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
> > > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o > \ > > > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > > > - assigned-dev.o) > > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) > > > > > kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, > > > > > async_pf.o) > > > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > > index fa708c9..89bf12c 100644 > > > > > --- a/arch/x86/kvm/x86.c > > > > > +++ b/arch/x86/kvm/x86.c > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > > case KVM_CAP_XSAVE: > > > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > > + case KVM_CAP_MSIX_MMIO: > > > > > r = 1; > > > > > break; > > > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > > @@ -3807,6 +3808,7 @@ static int > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > > > { > > > > > > > > > > gpa_t gpa; > > > > > > > > > > + int r; > > > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > mmio: > > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > > > /* > > > > > > > > > > * Is this MMIO handled locally? > > > > > */ > > > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > > + if (!r) > > > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > This use of -ENOTSYNC is IMO confusing. > > How about we make vcpu_mmio_write return the positive exit reason? > > Negative value will mean an error. > > In fact currently nagative value means something more need to be done, the same as > MMIO exit. So it would be if (!r) return X86EMUL_CONTINUE; vcpu->run->exit_reason = r; > Now I think we can keep it, or update them all later. The way to do this would be 1. patch to return KVM_EXIT_MMIO on mmio 2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top > > -- > regards > Yang, Sheng ^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-24 10:17 ` Michael S. Tsirkin @ 2011-02-25 6:12 ` Sheng Yang 2011-02-25 8:46 ` Michael S. Tsirkin 0 siblings, 1 reply; 20+ messages in thread From: Sheng Yang @ 2011-02-25 6:12 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Thursday 24 February 2011 18:17:34 Michael S. Tsirkin wrote: > On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote: > > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > > Then we can support mask bit operation of assigned devices now. > > > > > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > > > should be able to hook this into vfio with a small stub in kvm. We > > > > > just need to be able to communicate disabling and enabling of > > > > > individual msix vectors. For brute force, we could do this with a > > > > > lot of eventfds, triggered by kvm and consumed by vfio, two per > > > > > MSI-X vector. Not sure if there's something smaller that could do > > > > > it. Thanks, > > > > > > > > Alex, thanks for your comments. See my comments below: > > > > > Alex > > > > > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > > > --- > > > > > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > > > arch/x86/kvm/x86.c | 8 +- > > > > > > include/linux/kvm.h | 21 ++++ > > > > > > include/linux/kvm_host.h | 25 ++++ > > > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > > > virt/kvm/msix_mmio.c | 286 > > > > > > ++++++++++++++++++++++++++++++++++++++++++++++ > > > > > > virt/kvm/msix_mmio.h > > > > > > > > > > > > | 25 ++++ > > > > > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > > > index f15501f..3a0d851 100644 > > > > > > --- a/arch/x86/kvm/Makefile > > > > > > +++ b/arch/x86/kvm/Makefile > > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
> > > > > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o > > > > \ > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > > > > > - assigned-dev.o) > > > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, > > > > > > iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix > > > > > > ../../../virt/kvm/, async_pf.o) > > > > > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > > > index fa708c9..89bf12c 100644 > > > > > > --- a/arch/x86/kvm/x86.c > > > > > > +++ b/arch/x86/kvm/x86.c > > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > > > case KVM_CAP_XSAVE: > > > > > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > > > + case KVM_CAP_MSIX_MMIO: > > > > > > r = 1; > > > > > > break; > > > > > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > > > @@ -3807,6 +3808,7 @@ static int > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > > > > > { > > > > > > > > > > > > gpa_t gpa; > > > > > > > > > > > > + int r; > > > > > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > mmio: > > > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > > > > > /* > > > > > > > > > > > > * Is this MMIO handled locally? > > > > > > */ > > > > > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > > > + if (!r) > > > > > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > > > This use of -ENOTSYNC is IMO confusing. > > > How about we make vcpu_mmio_write return the positive exit reason? > > > Negative value will mean an error. > > > > In fact currently nagative value means something more need to be done, > > the same as MMIO exit. > > So it would be > if (!r) > return X86EMUL_CONTINUE; > > vcpu->run->exit_reason = r; > > > Now I think we can keep it, or update them all later. > > The way to do this would be > 1. patch to return KVM_EXIT_MMIO on mmio > 2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top It's not that straightforward. In most conditions, the reason vcpu_mmio_write() < 0 is that KVM itself is unable to complete the request. That's quite straightforward. But each handler in the chain can't decide that it would be "KVM_EXIT_MMIO"; they can only know when all of them fail to handle the access. I am not sure we want every single handler to say "I want KVM_EXIT_MMIO" instead of returning an error. We can discuss this more, but since it's not an API/ABI change, I think we can get the patch in first. -- regards Yang, Sheng > > > -- > > regards > > Yang, Sheng ^ permalink raw reply [flat|nested] 20+ messages in thread
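Sheng's point about the handler chain is easiest to see in the kvm_io_bus_write() hunk from patch 2, reproduced here in simplified form (a sketch, not the literal patch):

    int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
                         int len, const void *val)
    {
            int i, r = -EOPNOTSUPP;
            struct kvm_io_bus *bus = srcu_dereference(kvm->buses[bus_idx],
                                                      &kvm->srcu);

            for (i = 0; i < bus->dev_count; i++) {
                    r = kvm_iodevice_write(bus->devs[i], addr, len, val);
                    if (r == -ENOTSYNC)     /* handled, userspace must resync */
                            break;
                    else if (!r)            /* fully handled in the kernel */
                            return 0;
            }
            return r;       /* still -EOPNOTSUPP if every device declined */
    }

Each device only knows whether it handled the access itself; "nobody handled it, so exit to userspace with KVM_EXIT_MMIO" is a conclusion only the caller of this loop can draw.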
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-25 6:12 ` Sheng Yang @ 2011-02-25 8:46 ` Michael S. Tsirkin 0 siblings, 0 replies; 20+ messages in thread From: Michael S. Tsirkin @ 2011-02-25 8:46 UTC (permalink / raw) To: Sheng Yang; +Cc: Alex Williamson, Avi Kivity, Marcelo Tosatti, kvm On Fri, Feb 25, 2011 at 02:12:42PM +0800, Sheng Yang wrote: > On Thursday 24 February 2011 18:17:34 Michael S. Tsirkin wrote: > > On Thu, Feb 24, 2011 at 05:44:20PM +0800, Sheng Yang wrote: > > > On Wednesday 23 February 2011 16:45:37 Michael S. Tsirkin wrote: > > > > On Wed, Feb 23, 2011 at 02:59:04PM +0800, Sheng Yang wrote: > > > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > > > Then we can support mask bit operation of assigned devices now. > > > > > > > > > > > > Looks pretty good overall. A few comments below. It seems like we > > > > > > should be able to hook this into vfio with a small stub in kvm. We > > > > > > just need to be able to communicate disabling and enabling of > > > > > > individual msix vectors. For brute force, we could do this with a > > > > > > lot of eventfds, triggered by kvm and consumed by vfio, two per > > > > > > MSI-X vector. Not sure if there's something smaller that could do > > > > > > it. Thanks, > > > > > > > > > > Alex, thanks for your comments. See my comments below: > > > > > > Alex > > > > > > > > > > > > > Signed-off-by: Sheng Yang <sheng@linux.intel.com> > > > > > > > --- > > > > > > > > > > > > > > arch/x86/kvm/Makefile | 2 +- > > > > > > > arch/x86/kvm/x86.c | 8 +- > > > > > > > include/linux/kvm.h | 21 ++++ > > > > > > > include/linux/kvm_host.h | 25 ++++ > > > > > > > virt/kvm/assigned-dev.c | 44 +++++++ > > > > > > > virt/kvm/kvm_main.c | 38 ++++++- > > > > > > > virt/kvm/msix_mmio.c | 286 > > > > > > > ++++++++++++++++++++++++++++++++++++++++++++++ > > > > > > > virt/kvm/msix_mmio.h > > > > > > > > > > > > > > | 25 ++++ > > > > > > > > > > > > > > 8 files changed, 442 insertions(+), 7 deletions(-) > > > > > > > create mode 100644 virt/kvm/msix_mmio.c > > > > > > > create mode 100644 virt/kvm/msix_mmio.h > > > > > > > > > > > > > > diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile > > > > > > > index f15501f..3a0d851 100644 > > > > > > > --- a/arch/x86/kvm/Makefile > > > > > > > +++ b/arch/x86/kvm/Makefile > > > > > > > @@ -7,7 +7,7 @@ CFLAGS_vmx.o := -I. 
> > > > > > > > > > > > > > kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o > ioapic.o > > > > > > \ > > > > > > > > > > coalesced_mmio.o irq_comm.o eventfd.o \ > > > > > > > > > > > > > > - assigned-dev.o) > > > > > > > + assigned-dev.o msix_mmio.o) > > > > > > > > > > > > > > kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, > > > > > > > iommu.o) kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix > > > > > > > ../../../virt/kvm/, async_pf.o) > > > > > > > > > > > > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > > > > > index fa708c9..89bf12c 100644 > > > > > > > --- a/arch/x86/kvm/x86.c > > > > > > > +++ b/arch/x86/kvm/x86.c > > > > > > > @@ -1966,6 +1966,7 @@ int kvm_dev_ioctl_check_extension(long ext) > > > > > > > > > > > > > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > > > > > > > case KVM_CAP_XSAVE: > > > > > > > > > > > > > > case KVM_CAP_ASYNC_PF: > > > > > > > + case KVM_CAP_MSIX_MMIO: > > > > > > > r = 1; > > > > > > > break; > > > > > > > > > > > > > > case KVM_CAP_COALESCED_MMIO: > > > > > > > @@ -3807,6 +3808,7 @@ static int > > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > > > struct kvm_vcpu *vcpu) > > > > > > > > > > > > > > { > > > > > > > > > > > > > > gpa_t gpa; > > > > > > > > > > > > > > + int r; > > > > > > > > > > > > > > gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); > > > > > > > > > > > > > > @@ -3822,14 +3824,16 @@ static int > > > > > > > emulator_write_emulated_onepage(unsigned long addr, > > > > > > > > > > > > > > mmio: > > > > > > > trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); > > > > > > > > > > > > > > + r = vcpu_mmio_write(vcpu, gpa, bytes, val); > > > > > > > > > > > > > > /* > > > > > > > > > > > > > > * Is this MMIO handled locally? > > > > > > > */ > > > > > > > > > > > > > > - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) > > > > > > > + if (!r) > > > > > > > > > > > > > > return X86EMUL_CONTINUE; > > > > > > > > > > > > > > vcpu->mmio_needed = 1; > > > > > > > > > > > > > > - vcpu->run->exit_reason = KVM_EXIT_MMIO; > > > > > > > + vcpu->run->exit_reason = (r == -ENOTSYNC) ? > > > > > > > + KVM_EXIT_MSIX_ROUTING_UPDATE : KVM_EXIT_MMIO; > > > > > > > > This use of -ENOTSYNC is IMO confusing. > > > > How about we make vcpu_mmio_write return the positive exit reason? > > > > Negative value will mean an error. > > > > > > In fact currently nagative value means something more need to be done, > > > the same as MMIO exit. > > > > So it would be > > if (!r) > > return X86EMUL_CONTINUE; > > > > vcpu->run->exit_reason = r; > > > > > Now I think we can keep it, or update them all later. > > > > The way to do this would be > > 1. patch to return KVM_EXIT_MMIO on mmio > > 2. your patch that returns KVM_EXIT_MSIX_ROUTING_UPDATE on top > > It's not that straightforward. In most condition, the reason vcpu_mmio_write() < 0 > because KVM itself unable to complete the request. That's quite straightforward. > But each handler in the chain can't decided it would be "KVM_EXIT_MMIO", they can > only know when all of them fail to handle the accessing. > Well, this just violates the standard API for return value. The standard semantics are: - negative - error - positive - not an error So you can just return a positive KVM_EXIT_MSIX_ROUTING_UPDATE instead of ENOTSYNC. This way you do not need to rework all code and you don't spread MSIX all over kvm. You just do: (r > 0) ? r : KVM_EXIT_MMIO > I am not sure if we like every single handler said "I want KVM_EXIT_MMIO" instead > of a error return. 
> We can discuss more on this, but since it's not API/ABI change, > I think we can get the patch in first. > -- > regards > Yang, Sheng > > > > > > -- > > > regards > > > Yang, Sheng ^ permalink raw reply [flat|nested] 20+ messages in thread
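A minimal sketch of the convention Michael is arguing for here, assuming handlers return 0 for "handled", a positive KVM_EXIT_* code for "handled but an exit is required", and a negative value only for real errors:

    r = vcpu_mmio_write(vcpu, gpa, bytes, val);
    if (!r)
            return X86EMUL_CONTINUE;

    vcpu->mmio_needed = 1;
    /* e.g. KVM_EXIT_MSIX_ROUTING_UPDATE straight from the MSI-X handler;
     * fall back to KVM_EXIT_MMIO when no in-kernel device claimed it. */
    vcpu->run->exit_reason = (r > 0) ? r : KVM_EXIT_MMIO;

This would keep the MSI-X-specific exit reason out of the generic emulator path, which is the "don't spread MSIX all over kvm" concern above.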
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 6:59 ` Sheng Yang 2011-02-23 8:45 ` Michael S. Tsirkin @ 2011-02-23 16:34 ` Alex Williamson 2011-02-23 18:39 ` Michael S. Tsirkin 1 sibling, 1 reply; 20+ messages in thread From: Alex Williamson @ 2011-02-23 16:34 UTC (permalink / raw) To: Sheng Yang; +Cc: Avi Kivity, Marcelo Tosatti, Michael S. Tsirkin, kvm On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote: > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, > > > int len, + void *val) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > + struct kvm_msix_mmio *mmio; > > > + int idx, ret = 0, entry, offset, r; > > > + > > > + mutex_lock(&mmio_dev->lock); > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > + if (idx < 0) { > > > + ret = -EOPNOTSUPP; > > > + goto out; > > > + } > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > + goto out; > > > + > > > + offset = addr & 0xf; > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > + goto out; > > > + > > > + mmio = &mmio_dev->mmio[idx]; > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > + if (r) > > > + goto out; > > > +out: > > > + mutex_unlock(&mmio_dev->lock); > > > + return ret; > > > +} > > > + > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > > > + int len, const void *val) > > > +{ > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > + struct kvm_msix_mmio *mmio; > > > + int idx, entry, offset, ret = 0, r = 0; > > > + gpa_t entry_base; > > > + u32 old_ctrl, new_ctrl; > > > + u32 *ctrl_pos; > > > + > > > + mutex_lock(&mmio_dev->kvm->lock); > > > + mutex_lock(&mmio_dev->lock); > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > + if (idx < 0) { > > > + ret = -EOPNOTSUPP; > > > + goto out; > > > + } > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > + goto out; > > > + > > > + offset = addr & 0xF; > > > > nit, this 0xf above. > > So you like another mask? I'm just noting that above you used 0xf and here you used 0xF. > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > + goto out; > > > + > > > + mmio = &mmio_dev->mmio[idx]; > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > + > > > + if (get_user(old_ctrl, ctrl_pos)) > > > + goto out; > > > + > > > + /* No allow writing to other fields when entry is unmasked */ > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > + goto out; > > > + > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > > + goto out; > > > + > > > + if (get_user(new_ctrl, ctrl_pos)) > > > + goto out; > > > + > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > + ret = -ENOTSYNC; > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > first? 
I'd also prefer to see a direct goto out here since it's not > > entirely obvious that this first 'if' always falls into the below 'if'. > > I'm not a fan of using an error code as a special non-error return > > either. > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to > > see if they indeed modified ctrl bits. > > > > I would try to make the logic here more clear. Isn't that what you're testing for though? (offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) If this is true, we can only be writing address, upper address, or data, not control. (offset < PCI_MSIX_ENTRY_DATA && len == 8) If this is true, we're writing address and upper address, not the control vector. Additionally, I think the size and alignment test should be more restrictive. We really only want to allow naturally aligned 4 or 8 byte accesses. Maybe instead of: if ((addr & 0x3) || (len != 4 && len != 8)) we should do this: if (!(len == 4 || len == 8) || addr & (len - 1)) Then the test could become: if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) Thanks, Alex ^ permalink raw reply [flat|nested] 20+ messages in thread
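To see exactly which accesses Alex's two tests admit, a small standalone program can enumerate them (a hypothetical check harness, not part of the series; the offsets follow the PCI spec layout: address 0, upper address 4, data 8, vector control 12, 16 bytes per entry):

    #include <stdio.h>

    #define PCI_MSIX_ENTRY_SIZE         16
    #define PCI_MSIX_ENTRY_VECTOR_CTRL  12

    int main(void)
    {
            int len, addr;

            for (len = 1; len <= 8; len++)
                    for (addr = 0; addr < PCI_MSIX_ENTRY_SIZE; addr++) {
                            /* Alex's size/alignment test */
                            if (!(len == 4 || len == 8) || (addr & (len - 1)))
                                    continue;
                            /* Alex's "entirely below Vector Control" test */
                            printf("len=%d offset=%2d needs_routing_update=%d\n",
                                   len, addr,
                                   addr + len <= PCI_MSIX_ENTRY_VECTOR_CTRL);
                    }
            return 0;
    }

Only len=4 at offsets 0/4/8/12 and len=8 at offsets 0/8 pass the filter, and of those only (offset 12, len 4) and (offset 8, len 8) reach the Vector Control word, matching the case analysis above.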
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 16:34 ` Alex Williamson @ 2011-02-23 18:39 ` Michael S. Tsirkin 2011-02-23 19:02 ` Alex Williamson 0 siblings, 1 reply; 20+ messages in thread From: Michael S. Tsirkin @ 2011-02-23 18:39 UTC (permalink / raw) To: Alex Williamson; +Cc: Sheng Yang, Avi Kivity, Marcelo Tosatti, kvm On Wed, Feb 23, 2011 at 09:34:19AM -0700, Alex Williamson wrote: > On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote: > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, > > > > int len, + void *val) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > + struct kvm_msix_mmio *mmio; > > > > + int idx, ret = 0, entry, offset, r; > > > > + > > > > + mutex_lock(&mmio_dev->lock); > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > + if (idx < 0) { > > > > + ret = -EOPNOTSUPP; > > > > + goto out; > > > > + } > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > + goto out; > > > > + > > > > + offset = addr & 0xf; > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > + goto out; > > > > + > > > > + mmio = &mmio_dev->mmio[idx]; > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > > + if (r) > > > > + goto out; > > > > +out: > > > > + mutex_unlock(&mmio_dev->lock); > > > > + return ret; > > > > +} > > > > + > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > > > > + int len, const void *val) > > > > +{ > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > + struct kvm_msix_mmio *mmio; > > > > + int idx, entry, offset, ret = 0, r = 0; > > > > + gpa_t entry_base; > > > > + u32 old_ctrl, new_ctrl; > > > > + u32 *ctrl_pos; > > > > + > > > > + mutex_lock(&mmio_dev->kvm->lock); > > > > + mutex_lock(&mmio_dev->lock); > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > + if (idx < 0) { > > > > + ret = -EOPNOTSUPP; > > > > + goto out; > > > > + } > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > + goto out; > > > > + > > > > + offset = addr & 0xF; > > > > > > nit, this 0xf above. > > > > So you like another mask? > > I'm just noting that above you used 0xf and here you used 0xF. 
> > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > + goto out; > > > > + > > > > + mmio = &mmio_dev->mmio[idx]; > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > > + > > > > + if (get_user(old_ctrl, ctrl_pos)) > > > > + goto out; > > > > + > > > > + /* No allow writing to other fields when entry is unmasked */ > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > + goto out; > > > > + > > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > > > + goto out; > > > > + > > > > + if (get_user(new_ctrl, ctrl_pos)) > > > > + goto out; > > > > + > > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > > + ret = -ENOTSYNC; > > > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > > first? I'd also prefer to see a direct goto out here since it's not > > > entirely obvious that this first 'if' always falls into the below 'if'. > > > I'm not a fan of using an error code as a special non-error return > > > either. > > > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to > > see if they indeed modified ctrl bits. > > > > I would try to make the logic here more clear. > > Isn't that what you're testing for though? > > (offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) > > If this is true, we can only be writing address, upper address, or data, > not control. > > (offset < PCI_MSIX_ENTRY_DATA && len == 8) > > If this is true, we're writing address and upper address, not the > control vector. > > Additionally, I think the size an alignment test should be more > restrictive. We really only want to allow naturally aligned 4 or 8 byte > accesses. Maybe instead of: > > if ((addr & 0x3) || (len != 4 && len != 8)) > > we should do this: > > if (!(len == 4 || len == 8) || addr & (len - 1)) > > Then the test could become: > > if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) > > Thanks, > > Alex Or rather if (offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4) we are matching offset + length to control offset + control length, and avoid depending on the fact control is the last register. -- MST ^ permalink raw reply [flat|nested] 20+ messages in thread
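For completeness: on every access that survives the natural-alignment filter, Michael's predicate and Alex's give the same answer, precisely because Vector Control is the last DWORD of a 16-byte entry. A quick standalone check (illustrative only, not part of the series):

    #include <assert.h>

    #define PCI_MSIX_ENTRY_VECTOR_CTRL  12

    int main(void)
    {
            int len, offset;

            for (len = 4; len <= 8; len += 4)
                    for (offset = 0; offset < 16; offset += len) {
                            int below = offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL;
                            int not_ctrl = offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4;

                            assert(below == not_ctrl); /* holds for all six cases */
                    }
            return 0;
    }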
* Re: [PATCH 2/3] KVM: Emulate MSI-X table in kernel 2011-02-23 18:39 ` Michael S. Tsirkin @ 2011-02-23 19:02 ` Alex Williamson 0 siblings, 0 replies; 20+ messages in thread From: Alex Williamson @ 2011-02-23 19:02 UTC (permalink / raw) To: Michael S. Tsirkin; +Cc: Sheng Yang, Avi Kivity, Marcelo Tosatti, kvm On Wed, 2011-02-23 at 20:39 +0200, Michael S. Tsirkin wrote: > On Wed, Feb 23, 2011 at 09:34:19AM -0700, Alex Williamson wrote: > > On Wed, 2011-02-23 at 14:59 +0800, Sheng Yang wrote: > > > On Wednesday 23 February 2011 08:19:21 Alex Williamson wrote: > > > > On Sun, 2011-01-30 at 13:11 +0800, Sheng Yang wrote: > > > > > +static int msix_table_mmio_read(struct kvm_io_device *this, gpa_t addr, > > > > > int len, + void *val) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > + struct kvm_msix_mmio *mmio; > > > > > + int idx, ret = 0, entry, offset, r; > > > > > + > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > + if (idx < 0) { > > > > > + ret = -EOPNOTSUPP; > > > > > + goto out; > > > > > + } > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > + goto out; > > > > > + > > > > > + offset = addr & 0xf; > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > + goto out; > > > > > + > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > + r = copy_from_user(val, (void __user *)(mmio->table_base_va + > > > > > + entry * PCI_MSIX_ENTRY_SIZE + offset), len); > > > > > + if (r) > > > > > + goto out; > > > > > +out: > > > > > + mutex_unlock(&mmio_dev->lock); > > > > > + return ret; > > > > > +} > > > > > + > > > > > +static int msix_table_mmio_write(struct kvm_io_device *this, gpa_t addr, > > > > > + int len, const void *val) > > > > > +{ > > > > > + struct kvm_msix_mmio_dev *mmio_dev = > > > > > + container_of(this, struct kvm_msix_mmio_dev, table_dev); > > > > > + struct kvm_msix_mmio *mmio; > > > > > + int idx, entry, offset, ret = 0, r = 0; > > > > > + gpa_t entry_base; > > > > > + u32 old_ctrl, new_ctrl; > > > > > + u32 *ctrl_pos; > > > > > + > > > > > + mutex_lock(&mmio_dev->kvm->lock); > > > > > + mutex_lock(&mmio_dev->lock); > > > > > + idx = get_mmio_table_index(mmio_dev, addr, len); > > > > > + if (idx < 0) { > > > > > + ret = -EOPNOTSUPP; > > > > > + goto out; > > > > > + } > > > > > + if ((addr & 0x3) || (len != 4 && len != 8)) > > > > > + goto out; > > > > > + > > > > > + offset = addr & 0xF; > > > > > > > > nit, this 0xf above. > > > > > > So you like another mask? > > > > I'm just noting that above you used 0xf and here you used 0xF. 
> > > > > > > + if (offset == PCI_MSIX_ENTRY_VECTOR_CTRL && len == 8) > > > > > + goto out; > > > > > + > > > > > + mmio = &mmio_dev->mmio[idx]; > > > > > + entry = (addr - mmio->table_base_addr) / PCI_MSIX_ENTRY_SIZE; > > > > > + entry_base = mmio->table_base_va + entry * PCI_MSIX_ENTRY_SIZE; > > > > > + ctrl_pos = (u32 *)(entry_base + PCI_MSIX_ENTRY_VECTOR_CTRL); > > > > > + > > > > > + if (get_user(old_ctrl, ctrl_pos)) > > > > > + goto out; > > > > > + > > > > > + /* No allow writing to other fields when entry is unmasked */ > > > > > + if (!(old_ctrl & PCI_MSIX_ENTRY_CTRL_MASKBIT) && > > > > > + offset != PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > > + goto out; > > > > > + > > > > > + if (copy_to_user((void __user *)(entry_base + offset), val, len)) > > > > > + goto out; > > > > > + > > > > > + if (get_user(new_ctrl, ctrl_pos)) > > > > > + goto out; > > > > > + > > > > > + if ((offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) || > > > > > + (offset < PCI_MSIX_ENTRY_DATA && len == 8)) > > > > > + ret = -ENOTSYNC; > > > > > > > > Couldn't we avoid the above get_user(new_ctrl,...) if we tested this > > > > first? I'd also prefer to see a direct goto out here since it's not > > > > entirely obvious that this first 'if' always falls into the below 'if'. > > > > I'm not a fan of using an error code as a special non-error return > > > > either. > > > > > > In fact when offset==PCI_MSIX_ENTRY_DATA and len ==8, we need to check new_ctrl to > > > see if they indeed modified ctrl bits. > > > > > > I would try to make the logic here more clear. > > > > Isn't that what you're testing for though? > > > > (offset < PCI_MSIX_ENTRY_VECTOR_CTRL && len == 4) > > > > If this is true, we can only be writing address, upper address, or data, > > not control. > > > > (offset < PCI_MSIX_ENTRY_DATA && len == 8) > > > > If this is true, we're writing address and upper address, not the > > control vector. > > > > Additionally, I think the size an alignment test should be more > > restrictive. We really only want to allow naturally aligned 4 or 8 byte > > accesses. Maybe instead of: > > > > if ((addr & 0x3) || (len != 4 && len != 8)) > > > > we should do this: > > > > if (!(len == 4 || len == 8) || addr & (len - 1)) > > > > Then the test could become: > > > > if (offset + len <= PCI_MSIX_ENTRY_VECTOR_CTRL) > > > > Thanks, > > > > Alex > > Or rather > > if (offset + len != PCI_MSIX_ENTRY_VECTOR_CTRL + 4) > > we are matching offset + length > to control offset + control length, > and avoid depending on the fact control is the last register. That's just silly. We're writing code to match a hardware specification. The registers always land at the same place, are always the same size and always have the alignment requirements dictated in the spec. I see no problem depending on that. They both do the same thing, I prefer my version. Alex ^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH 3/3] KVM: Add documents for MSI-X MMIO API 2011-01-30 5:11 [PATCH 0/3 v8] MSI-X MMIO support for KVM Sheng Yang 2011-01-30 5:11 ` [PATCH 1/3] KVM: Move struct kvm_io_device to kvm_host.h Sheng Yang 2011-01-30 5:11 ` [PATCH 2/3] KVM: Emulate MSI-X table in kernel Sheng Yang @ 2011-01-30 5:11 ` Sheng Yang 2 siblings, 0 replies; 20+ messages in thread From: Sheng Yang @ 2011-01-30 5:11 UTC (permalink / raw) To: Avi Kivity, Marcelo Tosatti; +Cc: Michael S. Tsirkin, kvm, Sheng Yang Signed-off-by: Sheng Yang <sheng@linux.intel.com> --- Documentation/kvm/api.txt | 47 +++++++++++++++++++++++++++++++++++++++++++++ 1 files changed, 47 insertions(+), 0 deletions(-) diff --git a/Documentation/kvm/api.txt b/Documentation/kvm/api.txt index e1a9297..e6b7a1d 100644 --- a/Documentation/kvm/api.txt +++ b/Documentation/kvm/api.txt @@ -1263,6 +1263,53 @@ struct kvm_assigned_msix_entry { __u16 padding[3]; }; +4.54 KVM_REGISTER_MSIX_MMIO + +Capability: KVM_CAP_MSIX_MMIO +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_msix_mmio_user (in) +Returns: 0 on success, -1 on error + +This API registers the MSI-X MMIO address of a guest device. All MMIO +operations on it would then be handled by the kernel. When necessary (e.g. when the +MSI data/address changed), KVM would exit to userspace with KVM_EXIT_MSIX_ROUTING_UPDATE to +indicate the MMIO modification and require userspace to update the IRQ routing +table. + +NOTICE: Writing the MSI-X MMIO page after it was registered with this API may +be dangerous for the userspace program. Writing it while the VM is running may result +in synchronization issues, so the assigned device can't work properly. +Writing is allowed when the VM is not running, and can be used as a save/restore +mechanism. + +struct kvm_msix_mmio_user { + __u32 dev_id; + __u16 type; /* Device type and MMIO address type */ + __u16 max_entries_nr; /* Maximum entries supported */ + __u64 base_addr; /* Guest physical address of MMIO */ + __u64 base_va; /* Host virtual address of MMIO mapping */ + __u64 flags; /* Reserved for now */ + __u64 reserved[4]; +}; + +Current device type can be: +#define KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV (1 << 0) + +Current MMIO type can be: +#define KVM_MSIX_MMIO_TYPE_BASE_TABLE (1 << 8) + +4.55 KVM_UNREGISTER_MSIX_MMIO + +Capability: KVM_CAP_MSIX_MMIO +Architectures: x86 +Type: vm ioctl +Parameters: struct kvm_msix_mmio_user (in) +Returns: 0 on success, -1 on error + +This API would unregister the specific MSI-X MMIO, indicated by the dev_id and +type fields of struct kvm_msix_mmio_user. + 5. The kvm_run structure Application code obtains a pointer to the kvm_run structure by -- 1.7.0.1 ^ permalink raw reply related [flat|nested] 20+ messages in thread
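For readers trying to picture the proposed API in use, here is a rough userspace sketch (hypothetical usage based only on the documentation above; struct kvm_msix_mmio_user and KVM_REGISTER_MSIX_MMIO exist only with this series applied, and the device id and addresses are placeholders):

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>  /* headers with this series applied */

    static int register_msix_table(int vm_fd, uint32_t dev_id, uint16_t entries,
                                   uint64_t table_gpa, void *table_hva)
    {
            struct kvm_msix_mmio_user mmio;

            memset(&mmio, 0, sizeof(mmio));
            mmio.dev_id = dev_id;
            /* assigned device, MSI-X table MMIO */
            mmio.type = KVM_MSIX_MMIO_TYPE_ASSIGNED_DEV |
                        KVM_MSIX_MMIO_TYPE_BASE_TABLE;
            mmio.max_entries_nr = entries;          /* <= KVM_MAX_MSIX_PER_DEV */
            mmio.base_addr = table_gpa;             /* guest physical address */
            mmio.base_va = (uintptr_t)table_hva;    /* host virtual mapping */
            /* mmio.flags is reserved and must stay zero */

            return ioctl(vm_fd, KVM_REGISTER_MSIX_MMIO, &mmio);
    }

On a KVM_EXIT_MSIX_ROUTING_UPDATE exit, userspace would re-read the affected entry through table_hva and refresh its IRQ routing before re-entering the guest; KVM_UNREGISTER_MSIX_MMIO with the same dev_id and type tears the registration down.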