From mboxrd@z Thu Jan 1 00:00:00 1970 From: Sheng Yang Subject: Re: [PATCH 6/7] KVM: assigned dev: MSI-X mask support Date: Fri, 12 Nov 2010 18:13:48 +0800 Message-ID: <201011121813.49140.sheng@linux.intel.com> References: <1289461620-7055-1-git-send-email-sheng@linux.intel.com> <1289461620-7055-7-git-send-email-sheng@linux.intel.com> <20101112095312.GA11354@redhat.com> Mime-Version: 1.0 Content-Type: Text/Plain; charset=utf-8 Content-Transfer-Encoding: QUOTED-PRINTABLE Cc: Avi Kivity , Marcelo Tosatti , kvm@vger.kernel.org To: "Michael S. Tsirkin" Return-path: Received: from mga02.intel.com ([134.134.136.20]:18131 "EHLO mga02.intel.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752465Ab0KLKNe convert rfc822-to-8bit (ORCPT ); Fri, 12 Nov 2010 05:13:34 -0500 In-Reply-To: <20101112095312.GA11354@redhat.com> Sender: kvm-owner@vger.kernel.org List-ID: On Friday 12 November 2010 17:53:13 Michael S. Tsirkin wrote: > On Thu, Nov 11, 2010 at 03:46:59PM +0800, Sheng Yang wrote: > > This patch enable per-vector mask for assigned devices using MSI-X. > >=20 > > This patch provided two new APIs: one is for guest to specific devi= ce's > > MSI-X table address in MMIO, the other is for userspace to get > > information about mask bit. > >=20 > > All the mask bit operation are kept in kernel, in order to accelera= te. > > Userspace shouldn't access the device MMIO directly for the informa= tion, > > instead it should uses provided API to do so. 
> >=20 > > Signed-off-by: Sheng Yang > > --- > >=20 > > arch/x86/kvm/x86.c | 1 + > > include/linux/kvm.h | 32 +++++ > > include/linux/kvm_host.h | 5 + > > virt/kvm/assigned-dev.c | 316 > > +++++++++++++++++++++++++++++++++++++++++++++- 4 files changed,=20 353 > > insertions(+), 1 deletions(-) > >=20 > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index f3f86b2..847f1e1 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -1926,6 +1926,7 @@ int kvm_dev_ioctl_check_extension(long ext) > >=20 > > case KVM_CAP_DEBUGREGS: > > case KVM_CAP_X86_ROBUST_SINGLESTEP: > >=20 > > case KVM_CAP_XSAVE: > > + case KVM_CAP_MSIX_MASK: > > r =3D 1; > > break; > > =09 > > case KVM_CAP_COALESCED_MMIO: > > diff --git a/include/linux/kvm.h b/include/linux/kvm.h > > index 919ae53..32cd244 100644 > > --- a/include/linux/kvm.h > > +++ b/include/linux/kvm.h > > @@ -540,6 +540,9 @@ struct kvm_ppc_pvinfo { > >=20 > > #endif > > #define KVM_CAP_PPC_GET_PVINFO 57 > > #define KVM_CAP_PPC_IRQ_LEVEL 58 > >=20 > > +#ifdef __KVM_HAVE_MSIX > > +#define KVM_CAP_MSIX_MASK 59 > > +#endif > >=20 > > #ifdef KVM_CAP_IRQ_ROUTING > >=20 > > @@ -671,6 +674,9 @@ struct kvm_clock_data { > >=20 > > #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct > > kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO,=20 > > 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK =20 > > _IOR(KVMIO, 0x7c, struct kvm_clock_data) > >=20 > > +/* Available with KVM_CAP_MSIX_MASK */ > > +#define KVM_GET_MSIX_ENTRY _IOWR(KVMIO, 0x7d, struct > > kvm_msix_entry) +#define KVM_UPDATE_MSIX_MMIO _IOW(KVMIO, 0x7= e, > > struct kvm_msix_mmio) > >=20 > > /* Available with KVM_CAP_PIT_STATE2 */ > > #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct > > kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa= 0, > > struct kvm_pit_state2) > >=20 > > @@ -794,4 +800,30 @@ struct kvm_assigned_msix_entry { > >=20 > > __u16 padding[3]; > > =20 > > }; > >=20 > > +#define KVM_MSIX_TYPE_ASSIGNED_DEV 1 > > + > > +#define 
KVM_MSIX_FLAG_MASKBIT (1 << 0) > > +#define KVM_MSIX_FLAG_QUERY_MASKBIT (1 << 0) > > + > > +struct kvm_msix_entry { > > + __u32 id; > > + __u32 type; > > + __u32 entry; /* The index of entry in the MSI-X table */ > > + __u32 flags; > > + __u32 query_flags; > > + __u32 reserved[5]; > > +}; > > + > > +#define KVM_MSIX_MMIO_FLAG_REGISTER (1 << 0) > > +#define KVM_MSIX_MMIO_FLAG_UNREGISTER (1 << 1) > > + > > +struct kvm_msix_mmio { > > + __u32 id; > > + __u32 type; > > + __u64 base_addr; > > + __u32 max_entries_nr; > > + __u32 flags; > > + __u32 reserved[6]; > > +}; > > + > >=20 > > #endif /* __LINUX_KVM_H */ > >=20 > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h > > index 4b31539..9d49074 100644 > > --- a/include/linux/kvm_host.h > > +++ b/include/linux/kvm_host.h > > @@ -445,6 +445,7 @@ struct kvm_guest_msix_entry { > >=20 > > }; > > =20 > > #define KVM_ASSIGNED_ENABLED_IOMMU (1 << 0) > >=20 > > +#define KVM_ASSIGNED_ENABLED_MSIX_MMIO (1 << 1) > >=20 > > struct kvm_assigned_dev_kernel { > > =20 > > struct kvm_irq_ack_notifier ack_notifier; > > struct work_struct interrupt_work; > >=20 > > @@ -465,6 +466,10 @@ struct kvm_assigned_dev_kernel { > >=20 > > struct pci_dev *dev; > > struct kvm *kvm; > > spinlock_t assigned_dev_lock; > >=20 > > + DECLARE_BITMAP(msix_mask_bitmap, KVM_MAX_MSIX_PER_DEV); > > + gpa_t msix_mmio_base; > > + struct kvm_io_device msix_mmio_dev; > > + int msix_max_entries_nr; > >=20 > > }; > > =20 > > struct kvm_irq_mask_notifier { > >=20 > > diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c > > index 5c6b96d..3010d7d 100644 > > --- a/virt/kvm/assigned-dev.c > > +++ b/virt/kvm/assigned-dev.c > > @@ -226,12 +226,27 @@ static void kvm_free_assigned_irq(struct kvm = *kvm, > >=20 > > kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_t= ype); > > =20 > > } > >=20 > > +static void unregister_msix_mmio(struct kvm *kvm, > > + struct kvm_assigned_dev_kernel *adev) > > +{ > > + if (adev->flags & 
KVM_ASSIGNED_ENABLED_MSIX_MMIO) { > > + mutex_lock(&kvm->slots_lock); > > + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, > > + &adev->msix_mmio_dev); > > + mutex_unlock(&kvm->slots_lock); > > + adev->flags &=3D ~KVM_ASSIGNED_ENABLED_MSIX_MMIO; > > + } > > +} > > + > >=20 > > static void kvm_free_assigned_device(struct kvm *kvm, > > =20 > > struct kvm_assigned_dev_kernel > > *assigned_dev) > > =20 > > { > > =20 > > kvm_free_assigned_irq(kvm, assigned_dev); > >=20 > > +#ifdef __KVM_HAVE_MSIX > > + unregister_msix_mmio(kvm, assigned_dev); > > +#endif > >=20 > > pci_reset_function(assigned_dev->dev); > > =09 > > pci_release_regions(assigned_dev->dev); > >=20 > > @@ -504,7 +519,7 @@ out: > > static int kvm_vm_ioctl_assign_device(struct kvm *kvm, > > =20 > > struct kvm_assigned_pci_dev *assigned_dev) > > =20 > > { > >=20 > > - int r =3D 0, idx; > > + int r =3D 0, idx, i; > >=20 > > struct kvm_assigned_dev_kernel *match; > > struct pci_dev *dev; > >=20 > > @@ -564,6 +579,10 @@ static int kvm_vm_ioctl_assign_device(struct k= vm > > *kvm, > >=20 > > list_add(&match->list, &kvm->arch.assigned_dev_head); > >=20 > > + /* The state after reset of MSI-X table is all masked */ > > + for (i =3D 0; i < KVM_MAX_MSIX_PER_DEV; i++) > > + set_bit(i, match->msix_mask_bitmap); > > + > >=20 > > if (assigned_dev->flags & KVM_ASSIGNED_ENABLED_IOMMU) { > > =09 > > if (!kvm->arch.iommu_domain) { > > =09 > > r =3D kvm_iommu_map_guest(kvm); > >=20 > > @@ -667,6 +686,43 @@ msix_nr_out: > > return r; > > =20 > > } > >=20 > > +static void update_msix_mask(struct kvm_assigned_dev_kernel *adev, > > + int idx, bool new_mask_flag) > > +{ > > + int irq; > > + bool old_mask_flag, need_flush =3D false; > > + > > + spin_lock_irq(&adev->assigned_dev_lock); > > + > > + if (!adev->dev->msix_enabled || > > + !(adev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX)) > > + goto out; > > + > > + old_mask_flag =3D test_bit(adev->guest_msix_entries[idx].entry, > > + adev->msix_mask_bitmap); > > + if (old_mask_flag 
=3D=3D new_mask_flag) > > + goto out; > > + > > + irq =3D adev->host_msix_entries[idx].vector; > > + BUG_ON(irq =3D=3D 0); > > + > > + if (new_mask_flag) { > > + set_bit(adev->guest_msix_entries[idx].entry, > > + adev->msix_mask_bitmap); > > + disable_irq_nosync(irq); > > + need_flush =3D true; > > + } else { > > + clear_bit(adev->guest_msix_entries[idx].entry, > > + adev->msix_mask_bitmap); > > + enable_irq(irq); > > + } > > +out: > > + spin_unlock_irq(&adev->assigned_dev_lock); > > + > > + if (need_flush) > > + flush_work(&adev->interrupt_work); > > +} > > + > >=20 > > static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, > > =20 > > struct kvm_assigned_msix_entry *entry) > > =20 > > { > >=20 > > @@ -701,6 +757,233 @@ msix_entry_out: > > return r; > > =20 > > } > >=20 > > + > > +static int kvm_vm_ioctl_get_msix_entry(struct kvm *kvm, > > + struct kvm_msix_entry *entry) > > +{ > > + int r =3D 0; > > + struct kvm_assigned_dev_kernel *adev; > > + > > + if (entry->type !=3D KVM_MSIX_TYPE_ASSIGNED_DEV) > > + return -EINVAL; > > + > > + if (!entry->query_flags) > > + return -EINVAL; > > + > > + mutex_lock(&kvm->lock); > > + > > + adev =3D kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > + entry->id); > > + > > + if (!adev) { > > + r =3D -EINVAL; > > + goto out; > > + } > > + > > + if (entry->entry >=3D adev->msix_max_entries_nr) { > > + r =3D -ENOSPC; > > + goto out; > > + } > > + > > + if (entry->query_flags & KVM_MSIX_FLAG_QUERY_MASKBIT) { > > + if (test_bit(entry->entry, adev->msix_mask_bitmap)) > > + entry->flags |=3D KVM_MSIX_FLAG_MASKBIT; > > + else > > + entry->flags &=3D ~KVM_MSIX_FLAG_MASKBIT; > > + } > > + > > +out: > > + mutex_unlock(&kvm->lock); > > + > > + return r; > > +} > > + > > +static bool msix_mmio_in_range(struct kvm_assigned_dev_kernel *ade= v, > > + gpa_t addr, int len) > > +{ > > + gpa_t start, end; > > + > > + BUG_ON(adev->msix_mmio_base =3D=3D 0); >=20 > I thought we wanted to use a separate flag for this? 
Oh, the flag was added, but I forgot to update this one. >=20 > > + start =3D adev->msix_mmio_base; > > + end =3D adev->msix_mmio_base + PCI_MSIX_ENTRY_SIZE * > > + adev->msix_max_entries_nr; > > + if (addr >=3D start && addr + len <=3D end) > > + return true; > > + > > + return false; > > +} > > + > > +static int msix_get_enabled_idx(struct kvm_assigned_dev_kernel *ad= ev, > > + gpa_t addr, int len) > > +{ > > + int i, index =3D (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_S= IZE; > > + > > + for (i =3D 0; i < adev->entries_nr; i++) > > + if (adev->guest_msix_entries[i].entry =3D=3D index) > > + return i; > > + > > + return -EINVAL; > > +} > > + >=20 > Hmm, we still have a linear scan on each write. Is the point to dete= ct > which entries need to be handled in kernel? Maybe another bitmap for > this? Or just handle whole entry write in kernel as well, then we wo= n't > need this at all. I still don't like handling the whole entry write in the kernel. In fact, we have two questions here: 1. Is the entry already enabled? 2. If it is, what is its sequence number? A bitmap can answer question 1, but we would still need an O(n) scan for the second. So I think leaving it like this should be fine.
>=20 > > +static int msix_mmio_read(struct kvm_io_device *this, gpa_t addr, = int > > len, + void *val) > > +{ > > + struct kvm_assigned_dev_kernel *adev =3D > > + container_of(this, struct kvm_assigned_dev_kernel, > > + msix_mmio_dev); > > + int idx, r =3D 0; > > + u32 entry[4]; > > + struct kvm_kernel_irq_routing_entry e; > > + > > + mutex_lock(&adev->kvm->lock); > > + if (!msix_mmio_in_range(adev, addr, len)) { > > + r =3D -EOPNOTSUPP; > > + goto out; > > + } > > + if ((addr & 0x3) || len !=3D 4) > > + goto out; > > + > > + idx =3D msix_get_enabled_idx(adev, addr, len); > > + if (idx < 0) { > > + idx =3D (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE; > > + if ((addr % PCI_MSIX_ENTRY_SIZE) =3D=3D > > + PCI_MSIX_ENTRY_VECTOR_CTRL) > > + *(unsigned long *)val =3D > > + test_bit(idx, adev->msix_mask_bitmap) ? > > + PCI_MSIX_ENTRY_CTRL_MASKBIT : 0; > > + else > > + r =3D -EOPNOTSUPP; > > + goto out; > > + } > > + > > + r =3D kvm_get_irq_routing_entry(adev->kvm, > > + adev->guest_msix_entries[idx].vector, &e); > > + if (r || e.type !=3D KVM_IRQ_ROUTING_MSI) { > > + r =3D -EOPNOTSUPP; > > + goto out; > > + } > > + entry[0] =3D e.msi.address_lo; > > + entry[1] =3D e.msi.address_hi; > > + entry[2] =3D e.msi.data; > > + entry[3] =3D test_bit(adev->guest_msix_entries[idx].entry, > > + adev->msix_mask_bitmap); > > + memcpy(val, &entry[addr % PCI_MSIX_ENTRY_SIZE / sizeof *entry], l= en); >=20 > It seems weird to pass writes to userspace but do reads > in kernel. > Either we should support entry read and write or neither. Entry reads are handled here as a speed-up, because the kernel writes to the mask bit and then tries to read it back in order to flush the write. I don't think reads matter much here, as long as userspace still owns the routing table.
=20 > > + > > +out: > > + mutex_unlock(&adev->kvm->lock); > > + return r; > > +} > > + > > +static int msix_mmio_write(struct kvm_io_device *this, gpa_t addr,= int > > len, + const void *val) > > +{ > >=20 > > + struct kvm_assigned_dev_kernel *adev =3D > > + container_of(this, struct kvm_assigned_dev_kernel, > > + msix_mmio_dev); > > + int idx, r =3D 0; > > + unsigned long new_val =3D *(unsigned long *)val; >=20 > Move this to after you did length and alignment checks, > it might not be safe here. >=20 > > + > > + mutex_lock(&adev->kvm->lock); > > + if (!msix_mmio_in_range(adev, addr, len)) { > > + r =3D -EOPNOTSUPP; > > + goto out; > > + } > > + if ((addr & 0x3) || len !=3D 4) >=20 > The lenght could be 8: >=20 > For all accesses to MSI-X Table and MSI-X PBA fields, software must u= se > aligned full DWORD or aligned full QWORD transactions; otherwise, the > result is undefined. >=20 > and >=20 > Software is permitted to fill in MSI-X Table entry DWORD fields > individually with DWORD writes, or software in certain cases is permi= tted > to fill in appropriate pairs of DWORDs with a single QWORD write. > Specifically, software is always permitted to fill in the Message Add= ress > and Message Upper Address fields with a single QWORD write. If a give= n > entry is currently masked (via its Mask bit or the Function Mask bit)= , > software is permitted to fill in the Message Data and Vector Control > fields with a single QWORD write, taking advantage of the fact the Me= ssage > Data field is guaranteed to become visible to hardware no later than = the > Vector Control field. 
However, if software wishes to mask a currently > unmasked entry (without setting the Function Mask bit), software must= set > the entry=E2=80=99s Mask bit using a DWORD write to the Vector Contro= l field, > since performing a QWORD write to the Message Data and Vector Control > fields might result in the Message Data field being modified before t= he > Mask bit in the Vector Control field becomes set. =20 I haven't seen any device use QWORD accesses. Also, QEMU can't handle QWORD MMIO either. So I guess we can leave it for later. > > + goto out; >=20 > Why don't we pass to userspace on error? > Will make it easy to debug ... OK, at least then we can know if the guest wants to access it using 8 bytes. >=20 > > + > > + idx =3D msix_get_enabled_idx(adev, addr, len); > > + if (idx < 0) { > > + idx =3D (addr - adev->msix_mmio_base) / PCI_MSIX_ENTRY_SIZE; > > + if (((addr % PCI_MSIX_ENTRY_SIZE) =3D=3D > > + PCI_MSIX_ENTRY_VECTOR_CTRL)) { > > + if (new_val & ~PCI_MSIX_ENTRY_CTRL_MASKBIT) > > + goto out; > > + if (new_val & PCI_MSIX_ENTRY_CTRL_MASKBIT) > > + set_bit(idx, adev->msix_mask_bitmap); > > + else > > + clear_bit(idx, adev->msix_mask_bitmap); > > + /* It's possible that we need re-enable MSI-X, so go > > + * back to userspace */ > > + } > > + /* Userspace would handle other MMIO writing */ > > + r =3D -EOPNOTSUPP; > > + goto out; > > + } > > + if (addr % PCI_MSIX_ENTRY_SIZE !=3D PCI_MSIX_ENTRY_VECTOR_CTRL) { > > + r =3D -EOPNOTSUPP; > > + goto out; > > + } > > + if (new_val & ~PCI_MSIX_ENTRY_CTRL_MASKBIT) > > + goto out; >=20 > So if the write touches any other bit, we ignore mask bit write > completely? Don't even pass to userspace? why? Because the other bits are reserved. Writing to them is illegal; the spec says the result is undefined.
-- regards Yang, Sheng >=20 > > + update_msix_mask(adev, idx, !!(new_val & PCI_MSIX_ENTRY_CTRL_MASK= BIT)); > > +out: > > + mutex_unlock(&adev->kvm->lock); > > + > > + return r; > > +} > > + > > +static const struct kvm_io_device_ops msix_mmio_ops =3D { > > + .read =3D msix_mmio_read, > > + .write =3D msix_mmio_write, > > +}; > > + > > +static int kvm_vm_ioctl_update_msix_mmio(struct kvm *kvm, > > + struct kvm_msix_mmio *msix_mmio) > > +{ > > + int r =3D 0; > > + struct kvm_assigned_dev_kernel *adev; > > + > > + if (msix_mmio->type !=3D KVM_MSIX_TYPE_ASSIGNED_DEV) > > + return -EINVAL; > > + > > + if (!msix_mmio->flags) > > + return -EINVAL; > > + > > + mutex_lock(&kvm->lock); > > + adev =3D kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, > > + msix_mmio->id); > > + if (!adev) { > > + r =3D -EINVAL; > > + goto out; > > + } > > + if (msix_mmio->base_addr =3D=3D 0) { > > + r =3D -EINVAL; > > + goto out; > > + } > > + if (msix_mmio->max_entries_nr =3D=3D 0 || > > + msix_mmio->max_entries_nr > KVM_MAX_MSIX_PER_DEV) { > > + r =3D -EINVAL; > > + goto out; > > + } > > + > > + if ((msix_mmio->flags & KVM_MSIX_MMIO_FLAG_REGISTER) && > > + (msix_mmio->flags & KVM_MSIX_MMIO_FLAG_UNREGISTER)) { > > + r =3D -EINVAL; > > + goto out; > > + } > > + > > + if (msix_mmio->flags & KVM_MSIX_MMIO_FLAG_REGISTER) { > > + if (!(adev->flags & KVM_ASSIGNED_ENABLED_MSIX_MMIO)) { > > + mutex_lock(&kvm->slots_lock); > > + kvm_iodevice_init(&adev->msix_mmio_dev, > > + &msix_mmio_ops); > > + r =3D kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, > > + &adev->msix_mmio_dev); > > + if (!r) > > + adev->flags |=3D KVM_ASSIGNED_ENABLED_MSIX_MMIO; > > + mutex_unlock(&kvm->slots_lock); > > + } > > + if (!r) { > > + adev->msix_mmio_base =3D msix_mmio->base_addr; > > + adev->msix_max_entries_nr =3D msix_mmio->max_entries_nr; > > + } > > + } else if (msix_mmio->flags & KVM_MSIX_MMIO_FLAG_UNREGISTER) > > + unregister_msix_mmio(kvm, adev); > > +out: > > + mutex_unlock(&kvm->lock); > > + > > + return r; > > +} 
> >=20 > > #endif > > =20 > > long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, > >=20 > > @@ -813,6 +1096,37 @@ long kvm_vm_ioctl_assigned_device(struct kvm = *kvm, > > unsigned ioctl, > >=20 > > goto out; > > =09 > > break; > > =09 > > } > >=20 > > + case KVM_GET_MSIX_ENTRY: { > > + struct kvm_msix_entry entry; > > + r =3D -EFAULT; > > + if (copy_from_user(&entry, argp, sizeof entry)) > > + goto out; > > + r =3D kvm_vm_ioctl_get_msix_entry(kvm, &entry); > > + if (r) > > + goto out; > > + r =3D -EFAULT; > > + if (copy_to_user(argp, &entry, sizeof entry)) > > + goto out; > > + r =3D 0; > > + break; > > + } > > + case KVM_UPDATE_MSIX_MMIO: { > > + struct kvm_msix_mmio msix_mmio; > > + > > + r =3D -EFAULT; > > + if (copy_from_user(&msix_mmio, argp, sizeof(msix_mmio))) > > + goto out; > > + > > + r =3D -EINVAL; > > + if (find_first_bit((unsigned long *)msix_mmio.reserved, > > + sizeof(msix_mmio.reserved)) < sizeof(msix_mmio.reserved)) > > + goto out; > > + > > + r =3D kvm_vm_ioctl_update_msix_mmio(kvm, &msix_mmio); > > + if (r) > > + goto out; > > + break; > > + } > >=20 > > #endif > > =20 > > } > > =20 > > out: